/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include "config/aom_config.h"
13 #include "config/av1_rtcd.h"
14 
15 #include "av1/common/av1_inv_txfm1d_cfg.h"
16 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 
// TODO(venkatsanampudi@ittiam.com): move this to header file

// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 (all scaled by 2^12)
22 static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
23                                           4 * 5793 };
24 
// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
26 
idct4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)27 static void idct4_new_sse2(const __m128i *input, __m128i *output,
28                            int8_t cos_bit) {
29   (void)cos_bit;
30   const int32_t *cospi = cospi_arr(INV_COS_BIT);
31   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
32 
33   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
34   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
35   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
36   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
37 
38   // stage 1
39   __m128i x[4];
40   x[0] = input[0];
41   x[1] = input[2];
42   x[2] = input[1];
43   x[3] = input[3];
44 
45   // stage 2
46   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
47   btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
48 
49   // stage 3
50   btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
51   btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
52 }
53 
idct4_w4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)54 void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
55   (void)cos_bit;
56   const int32_t *cospi = cospi_arr(INV_COS_BIT);
57   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
58 
59   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
60   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
61   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
62   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
63 
64   // stage 1
65   __m128i x[4];
66   x[0] = input[0];
67   x[1] = input[2];
68   x[2] = input[1];
69   x[3] = input[3];
70 
71   // stage 2
72   btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
73   btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
74 
75   // stage 3
76   btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
77   btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
78 }
79 
idct8_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)80 void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
81                           int8_t cos_bit) {
82   (void)cos_bit;
83   const int32_t *cospi = cospi_arr(INV_COS_BIT);
84 
85   // stage 1
86   __m128i x[2];
87   x[0] = input[0];
88 
89   // stage 2
90   // stage 3
91   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
92 
93   // stage 4
94   // stage 5
95   output[0] = x[0];
96   output[7] = x[0];
97   output[1] = x[1];
98   output[6] = x[1];
99   output[2] = x[1];
100   output[5] = x[1];
101   output[3] = x[0];
102   output[4] = x[0];
103 }
104 
idct8_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)105 void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
106   (void)cos_bit;
107   const int32_t *cospi = cospi_arr(INV_COS_BIT);
108   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
109 
110   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
111   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
112   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
113   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
114   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
115   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
116   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
117   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
118   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
119 
120   // stage 1
121   __m128i x[8];
122   x[0] = input[0];
123   x[1] = input[4];
124   x[2] = input[2];
125   x[3] = input[6];
126   x[4] = input[1];
127   x[5] = input[5];
128   x[6] = input[3];
129   x[7] = input[7];
130 
131   // stage 2
132   btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
133   btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
134 
135   // stage 3
136   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
137   btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
138   btf_16_adds_subs_sse2(x[4], x[5]);
139   btf_16_subs_adds_sse2(x[7], x[6]);
140 
141   // stage 4
142   btf_16_adds_subs_sse2(x[0], x[3]);
143   btf_16_adds_subs_sse2(x[1], x[2]);
144   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
145 
146   // stage 5
147   btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
148   btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
149   btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
150   btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
151 }
152 
idct8_w4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)153 void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
154   (void)cos_bit;
155   const int32_t *cospi = cospi_arr(INV_COS_BIT);
156   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
157 
158   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
159   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
160   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
161   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
162   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
163   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
164   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
165   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
166   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
167 
168   // stage 1
169   __m128i x[8];
170   x[0] = input[0];
171   x[1] = input[4];
172   x[2] = input[2];
173   x[3] = input[6];
174   x[4] = input[1];
175   x[5] = input[5];
176   x[6] = input[3];
177   x[7] = input[7];
178 
179   // stage 2
180   btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
181   btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
182 
183   // stage 3
184   btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
185   btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
186   btf_16_adds_subs_sse2(x[4], x[5]);
187   btf_16_subs_adds_sse2(x[7], x[6]);
188 
189   // stage 4
190   btf_16_adds_subs_sse2(x[0], x[3]);
191   btf_16_adds_subs_sse2(x[1], x[2]);
192   btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
193 
194   // stage 5
195   btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
196   btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
197   btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
198   btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
199 }
200 
idct16_stage5_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)201 static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
202                                       const __m128i __rounding,
203                                       int8_t cos_bit) {
204   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
205   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
206   btf_16_adds_subs_sse2(x[0], x[3]);
207   btf_16_adds_subs_sse2(x[1], x[2]);
208   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
209   btf_16_adds_subs_sse2(x[8], x[11]);
210   btf_16_adds_subs_sse2(x[9], x[10]);
211   btf_16_subs_adds_sse2(x[15], x[12]);
212   btf_16_subs_adds_sse2(x[14], x[13]);
213 }
214 
idct16_stage6_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)215 static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
216                                       const __m128i __rounding,
217                                       int8_t cos_bit) {
218   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
219   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
220   btf_16_adds_subs_sse2(x[0], x[7]);
221   btf_16_adds_subs_sse2(x[1], x[6]);
222   btf_16_adds_subs_sse2(x[2], x[5]);
223   btf_16_adds_subs_sse2(x[3], x[4]);
224   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
225   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
226 }
227 
idct16_stage7_sse2(__m128i * output,__m128i * x)228 static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
229   btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
230   btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
231   btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
232   btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
233   btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
234   btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
235   btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
236   btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
237 }
238 
idct16_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)239 static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
240                                   int8_t cos_bit) {
241   (void)cos_bit;
242   const int32_t *cospi = cospi_arr(INV_COS_BIT);
243 
244   // stage 1
245   __m128i x[2];
246   x[0] = input[0];
247 
248   // stage 2
249   // stage 3
250   // stage 4
251   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
252 
253   // stage 5
254   // stage 6
255   // stage 7
256   output[0] = x[0];
257   output[15] = x[0];
258   output[1] = x[1];
259   output[14] = x[1];
260   output[2] = x[1];
261   output[13] = x[1];
262   output[3] = x[0];
263   output[12] = x[0];
264   output[4] = x[0];
265   output[11] = x[0];
266   output[5] = x[1];
267   output[10] = x[1];
268   output[6] = x[1];
269   output[9] = x[1];
270   output[7] = x[0];
271   output[8] = x[0];
272 }
273 
idct16_low8_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)274 static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
275                                   int8_t cos_bit) {
276   (void)cos_bit;
277   const int32_t *cospi = cospi_arr(INV_COS_BIT);
278   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
279   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
280   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
281   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
282 
283   // stage 1
284   __m128i x[16];
285   x[0] = input[0];
286   x[2] = input[4];
287   x[4] = input[2];
288   x[6] = input[6];
289   x[8] = input[1];
290   x[10] = input[5];
291   x[12] = input[3];
292   x[14] = input[7];
293 
294   // stage 2
295   btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
296   btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
297   btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
298   btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
299 
300   // stage 3
301   btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
302   btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
303   btf_16_adds_subs_sse2(x[8], x[9]);
304   btf_16_subs_adds_sse2(x[11], x[10]);
305   btf_16_adds_subs_sse2(x[12], x[13]);
306   btf_16_subs_adds_sse2(x[15], x[14]);
307 
308   // stage 4
309   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
310   btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
311   btf_16_adds_subs_sse2(x[4], x[5]);
312   btf_16_subs_adds_sse2(x[7], x[6]);
313   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
314   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
315 
316   idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
317   idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
318   idct16_stage7_sse2(output, x);
319 }
320 
idct16_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)321 void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
322   (void)cos_bit;
323   const int32_t *cospi = cospi_arr(INV_COS_BIT);
324   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
325 
326   const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
327   const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
328   const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
329   const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
330   const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
331   const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
332   const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
333   const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
334   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
335   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
336   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
337   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
338   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
339   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
340   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
341   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
342   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
343   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
344   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
345 
346   // stage 1
347   __m128i x[16];
348   x[0] = input[0];
349   x[1] = input[8];
350   x[2] = input[4];
351   x[3] = input[12];
352   x[4] = input[2];
353   x[5] = input[10];
354   x[6] = input[6];
355   x[7] = input[14];
356   x[8] = input[1];
357   x[9] = input[9];
358   x[10] = input[5];
359   x[11] = input[13];
360   x[12] = input[3];
361   x[13] = input[11];
362   x[14] = input[7];
363   x[15] = input[15];
364 
365   // stage 2
366   btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
367   btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
368   btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
369   btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
370 
371   // stage 3
372   btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
373   btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
374   btf_16_adds_subs_sse2(x[8], x[9]);
375   btf_16_subs_adds_sse2(x[11], x[10]);
376   btf_16_adds_subs_sse2(x[12], x[13]);
377   btf_16_subs_adds_sse2(x[15], x[14]);
378 
379   // stage 4
380   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
381   btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
382   btf_16_adds_subs_sse2(x[4], x[5]);
383   btf_16_subs_adds_sse2(x[7], x[6]);
384   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
385   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
386 
387   // stage 5~7
388   idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
389   idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
390   idct16_stage7_sse2(output, x);
391 }
392 
idct16_w4_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)393 void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
394   (void)cos_bit;
395   const int32_t *cospi = cospi_arr(INV_COS_BIT);
396   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
397 
398   const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
399   const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
400   const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
401   const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
402   const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
403   const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
404   const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
405   const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
406   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
407   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
408   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
409   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
410   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
411   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
412   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
413   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
414   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
415   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
416   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
417   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
418 
419   // stage 1
420   __m128i x[16];
421   x[0] = input[0];
422   x[1] = input[8];
423   x[2] = input[4];
424   x[3] = input[12];
425   x[4] = input[2];
426   x[5] = input[10];
427   x[6] = input[6];
428   x[7] = input[14];
429   x[8] = input[1];
430   x[9] = input[9];
431   x[10] = input[5];
432   x[11] = input[13];
433   x[12] = input[3];
434   x[13] = input[11];
435   x[14] = input[7];
436   x[15] = input[15];
437 
438   // stage 2
439   btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
440   btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
441   btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
442   btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
443 
444   // stage 3
445   btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
446   btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
447   btf_16_adds_subs_sse2(x[8], x[9]);
448   btf_16_subs_adds_sse2(x[11], x[10]);
449   btf_16_adds_subs_sse2(x[12], x[13]);
450   btf_16_subs_adds_sse2(x[15], x[14]);
451 
452   // stage 4
453   btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
454   btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
455   btf_16_adds_subs_sse2(x[4], x[5]);
456   btf_16_subs_adds_sse2(x[7], x[6]);
457   btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
458   btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
459 
460   // stage 5
461   btf_16_adds_subs_sse2(x[0], x[3]);
462   btf_16_adds_subs_sse2(x[1], x[2]);
463   btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
464   btf_16_adds_subs_sse2(x[8], x[11]);
465   btf_16_adds_subs_sse2(x[9], x[10]);
466   btf_16_subs_adds_sse2(x[15], x[12]);
467   btf_16_subs_adds_sse2(x[14], x[13]);
468 
469   // stage 6
470   btf_16_adds_subs_sse2(x[0], x[7]);
471   btf_16_adds_subs_sse2(x[1], x[6]);
472   btf_16_adds_subs_sse2(x[2], x[5]);
473   btf_16_adds_subs_sse2(x[3], x[4]);
474   btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
475   btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
476 
477   // stage 7
478   idct16_stage7_sse2(output, x);
479 }
480 
idct32_high16_stage3_sse2(__m128i * x)481 static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
482   btf_16_adds_subs_sse2(x[16], x[17]);
483   btf_16_subs_adds_sse2(x[19], x[18]);
484   btf_16_adds_subs_sse2(x[20], x[21]);
485   btf_16_subs_adds_sse2(x[23], x[22]);
486   btf_16_adds_subs_sse2(x[24], x[25]);
487   btf_16_subs_adds_sse2(x[27], x[26]);
488   btf_16_adds_subs_sse2(x[28], x[29]);
489   btf_16_subs_adds_sse2(x[31], x[30]);
490 }
491 
idct32_high16_stage4_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)492 static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
493                                              const __m128i __rounding,
494                                              int8_t cos_bit) {
495   const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
496   const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
497   const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
498   const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
499   const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
500   const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
501   btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
502   btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
503   btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
504   btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
505 }
506 
idct32_high24_stage5_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)507 static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
508                                              const __m128i __rounding,
509                                              int8_t cos_bit) {
510   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
511   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
512   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
513   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
514   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
515   btf_16_adds_subs_sse2(x[16], x[19]);
516   btf_16_adds_subs_sse2(x[17], x[18]);
517   btf_16_subs_adds_sse2(x[23], x[20]);
518   btf_16_subs_adds_sse2(x[22], x[21]);
519   btf_16_adds_subs_sse2(x[24], x[27]);
520   btf_16_adds_subs_sse2(x[25], x[26]);
521   btf_16_subs_adds_sse2(x[31], x[28]);
522   btf_16_subs_adds_sse2(x[30], x[29]);
523 }
524 
idct32_high28_stage6_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)525 static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
526                                              const __m128i __rounding,
527                                              int8_t cos_bit) {
528   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
529   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
530   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
531   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
532   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
533   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
534   btf_16_adds_subs_sse2(x[8], x[11]);
535   btf_16_adds_subs_sse2(x[9], x[10]);
536   btf_16_subs_adds_sse2(x[15], x[12]);
537   btf_16_subs_adds_sse2(x[14], x[13]);
538   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
539   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
540   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
541   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
542 }
543 
idct32_stage7_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)544 static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
545                                       const __m128i __rounding,
546                                       int8_t cos_bit) {
547   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
548   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
549   btf_16_adds_subs_sse2(x[0], x[7]);
550   btf_16_adds_subs_sse2(x[1], x[6]);
551   btf_16_adds_subs_sse2(x[2], x[5]);
552   btf_16_adds_subs_sse2(x[3], x[4]);
553   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
554   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
555   btf_16_adds_subs_sse2(x[16], x[23]);
556   btf_16_adds_subs_sse2(x[17], x[22]);
557   btf_16_adds_subs_sse2(x[18], x[21]);
558   btf_16_adds_subs_sse2(x[19], x[20]);
559   btf_16_subs_adds_sse2(x[31], x[24]);
560   btf_16_subs_adds_sse2(x[30], x[25]);
561   btf_16_subs_adds_sse2(x[29], x[26]);
562   btf_16_subs_adds_sse2(x[28], x[27]);
563 }
564 
idct32_stage8_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)565 static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
566                                       const __m128i __rounding,
567                                       int8_t cos_bit) {
568   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
569   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
570   btf_16_adds_subs_sse2(x[0], x[15]);
571   btf_16_adds_subs_sse2(x[1], x[14]);
572   btf_16_adds_subs_sse2(x[2], x[13]);
573   btf_16_adds_subs_sse2(x[3], x[12]);
574   btf_16_adds_subs_sse2(x[4], x[11]);
575   btf_16_adds_subs_sse2(x[5], x[10]);
576   btf_16_adds_subs_sse2(x[6], x[9]);
577   btf_16_adds_subs_sse2(x[7], x[8]);
578   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
579   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
580   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
581   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
582 }
583 
idct32_stage9_sse2(__m128i * output,__m128i * x)584 static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
585   btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
586   btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
587   btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
588   btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
589   btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
590   btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
591   btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
592   btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
593   btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
594   btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
595   btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
596   btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
597   btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
598   btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
599   btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
600   btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
601 }
602 
idct32_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)603 static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
604                                   int8_t cos_bit) {
605   (void)cos_bit;
606   const int32_t *cospi = cospi_arr(INV_COS_BIT);
607 
608   // stage 1
609   __m128i x[2];
610   x[0] = input[0];
611 
612   // stage 2
613   // stage 3
614   // stage 4
615   // stage 5
616   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
617 
618   // stage 6
619   // stage 7
620   // stage 8
621   // stage 9
622   output[0] = x[0];
623   output[31] = x[0];
624   output[1] = x[1];
625   output[30] = x[1];
626   output[2] = x[1];
627   output[29] = x[1];
628   output[3] = x[0];
629   output[28] = x[0];
630   output[4] = x[0];
631   output[27] = x[0];
632   output[5] = x[1];
633   output[26] = x[1];
634   output[6] = x[1];
635   output[25] = x[1];
636   output[7] = x[0];
637   output[24] = x[0];
638   output[8] = x[0];
639   output[23] = x[0];
640   output[9] = x[1];
641   output[22] = x[1];
642   output[10] = x[1];
643   output[21] = x[1];
644   output[11] = x[0];
645   output[20] = x[0];
646   output[12] = x[0];
647   output[19] = x[0];
648   output[13] = x[1];
649   output[18] = x[1];
650   output[14] = x[1];
651   output[17] = x[1];
652   output[15] = x[0];
653   output[16] = x[0];
654 }
655 
idct32_low8_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)656 static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
657                                   int8_t cos_bit) {
658   (void)cos_bit;
659   const int32_t *cospi = cospi_arr(INV_COS_BIT);
660   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
661 
662   // stage 1
663   __m128i x[32];
664   x[0] = input[0];
665   x[4] = input[4];
666   x[8] = input[2];
667   x[12] = input[6];
668   x[16] = input[1];
669   x[20] = input[5];
670   x[24] = input[3];
671   x[28] = input[7];
672 
673   // stage 2
674   btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
675   btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
676   btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
677   btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
678 
679   // stage 3
680   btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
681   btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
682   x[17] = x[16];
683   x[18] = x[19];
684   x[21] = x[20];
685   x[22] = x[23];
686   x[25] = x[24];
687   x[26] = x[27];
688   x[29] = x[28];
689   x[30] = x[31];
690 
691   // stage 4
692   btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
693   x[9] = x[8];
694   x[10] = x[11];
695   x[13] = x[12];
696   x[14] = x[15];
697   idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
698 
699   // stage 5
700   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
701   x[5] = x[4];
702   x[6] = x[7];
703   idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
704   // stage 6
705   x[3] = x[0];
706   x[2] = x[1];
707   idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
708 
709   idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
710   idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
711   idct32_stage9_sse2(output, x);
712 }
713 
// 32-point inverse DCT (SSSE3), specialized for inputs where only the first
// 16 coefficients can be non-zero (eob-limited blocks).  Each __m128i holds
// eight 16-bit lanes, i.e. eight columns are transformed in parallel.
// Because the upper half of the input is known zero, stages 2-5 use the
// one-input btf_16_ssse3 form (multiply by a cospi pair) instead of the full
// two-input butterfly used by idct32_new_sse2.
static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;  // precision is hard-wired to INV_COS_BIT
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: load the 16 live coefficients in the transform's permuted order
  __m128i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2: rotations on the odd half; a negative first weight carries the
  // -sin term of the corresponding rotation
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_sse2(x);

  // stage 4
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);

  // stages 7-9: pure add/sub recombination, shared with the other variants
  idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
  idct32_stage9_sse2(output, x);
}
780 
idct32_new_sse2(const __m128i * input,__m128i * output,int8_t cos_bit)781 static void idct32_new_sse2(const __m128i *input, __m128i *output,
782                             int8_t cos_bit) {
783   (void)cos_bit;
784   const int32_t *cospi = cospi_arr(INV_COS_BIT);
785   const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
786 
787   const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
788   const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
789   const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
790   const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
791   const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
792   const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
793   const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
794   const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
795   const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
796   const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
797   const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
798   const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
799   const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
800   const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
801   const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
802   const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
803   const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
804   const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
805   const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
806   const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
807   const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
808   const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
809   const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
810   const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
811   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
812   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
813   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
814   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
815   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
816   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
817   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
818   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
819 
820   // stage 1
821   __m128i x[32];
822   x[0] = input[0];
823   x[1] = input[16];
824   x[2] = input[8];
825   x[3] = input[24];
826   x[4] = input[4];
827   x[5] = input[20];
828   x[6] = input[12];
829   x[7] = input[28];
830   x[8] = input[2];
831   x[9] = input[18];
832   x[10] = input[10];
833   x[11] = input[26];
834   x[12] = input[6];
835   x[13] = input[22];
836   x[14] = input[14];
837   x[15] = input[30];
838   x[16] = input[1];
839   x[17] = input[17];
840   x[18] = input[9];
841   x[19] = input[25];
842   x[20] = input[5];
843   x[21] = input[21];
844   x[22] = input[13];
845   x[23] = input[29];
846   x[24] = input[3];
847   x[25] = input[19];
848   x[26] = input[11];
849   x[27] = input[27];
850   x[28] = input[7];
851   x[29] = input[23];
852   x[30] = input[15];
853   x[31] = input[31];
854 
855   // stage 2
856   btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
857   btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
858   btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
859   btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
860   btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
861   btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
862   btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
863   btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
864 
865   // stage 3
866   btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
867   btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
868   btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
869   btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
870   idct32_high16_stage3_sse2(x);
871 
872   // stage 4
873   btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
874   btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
875   btf_16_adds_subs_sse2(x[8], x[9]);
876   btf_16_subs_adds_sse2(x[11], x[10]);
877   btf_16_adds_subs_sse2(x[12], x[13]);
878   btf_16_subs_adds_sse2(x[15], x[14]);
879   idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
880 
881   // stage 5
882   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
883   btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
884   btf_16_adds_subs_sse2(x[4], x[5]);
885   btf_16_adds_subs_sse2(x[7], x[6]);
886   idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
887 
888   // stage 6
889   btf_16_adds_subs_sse2(x[0], x[3]);
890   btf_16_adds_subs_sse2(x[1], x[2]);
891   idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
892 
893   // stage 7~8
894   idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
895   idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
896   idct32_stage9_sse2(output, x);
897 }
898 
// Stage-4 helper for the 64-point inverse DCT: applies the eight rotations
// that touch the x[33..62] band (pairs summing to index 95).  The remaining
// stage-4 work on the lower lanes is done by the callers.
static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
                                             const __m128i __rounding,
                                             int8_t cos_bit) {
  // Rotation weight pairs; each group of three constants serves one
  // forward/mirrored rotation pair below.
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
  btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
  btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
}
923 
idct64_stage5_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)924 static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
925                                              const __m128i __rounding,
926                                              int8_t cos_bit) {
927   const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
928   const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
929   const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
930   const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
931   const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
932   const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
933   btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
934   btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
935   btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
936   btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
937   btf_16_adds_subs_sse2(x[32], x[35]);
938   btf_16_adds_subs_sse2(x[33], x[34]);
939   btf_16_subs_adds_sse2(x[39], x[36]);
940   btf_16_subs_adds_sse2(x[38], x[37]);
941   btf_16_adds_subs_sse2(x[40], x[43]);
942   btf_16_adds_subs_sse2(x[41], x[42]);
943   btf_16_subs_adds_sse2(x[47], x[44]);
944   btf_16_subs_adds_sse2(x[46], x[45]);
945   btf_16_adds_subs_sse2(x[48], x[51]);
946   btf_16_adds_subs_sse2(x[49], x[50]);
947   btf_16_subs_adds_sse2(x[55], x[52]);
948   btf_16_subs_adds_sse2(x[54], x[53]);
949   btf_16_adds_subs_sse2(x[56], x[59]);
950   btf_16_adds_subs_sse2(x[57], x[58]);
951   btf_16_subs_adds_sse2(x[63], x[60]);
952   btf_16_subs_adds_sse2(x[62], x[61]);
953 }
954 
idct64_stage6_high32_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)955 static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
956                                              const __m128i __rounding,
957                                              int8_t cos_bit) {
958   const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
959   const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
960   const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
961   const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
962   const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
963   const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
964   btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
965   btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
966   btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
967   btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
968   btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
969   btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
970   btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
971   btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
972 }
973 
idct64_stage6_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)974 static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
975                                              const __m128i __rounding,
976                                              int8_t cos_bit) {
977   btf_16_adds_subs_sse2(x[16], x[19]);
978   btf_16_adds_subs_sse2(x[17], x[18]);
979   btf_16_subs_adds_sse2(x[23], x[20]);
980   btf_16_subs_adds_sse2(x[22], x[21]);
981   btf_16_adds_subs_sse2(x[24], x[27]);
982   btf_16_adds_subs_sse2(x[25], x[26]);
983   btf_16_subs_adds_sse2(x[31], x[28]);
984   btf_16_subs_adds_sse2(x[30], x[29]);
985   idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
986 }
987 
idct64_stage7_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)988 static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
989                                              const __m128i __rounding,
990                                              int8_t cos_bit) {
991   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
992   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
993   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
994   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
995   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
996   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
997   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
998   btf_16_adds_subs_sse2(x[32], x[39]);
999   btf_16_adds_subs_sse2(x[33], x[38]);
1000   btf_16_adds_subs_sse2(x[34], x[37]);
1001   btf_16_adds_subs_sse2(x[35], x[36]);
1002   btf_16_subs_adds_sse2(x[47], x[40]);
1003   btf_16_subs_adds_sse2(x[46], x[41]);
1004   btf_16_subs_adds_sse2(x[45], x[42]);
1005   btf_16_subs_adds_sse2(x[44], x[43]);
1006   btf_16_adds_subs_sse2(x[48], x[55]);
1007   btf_16_adds_subs_sse2(x[49], x[54]);
1008   btf_16_adds_subs_sse2(x[50], x[53]);
1009   btf_16_adds_subs_sse2(x[51], x[52]);
1010   btf_16_subs_adds_sse2(x[63], x[56]);
1011   btf_16_subs_adds_sse2(x[62], x[57]);
1012   btf_16_subs_adds_sse2(x[61], x[58]);
1013   btf_16_subs_adds_sse2(x[60], x[59]);
1014 }
1015 
idct64_stage8_high48_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1016 static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
1017                                              const __m128i __rounding,
1018                                              int8_t cos_bit) {
1019   const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
1020   const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
1021   const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
1022   btf_16_adds_subs_sse2(x[16], x[23]);
1023   btf_16_adds_subs_sse2(x[17], x[22]);
1024   btf_16_adds_subs_sse2(x[18], x[21]);
1025   btf_16_adds_subs_sse2(x[19], x[20]);
1026   btf_16_subs_adds_sse2(x[31], x[24]);
1027   btf_16_subs_adds_sse2(x[30], x[25]);
1028   btf_16_subs_adds_sse2(x[29], x[26]);
1029   btf_16_subs_adds_sse2(x[28], x[27]);
1030   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
1031   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
1032   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
1033   btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
1034   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
1035   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
1036   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
1037   btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
1038 }
1039 
idct64_stage9_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1040 static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
1041                                       const __m128i __rounding,
1042                                       int8_t cos_bit) {
1043   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1044   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1045   btf_16_adds_subs_sse2(x[0], x[15]);
1046   btf_16_adds_subs_sse2(x[1], x[14]);
1047   btf_16_adds_subs_sse2(x[2], x[13]);
1048   btf_16_adds_subs_sse2(x[3], x[12]);
1049   btf_16_adds_subs_sse2(x[4], x[11]);
1050   btf_16_adds_subs_sse2(x[5], x[10]);
1051   btf_16_adds_subs_sse2(x[6], x[9]);
1052   btf_16_adds_subs_sse2(x[7], x[8]);
1053   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
1054   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
1055   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
1056   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
1057   btf_16_adds_subs_sse2(x[32], x[47]);
1058   btf_16_adds_subs_sse2(x[33], x[46]);
1059   btf_16_adds_subs_sse2(x[34], x[45]);
1060   btf_16_adds_subs_sse2(x[35], x[44]);
1061   btf_16_adds_subs_sse2(x[36], x[43]);
1062   btf_16_adds_subs_sse2(x[37], x[42]);
1063   btf_16_adds_subs_sse2(x[38], x[41]);
1064   btf_16_adds_subs_sse2(x[39], x[40]);
1065   btf_16_subs_adds_sse2(x[63], x[48]);
1066   btf_16_subs_adds_sse2(x[62], x[49]);
1067   btf_16_subs_adds_sse2(x[61], x[50]);
1068   btf_16_subs_adds_sse2(x[60], x[51]);
1069   btf_16_subs_adds_sse2(x[59], x[52]);
1070   btf_16_subs_adds_sse2(x[58], x[53]);
1071   btf_16_subs_adds_sse2(x[57], x[54]);
1072   btf_16_subs_adds_sse2(x[56], x[55]);
1073 }
1074 
idct64_stage10_sse2(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1075 static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
1076                                        const __m128i __rounding,
1077                                        int8_t cos_bit) {
1078   const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
1079   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1080   btf_16_adds_subs_sse2(x[0], x[31]);
1081   btf_16_adds_subs_sse2(x[1], x[30]);
1082   btf_16_adds_subs_sse2(x[2], x[29]);
1083   btf_16_adds_subs_sse2(x[3], x[28]);
1084   btf_16_adds_subs_sse2(x[4], x[27]);
1085   btf_16_adds_subs_sse2(x[5], x[26]);
1086   btf_16_adds_subs_sse2(x[6], x[25]);
1087   btf_16_adds_subs_sse2(x[7], x[24]);
1088   btf_16_adds_subs_sse2(x[8], x[23]);
1089   btf_16_adds_subs_sse2(x[9], x[22]);
1090   btf_16_adds_subs_sse2(x[10], x[21]);
1091   btf_16_adds_subs_sse2(x[11], x[20]);
1092   btf_16_adds_subs_sse2(x[12], x[19]);
1093   btf_16_adds_subs_sse2(x[13], x[18]);
1094   btf_16_adds_subs_sse2(x[14], x[17]);
1095   btf_16_adds_subs_sse2(x[15], x[16]);
1096   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
1097   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
1098   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
1099   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
1100   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
1101   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
1102   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
1103   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
1104 }
1105 
idct64_stage11_sse2(__m128i * output,__m128i * x)1106 static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
1107   btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
1108   btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
1109   btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
1110   btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
1111   btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
1112   btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
1113   btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
1114   btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
1115   btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
1116   btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
1117   btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
1118   btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
1119   btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
1120   btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
1121   btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
1122   btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
1123   btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
1124   btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
1125   btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
1126   btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
1127   btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
1128   btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
1129   btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
1130   btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
1131   btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
1132   btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
1133   btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
1134   btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
1135   btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
1136   btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
1137   btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
1138   btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
1139 }
1140 
idct64_low1_new_ssse3(const __m128i * input,__m128i * output,int8_t cos_bit)1141 static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
1142                                   int8_t cos_bit) {
1143   (void)cos_bit;
1144   const int32_t *cospi = cospi_arr(INV_COS_BIT);
1145 
1146   // stage 1
1147   __m128i x[32];
1148   x[0] = input[0];
1149 
1150   // stage 2
1151   // stage 3
1152   // stage 4
1153   // stage 5
1154   // stage 6
1155   btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
1156 
1157   // stage 7
1158   // stage 8
1159   // stage 9
1160   // stage 10
1161   // stage 11
1162   output[0] = x[0];
1163   output[63] = x[0];
1164   output[1] = x[1];
1165   output[62] = x[1];
1166   output[2] = x[1];
1167   output[61] = x[1];
1168   output[3] = x[0];
1169   output[60] = x[0];
1170   output[4] = x[0];
1171   output[59] = x[0];
1172   output[5] = x[1];
1173   output[58] = x[1];
1174   output[6] = x[1];
1175   output[57] = x[1];
1176   output[7] = x[0];
1177   output[56] = x[0];
1178   output[8] = x[0];
1179   output[55] = x[0];
1180   output[9] = x[1];
1181   output[54] = x[1];
1182   output[10] = x[1];
1183   output[53] = x[1];
1184   output[11] = x[0];
1185   output[52] = x[0];
1186   output[12] = x[0];
1187   output[51] = x[0];
1188   output[13] = x[1];
1189   output[50] = x[1];
1190   output[14] = x[1];
1191   output[49] = x[1];
1192   output[15] = x[0];
1193   output[48] = x[0];
1194   output[16] = x[0];
1195   output[47] = x[0];
1196   output[17] = x[1];
1197   output[46] = x[1];
1198   output[18] = x[1];
1199   output[45] = x[1];
1200   output[19] = x[0];
1201   output[44] = x[0];
1202   output[20] = x[0];
1203   output[43] = x[0];
1204   output[21] = x[1];
1205   output[42] = x[1];
1206   output[22] = x[1];
1207   output[41] = x[1];
1208   output[23] = x[0];
1209   output[40] = x[0];
1210   output[24] = x[0];
1211   output[39] = x[0];
1212   output[25] = x[1];
1213   output[38] = x[1];
1214   output[26] = x[1];
1215   output[37] = x[1];
1216   output[27] = x[0];
1217   output[36] = x[0];
1218   output[28] = x[0];
1219   output[35] = x[0];
1220   output[29] = x[1];
1221   output[34] = x[1];
1222   output[30] = x[1];
1223   output[33] = x[1];
1224   output[31] = x[0];
1225   output[32] = x[0];
1226 }
1227 
// 64-point inverse DCT (SSSE3), specialized for inputs where only the first
// 8 coefficients can be non-zero.  Zero-propagation collapses most
// butterflies in stages 2-8 to simple copies (the x[a] = x[b] runs below),
// leaving just the rotations that involve a live coefficient.
static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;  // precision is hard-wired to INV_COS_BIT
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  // Rotation weight pairs used by stages 4-8 below.
  const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
  const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
  const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
  const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
  const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
  const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
  const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
  const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
  const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
  const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
  const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
  const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: load the 8 live coefficients in permuted order
  __m128i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2: one-input rotations (the second operand of each butterfly is 0)
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: the x[a] = x[b] copies are butterflies whose other input is 0
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
  btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
  btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
  btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
  btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  // Self-assignment: a harmless no-op (presumably kept so the stage listing
  // mirrors the full transform's layout — TODO confirm against upstream).
  x[9] = x[9];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared recombination helpers
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1348 
// 64-point inverse DCT (SSSE3), specialized for inputs where only the first
// 16 coefficients can be non-zero.  As in the other low-eob variants, each
// x[a] = x[b] copy below is a butterfly whose second input is known zero,
// and the one-input btf_16_ssse3 form replaces full rotations in stages 2-6.
static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;  // precision is hard-wired to INV_COS_BIT
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Rotation weight pairs used by stages 6-8 below.
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: load the 16 live coefficients in permuted order
  __m128i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2: one-input rotations on the odd band
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stages 9-11: shared recombination helpers
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1464 
// 64-point inverse DCT specialized for inputs whose non-zero coefficients
// are confined to the first 32 rows/columns ("low32").  Stages 1-8 are
// unrolled here; stages 9-11 are shared with the other idct64 variants via
// the idct64_stage*_sse2 helpers.  Each __m128i holds one row of eight
// 16-bit coefficients.
static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset consumed by the btf_16_* macro expansions when the
  // 32-bit butterfly products are narrowed back to 16 bits.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
  const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
  const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
  const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);

  // stage 1: load the 32 live coefficients into the permuted slots the
  // butterfly network expects; the remaining x[] slots are written before
  // they are read by later stages.
  __m128i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2: single-input rotations producing the odd half x[32..63].
  btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_ssse3(-cospi[61], cospi[3], x[48], x[48], x[48]);

  // stage 3: rotations for x[16..31] plus add/sub butterflies on the
  // odd half computed in stage 2.
  btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_sse2(x[32], x[33]);
  btf_16_subs_adds_sse2(x[35], x[34]);
  btf_16_adds_subs_sse2(x[36], x[37]);
  btf_16_subs_adds_sse2(x[39], x[38]);
  btf_16_adds_subs_sse2(x[40], x[41]);
  btf_16_subs_adds_sse2(x[43], x[42]);
  btf_16_adds_subs_sse2(x[44], x[45]);
  btf_16_subs_adds_sse2(x[47], x[46]);
  btf_16_adds_subs_sse2(x[48], x[49]);
  btf_16_subs_adds_sse2(x[51], x[50]);
  btf_16_adds_subs_sse2(x[52], x[53]);
  btf_16_subs_adds_sse2(x[55], x[54]);
  btf_16_adds_subs_sse2(x[56], x[57]);
  btf_16_subs_adds_sse2(x[59], x[58]);
  btf_16_adds_subs_sse2(x[60], x[61]);
  btf_16_subs_adds_sse2(x[63], x[62]);

  // stage 4
  btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_sse2(x[16], x[17]);
  btf_16_subs_adds_sse2(x[19], x[18]);
  btf_16_adds_subs_sse2(x[20], x[21]);
  btf_16_subs_adds_sse2(x[23], x[22]);
  btf_16_adds_subs_sse2(x[24], x[25]);
  btf_16_subs_adds_sse2(x[27], x[26]);
  btf_16_adds_subs_sse2(x[28], x[29]);
  btf_16_subs_adds_sse2(x[31], x[30]);
  idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);

  // stage 5
  btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[9]);
  btf_16_subs_adds_sse2(x[11], x[10]);
  btf_16_adds_subs_sse2(x[12], x[13]);
  btf_16_subs_adds_sse2(x[15], x[14]);
  idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 6
  btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_sse2(x[4], x[5]);
  btf_16_subs_adds_sse2(x[7], x[6]);
  btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
  btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
  idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 7
  btf_16_adds_subs_sse2(x[0], x[3]);
  btf_16_adds_subs_sse2(x[1], x[2]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
  btf_16_adds_subs_sse2(x[8], x[11]);
  btf_16_adds_subs_sse2(x[9], x[10]);
  btf_16_subs_adds_sse2(x[15], x[12]);
  btf_16_subs_adds_sse2(x[14], x[13]);
  idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 8
  btf_16_adds_subs_sse2(x[0], x[7]);
  btf_16_adds_subs_sse2(x[1], x[6]);
  btf_16_adds_subs_sse2(x[2], x[5]);
  btf_16_adds_subs_sse2(x[3], x[4]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
  btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
  idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);

  // stage 9~11
  idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
  idct64_stage11_sse2(output, x);
}
1613 
// 4-point inverse ADST on eight 16-bit columns per call.
// Pairs of inputs are interleaved and fed to _mm_madd_epi16 so every
// product is accumulated in 32 bits before the final rounding shift back
// to 16 bits.
void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3) so one madd forms a0*x0 + a1*x2
  // (resp. a0*x1 + a1*x3) per 32-bit lane.
  __m128i u[4];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);

  __m128i x1[16];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3 (u[2] holds x1,x3)
  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
  x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
  x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01);  // -x1*sin3 - x3*sin1
  x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);

  __m128i x2[8];
  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[5]);
  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
  x2[3] = _mm_add_epi32(x1[3], x1[7]);
  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 -x2*sin3 +x3*sin3
  x2[5] = _mm_add_epi32(x1[9], x1[11]);
  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
  x2[7] = _mm_add_epi32(x1[13], x1[15]);

  // Round, shift back down by INV_COS_BIT, and re-pack to 16 bits.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    out1 = _mm_srai_epi32(out1, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out1);
  }
}
1674 
1675 // TODO(binpengsmail@gmail.com):
1676 // To explore the reuse of VP9 versions of corresponding SSE2 functions and
1677 // evaluate whether there is a possibility for further speedup.
// 4-point inverse ADST, 4-column variant: only the low 64 bits of each
// input register carry data, so a single unpacklo per input pair suffices
// and the result is duplicated into both halves by the final pack.
void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
  const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
  const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
  __m128i x0[4];
  x0[0] = input[0];
  x0[1] = input[1];
  x0[2] = input[2];
  x0[3] = input[3];

  // Interleave rows (0,2) and (1,3); madd then forms a0*x0 + a1*x2
  // (resp. a0*x1 + a1*x3) per 32-bit lane.
  __m128i u[2];
  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
  u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);

  __m128i x1[8];
  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
  x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
  x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02);  // x1*sin3 + x3*sin2
  x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04);  // x1*sin3 - x3*sin4
  x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);    // x3*sin3 (u[1] holds x1,x3)
  x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02);  // x0*sin4 + x2*sin2
  x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01);  // -x1*sin3 - x3*sin1

  __m128i x2[4];
  x2[0] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
  x2[1] = _mm_add_epi32(x1[1], x1[3]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
  x2[2] = _mm_add_epi32(x1[4], x1[5]);  // x0*sin3 - x2*sin3 + x3*sin3
  x2[3] = _mm_add_epi32(x1[6], x1[7]);  // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1

  // Round, shift back down by INV_COS_BIT, and re-pack to 16 bits.
  const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  for (int i = 0; i < 4; ++i) {
    __m128i out0 = _mm_add_epi32(x2[i], rounding);
    out0 = _mm_srai_epi32(out0, INV_COS_BIT);
    output[i] = _mm_packs_epi32(out0, out0);
  }
}
1722 
// 8-point inverse ADST specialized for the case where only input[0] is
// non-zero ("low1").  With a single live coefficient most butterflies
// collapse into plain register copies, leaving only the genuine rotations.
static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // Consumed implicitly by the btf_16_sse2() macro expansions below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: only the DC coefficient is read.
  __m128i x[8];
  x[1] = input[0];

  // stage 2
  btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);

  // stage 3: add/sub butterflies degenerate to copies (other terms zero).
  x[4] = x[0];
  x[5] = x[1];

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);

  // stage 5: copies again (zero partners).
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];

  // stage 6
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation; odd outputs are negated (saturating).
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1769 
// Full 8-point inverse ADST on eight 16-bit columns per call.
// Stage 1 permutes the inputs into butterfly order, stages 2-6 apply the
// rotation/butterfly network, and stage 7 writes the sign-alternated
// output permutation.
void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // Consumed implicitly by the btf_16_sse2() macro expansions below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: input permutation into butterfly order.
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2: pairwise rotations.
  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3: add/sub butterflies between the two halves.
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation; odd outputs are negated (saturating).
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1837 
// 8-point inverse ADST, 4-column variant: identical network to
// iadst8_new_sse2 but uses the btf_16_4p_sse2 butterfly, which processes
// only the low four 16-bit lanes of each register.
void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __zero = _mm_setzero_si128();
  // Consumed implicitly by the btf_16_4p_sse2() macro expansions below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: input permutation into butterfly order.
  __m128i x[8];
  x[0] = input[7];
  x[1] = input[0];
  x[2] = input[5];
  x[3] = input[2];
  x[4] = input[3];
  x[5] = input[4];
  x[6] = input[1];
  x[7] = input[6];

  // stage 2: pairwise rotations (4-lane butterflies).
  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);

  // stage 3
  btf_16_adds_subs_sse2(x[0], x[4]);
  btf_16_adds_subs_sse2(x[1], x[5]);
  btf_16_adds_subs_sse2(x[2], x[6]);
  btf_16_adds_subs_sse2(x[3], x[7]);

  // stage 4
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);

  // stage 5
  btf_16_adds_subs_sse2(x[0], x[2]);
  btf_16_adds_subs_sse2(x[1], x[3]);
  btf_16_adds_subs_sse2(x[4], x[6]);
  btf_16_adds_subs_sse2(x[5], x[7]);

  // stage 6
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);

  // stage 7: output permutation; odd outputs are negated (saturating).
  output[0] = x[0];
  output[1] = _mm_subs_epi16(__zero, x[4]);
  output[2] = x[6];
  output[3] = _mm_subs_epi16(__zero, x[2]);
  output[4] = x[3];
  output[5] = _mm_subs_epi16(__zero, x[7]);
  output[6] = x[5];
  output[7] = _mm_subs_epi16(__zero, x[1]);
}
1905 
iadst16_stage3_ssse3(__m128i * x)1906 static INLINE void iadst16_stage3_ssse3(__m128i *x) {
1907   btf_16_adds_subs_sse2(x[0], x[8]);
1908   btf_16_adds_subs_sse2(x[1], x[9]);
1909   btf_16_adds_subs_sse2(x[2], x[10]);
1910   btf_16_adds_subs_sse2(x[3], x[11]);
1911   btf_16_adds_subs_sse2(x[4], x[12]);
1912   btf_16_adds_subs_sse2(x[5], x[13]);
1913   btf_16_adds_subs_sse2(x[6], x[14]);
1914   btf_16_adds_subs_sse2(x[7], x[15]);
1915 }
1916 
iadst16_stage4_ssse3(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1917 static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
1918                                         const __m128i __rounding,
1919                                         int8_t cos_bit) {
1920   const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
1921   const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
1922   const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
1923   const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
1924   const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
1925   const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
1926   btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
1927   btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
1928   btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
1929   btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
1930 }
1931 
iadst16_stage5_ssse3(__m128i * x)1932 static INLINE void iadst16_stage5_ssse3(__m128i *x) {
1933   btf_16_adds_subs_sse2(x[0], x[4]);
1934   btf_16_adds_subs_sse2(x[1], x[5]);
1935   btf_16_adds_subs_sse2(x[2], x[6]);
1936   btf_16_adds_subs_sse2(x[3], x[7]);
1937   btf_16_adds_subs_sse2(x[8], x[12]);
1938   btf_16_adds_subs_sse2(x[9], x[13]);
1939   btf_16_adds_subs_sse2(x[10], x[14]);
1940   btf_16_adds_subs_sse2(x[11], x[15]);
1941 }
1942 
iadst16_stage6_ssse3(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1943 static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
1944                                         const __m128i __rounding,
1945                                         int8_t cos_bit) {
1946   const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
1947   const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
1948   const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
1949   btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
1950   btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
1951   btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
1952   btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
1953 }
1954 
iadst16_stage7_ssse3(__m128i * x)1955 static INLINE void iadst16_stage7_ssse3(__m128i *x) {
1956   btf_16_adds_subs_sse2(x[0], x[2]);
1957   btf_16_adds_subs_sse2(x[1], x[3]);
1958   btf_16_adds_subs_sse2(x[4], x[6]);
1959   btf_16_adds_subs_sse2(x[5], x[7]);
1960   btf_16_adds_subs_sse2(x[8], x[10]);
1961   btf_16_adds_subs_sse2(x[9], x[11]);
1962   btf_16_adds_subs_sse2(x[12], x[14]);
1963   btf_16_adds_subs_sse2(x[13], x[15]);
1964 }
1965 
iadst16_stage8_ssse3(__m128i * x,const int32_t * cospi,const __m128i __rounding,int8_t cos_bit)1966 static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
1967                                         const __m128i __rounding,
1968                                         int8_t cos_bit) {
1969   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
1970   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
1971   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
1972   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
1973   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
1974   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
1975 }
1976 
iadst16_stage9_ssse3(__m128i * output,__m128i * x)1977 static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
1978   const __m128i __zero = _mm_setzero_si128();
1979   output[0] = x[0];
1980   output[1] = _mm_subs_epi16(__zero, x[8]);
1981   output[2] = x[12];
1982   output[3] = _mm_subs_epi16(__zero, x[4]);
1983   output[4] = x[6];
1984   output[5] = _mm_subs_epi16(__zero, x[14]);
1985   output[6] = x[10];
1986   output[7] = _mm_subs_epi16(__zero, x[2]);
1987   output[8] = x[3];
1988   output[9] = _mm_subs_epi16(__zero, x[11]);
1989   output[10] = x[15];
1990   output[11] = _mm_subs_epi16(__zero, x[7]);
1991   output[12] = x[5];
1992   output[13] = _mm_subs_epi16(__zero, x[13]);
1993   output[14] = x[9];
1994   output[15] = _mm_subs_epi16(__zero, x[1]);
1995 }
1996 
// 16-point inverse ADST specialized for the case where only input[0] is
// non-zero ("low1").  Butterflies with a zero partner collapse into plain
// copies, so only the rotations remain; stages 8-9 are shared helpers.
static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Consumed implicitly by the btf_16_sse2() macro expansions below.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);

  // stage 1: only the DC coefficient is read.
  __m128i x[16];
  x[1] = input[0];

  // stage 2
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);

  // stage 3: add/sub butterflies degenerate to copies (other terms zero).
  x[8] = x[0];
  x[9] = x[1];

  // stage 4
  btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);

  // stage 5: copies again (zero partners).
  x[4] = x[0];
  x[5] = x[1];
  x[12] = x[8];
  x[13] = x[9];

  // stage 6
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);

  // stage 7: copies (zero partners).
  x[2] = x[0];
  x[3] = x[1];
  x[6] = x[4];
  x[7] = x[5];
  x[10] = x[8];
  x[11] = x[9];
  x[14] = x[12];
  x[15] = x[13];

  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2045 
// 16-point inverse ADST for inputs whose only non-zero coefficients are
// the first eight ("low8").  Stage 2 uses the single-input btf_16_ssse3
// rotation since one operand of every pair is known to be zero; the rest
// of the network is shared with the full version via the stage helpers.
static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
                                   int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Consumed implicitly by the btf_16_* macros in the stage helpers.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: only input[0..7] are read, in permuted butterfly order.
  __m128i x[16];
  x[1] = input[0];
  x[3] = input[2];
  x[5] = input[4];
  x[7] = input[6];
  x[8] = input[7];
  x[10] = input[5];
  x[12] = input[3];
  x[14] = input[1];

  // stage 2: single-input rotations fill in the missing partners.
  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);

  // stage 3~9: shared with iadst16_new_sse2.
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
// Full 16-point inverse ADST on eight 16-bit columns per call.
// Stage 1 permutes the inputs into butterfly order, stage 2 applies the
// pairwise rotations, and stages 3-9 are shared with the reduced-input
// variants via the iadst16_stage*_ssse3 helpers.
void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Consumed implicitly by the btf_16_* macros in the stage helpers.
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);

  // stage 1: input permutation into butterfly order.
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2: pairwise rotations.
  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3~9
  iadst16_stage3_ssse3(x);
  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage5_ssse3(x);
  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage7_ssse3(x);
  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
  iadst16_stage9_ssse3(output, x);
}
2141 
// 16-point inverse ADST for 4-pixel-wide columns: each __m128i holds one
// 4-sample row in its low 64 bits, so the 4-point butterfly helper
// btf_16_4p_sse2 is used throughout. Stages 3/5/7/9 (add/sub stages) are
// shared with the 8-wide path via the iadst16_stage*_ssse3 helpers.
void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
                         int8_t cos_bit) {
  (void)cos_bit;  // Precision is fixed at INV_COS_BIT.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit (coefficient, coefficient) pairs for the butterflies
  // below; pNN/mNN encode +cospi[NN] / -cospi[NN].
  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
  const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
  const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
  const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
  const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
  const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
  const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);

  // stage 1: permute the inputs per the iadst16 flow graph.
  __m128i x[16];
  x[0] = input[15];
  x[1] = input[0];
  x[2] = input[13];
  x[3] = input[2];
  x[4] = input[11];
  x[5] = input[4];
  x[6] = input[9];
  x[7] = input[6];
  x[8] = input[7];
  x[9] = input[8];
  x[10] = input[5];
  x[11] = input[10];
  x[12] = input[3];
  x[13] = input[12];
  x[14] = input[1];
  x[15] = input[14];

  // stage 2: first rank of rotation butterflies.
  btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
  btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);

  // stage 3: butterfly add/sub (shared helper).
  iadst16_stage3_ssse3(x);

  // stage 4: rotations on the upper half only.
  btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
  btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);

  // stage 5: butterfly add/sub (shared helper).
  iadst16_stage5_ssse3(x);

  // stage 6: +/-16,48 rotations.
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);

  // stage 7: butterfly add/sub (shared helper).
  iadst16_stage7_ssse3(x);

  // stage 8: final +/-32 (1/sqrt(2)) rotations.
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
  btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);

  // stage 9: output permutation/negation (shared helper).
  iadst16_stage9_ssse3(output, x);
}
2235 
// 4-point inverse identity transform: scale every sample by sqrt(2).
// out = src + mulhrs(src, frac), where frac is the fractional part of
// sqrt(2) expressed in Q15, so the sum saturates at 16 bits.
static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
                                 int8_t cos_bit) {
  (void)cos_bit;  // Unused: the scale is a compile-time constant.
  const int16_t fraction = (int16_t)(NewSqrt2 - (1 << NewSqrt2Bits));
  const __m128i mult = _mm_set1_epi16(fraction << (15 - NewSqrt2Bits));
  int i = 0;
  do {
    const __m128i frac_part = _mm_mulhrs_epi16(input[i], mult);
    output[i] = _mm_adds_epi16(frac_part, input[i]);
  } while (++i < 4);
}
2246 
// 8-point inverse identity transform: the scale factor is exactly 2, so
// each lane is doubled with a 16-bit saturating addition.
static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
                                int8_t cos_bit) {
  (void)cos_bit;  // Unused: no trigonometric constants involved.
  int i = 0;
  do {
    output[i] = _mm_adds_epi16(input[i], input[i]);
  } while (++i < 8);
}
2254 
// 16-point inverse identity transform: scale every sample by 2*sqrt(2).
// Computed as 2*src plus the fractional remainder 2*(sqrt(2)-1)*src via
// mulhrs; all additions saturate at 16 bits.
static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
                                  int8_t cos_bit) {
  (void)cos_bit;  // Unused: the scale is a compile-time constant.
  const int16_t fraction = (int16_t)(2 * (NewSqrt2 - (1 << NewSqrt2Bits)));
  const __m128i mult = _mm_set1_epi16(fraction << (15 - NewSqrt2Bits));
  int i = 0;
  do {
    const __m128i frac_part = _mm_mulhrs_epi16(input[i], mult);
    const __m128i doubled = _mm_adds_epi16(input[i], input[i]);
    output[i] = _mm_adds_epi16(frac_part, doubled);
  } while (++i < 16);
}
2266 
// Reconstruct 8 pixels: widen the 8 predicted bytes (low half of `pred`)
// to 16 bits, add the residue with saturation, then clamp back down to
// unsigned 8-bit. The result is duplicated into both halves; callers
// store only the low 64 bits.
static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
                                               __m128i res) {
  const __m128i pred16 = _mm_unpacklo_epi8(pred, _mm_setzero_si128());
  const __m128i sum = _mm_adds_epi16(res, pred16);
  return _mm_packus_epi16(sum, sum);
}
2273 
// Reconstruct a 4-wide block: for each of `height` rows, load 4 predicted
// bytes, add the 16-bit residue row in[j] with saturation, clamp to uint8
// and store the 4 bytes back. When flipud is set, residue rows are
// consumed bottom-up.
// NOTE(review): the uint32_t* casts read/write the uint8_t rows as 32-bit
// words — strict-aliasing/alignment UB in ISO C, though tolerated by the
// x86 targets this file is built for; a memcpy would be cleaner. Confirm
// before relying on this elsewhere.
static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
                                               int stride, int flipud,
                                               const int height) {
  int j = flipud ? (height - 1) : 0;  // residue row index
  const int step = flipud ? -1 : 1;
  const __m128i zero = _mm_setzero_si128();
  for (int i = 0; i < height; ++i, j += step) {
    // Load 4 prediction bytes into the low 32 bits of a vector.
    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
    u = _mm_packus_epi16(u, zero);
    *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
  }
}
2287 
// Reconstruct an 8-wide block: add each 16-bit residue row to the 8
// predicted bytes and store the clamped result. When flipud is set,
// residue rows are consumed bottom-up.
static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
                                               int stride, int flipud,
                                               const int height) {
  const int step = flipud ? -1 : 1;
  int src = flipud ? (height - 1) : 0;
  for (int row = 0; row < height; ++row, src += step) {
    uint8_t *const dst = output + row * stride;
    const __m128i pred = _mm_loadl_epi64((__m128i const *)dst);
    const __m128i recon = lowbd_get_recon_8x8_sse2(pred, in[src]);
    _mm_storel_epi64((__m128i *)dst, recon);
  }
}
2299 
// 1D functions process 8 pixels at one time.
// Full 1-D inverse transforms, indexed [tx size][1-D type: DCT/ADST/IDTX].
// NULL entries are size/type combinations with no implementation (ADST and
// IDTX are not defined for 32- and 64-point transforms).
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
      { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
      { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
      { idct32_new_sse2, NULL, NULL },
      { idct64_low32_new_ssse3, NULL, NULL },
    };
2309 
2310 // functions for blocks with eob at DC and within
2311 // topleft 8x8, 16x16, 32x32 corner
// Reduced 1-D inverse transforms for blocks whose nonzero coefficients are
// confined to the top-left corner. Indexed
// [tx size][1-D type][lowbd_txfm_all_1d_zeros_idx bucket]; the last index
// selects the "low1"/"low8"/"low16"/full variant (array dimension is 4).
static const transform_1d_ssse3
    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          // 4-point: no reduced variants, the full transform is reused.
          { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
          { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
          { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
      },
      // 8-point.
      { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
        { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
        { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
      {
          // 16-point.
          { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
            NULL },
          { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
            NULL },
          { NULL, NULL, NULL, NULL },
      },
      // 32-point: DCT only.
      { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
          idct32_new_sse2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      // 64-point: DCT only; at most 32 coefficients are ever nonzero.
      { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
          idct64_low32_new_ssse3 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
2338 
// 1D functions process 4 pixels at one time.
// used in 4x4, 4x8, 4x16, 8x4, 16x4
// 4-pixel-wide variants of the 1-D inverse transforms, indexed
// [tx size][1-D type]. Only sizes up to 16 are needed (the narrow block
// shapes 4x4, 4x8, 4x16, 8x4, 16x4); larger sizes are NULL.
static const transform_1d_ssse3
    lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
      { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
      { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
      { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
      { NULL, NULL, NULL },
      { NULL, NULL, NULL },
    };
2349 
// Row (horizontal) identity pass for one 8-column group: loads `height`
// rows of 8 coefficients (32-bit narrowed to 16-bit), multiplies by the
// identity scale NewSqrt2list[txw_idx] and applies the row-pass shift and
// rounding in a single _mm_madd_epi16 per half. For rectangular blocks
// with rect_type +/-1 the input is pre-scaled by 1/sqrt(2) first.
static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding term for both the NewSqrt2Bits descale and the
  // row shift (shift is negative here, so the second term rounds the
  // right-shift by NewSqrt2Bits - shift).
  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                          (1 << (NewSqrt2Bits - shift - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // madd of (src, 1) with (scale, rounding) yields src*scale + rounding
  // as a 32-bit intermediate.
  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m128i src = load_32bit_to_16bit(input_row);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  } else {
    // Rectangular aspect 1:2 or 2:1: pre-multiply by 1/sqrt(2) in Q15.
    const __m128i rect_scale =
        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m128i src = load_32bit_to_16bit(input_row);
      src = _mm_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m128i lo = _mm_unpacklo_epi16(src, one);
      __m128i hi = _mm_unpackhi_epi16(src, one);
      lo = _mm_madd_epi16(lo, scale_rounding);
      hi = _mm_madd_epi16(hi, scale_rounding);
      lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm_packs_epi32(lo, hi);
    }
  }
}
2388 
// Column (vertical) identity pass plus reconstruction for one 8-column
// group: scales each 16-bit residue row by NewSqrt2list[txh_idx], applies
// the (negative) column shift with rounding, adds the prediction and
// stores the clamped 8 bytes per row.
static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
                                           __m128i *buf, int shift, int height,
                                           int txh_idx) {
  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
  // shift is <= 0 here; the final right-shift is by -shift.
  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
  const __m128i one = _mm_set1_epi16(1);
  // madd of (src, 1) with (scale, rounding) yields src*scale + rounding.
  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
  const __m128i zero = _mm_setzero_si128();
  for (int h = 0; h < height; ++h) {
    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
    lo = _mm_madd_epi16(lo, scale_coeff);
    hi = _mm_madd_epi16(hi, scale_coeff);
    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm_add_epi32(lo, shift_rounding);
    hi = _mm_add_epi32(hi, shift_rounding);
    lo = _mm_srai_epi32(lo, -shift);
    hi = _mm_srai_epi32(hi, -shift);
    __m128i x = _mm_packs_epi32(lo, hi);

    // Add the 8 predicted bytes and store the clamped reconstruction.
    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
    x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
    const __m128i u = _mm_packus_epi16(x, x);
    _mm_storel_epi64((__m128i *)(output), u);
    output += stride;
  }
}
2418 
// 2-D inverse transform where both dimensions are the identity (IDTX):
// each 8-column group is scaled by the row pass, then scaled and added to
// the prediction by the column pass. Both dimensions are clamped to 32
// because coefficients beyond 32 are always zero for 64-wide/-tall sizes.
static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
                                                   uint8_t *output, int stride,
                                                   TX_SIZE tx_size) {
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  __m128i buf[32];  // one 8-column group, up to 32 rows

  for (int i = 0; i < (input_stride >> 3); ++i) {
    iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
                            txw_idx, rect_type);
    iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
                            txh_idx);
  }
}
2439 
// 4x4 inverse transform and reconstruction: row transform on transposed
// input, transpose back (with optional left-right flip), column transform,
// final shift, then add to prediction.
void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size_, int eob) {
  (void)tx_size_;  // size is hard-wired to TX_4X4 below
  (void)eob;       // too small for an early-termination path
  __m128i buf[4];
  const TX_SIZE tx_size = TX_4X4;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x4(buf, buf);  // rows become vectors for the row pass
  row_txfm(buf, buf, cos_bit_row);
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x4(temp, buf);
  } else {
    transpose_16bit_4x4(buf, buf);
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2476 
// Reconstruct 16 pixels: widen the 16 predicted bytes into two 16-bit
// halves, add the two residue vectors with saturation, then clamp the
// result back to 16 unsigned bytes.
static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
                                                 __m128i res0, __m128i res1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo = _mm_adds_epi16(res0, _mm_unpacklo_epi8(pred, zero));
  const __m128i hi = _mm_adds_epi16(res1, _mm_unpackhi_epi8(pred, zero));
  return _mm_packus_epi16(lo, hi);
}
2486 
// Reconstruct a 16-wide block. Residue for columns 0-7 lives in
// in[0..height) and for columns 8-15 in in[height..2*height). When flipud
// is set, residue rows are consumed bottom-up.
static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
                                                int stride, int flipud,
                                                int height) {
  const int step = flipud ? -1 : 1;
  int src = flipud ? (height - 1) : 0;
  for (int row = 0; row < height; ++row, src += step) {
    uint8_t *const dst = output + row * stride;
    const __m128i pred = _mm_loadu_si128((__m128i const *)dst);
    const __m128i recon =
        lowbd_get_recon_16x16_sse2(pred, in[src], in[src + height]);
    _mm_storeu_si128((__m128i *)dst, recon);
  }
}
2498 
round_shift_ssse3(const __m128i * input,__m128i * output,int size)2499 static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
2500                                      int size) {
2501   const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
2502   for (int i = 0; i < size; ++i) {
2503     output[i] = _mm_mulhrs_epi16(input[i], scale);
2504   }
2505 }
2506 
// General 2-D inverse transform (neither dimension is the identity).
// Row pass runs only over the 8-row groups that contain nonzero
// coefficients (bounded by eoby/eobx); results are transposed into buf1 in
// column-major 8-column groups, then the column pass and final shift are
// applied per group, and the result is added to the prediction.
static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64 * 8];  // transposed intermediate, 8 columns x up to 64 rows each
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only 8-groups containing nonzero coefficients need processing.
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick the reduced row/column variants matching the eob extent.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    // Gather and transpose this 8-row strip so rows become vectors.
    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    // Transpose back into buf1, one txfm_size_row-tall group per 8 columns;
    // lr_flip reverses lane order and group order.
    __m128i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp,
                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
      }
    }
  }
  // Column pass and final shift, one 8-column group at a time.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
  }

  // Reconstruction: 16-wide stores when possible, otherwise 8-wide.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
                                   output + 16 * i, stride, ud_flip,
                                   txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
  }
}
2579 
lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2580 static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
2581     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2582     TX_SIZE tx_size, int eob) {
2583   const int8_t *shift = inv_txfm_shift_ls[tx_size];
2584   int eobx, eoby;
2585   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
2586   const int txw_idx = get_txw_idx(tx_size);
2587   const int txh_idx = get_txh_idx(tx_size);
2588   const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
2589   const int txfm_size_col = tx_size_wide[tx_size];
2590   const int txfm_size_row = tx_size_high[tx_size];
2591   const int buf_size_w_div8 = (eobx + 8) >> 3;
2592   const int input_stride = AOMMIN(32, txfm_size_col);
2593   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2594 
2595   const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
2596   assert(fun_idx < 5);
2597   const transform_1d_ssse3 col_txfm =
2598       lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
2599 
2600   assert(col_txfm != NULL);
2601 
2602   int ud_flip, lr_flip;
2603   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2604   for (int i = 0; i < buf_size_w_div8; i++) {
2605     __m128i buf0[64];
2606     iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
2607                             eoby + 1, txw_idx, rect_type);
2608     col_txfm(buf0, buf0, cos_bit_col);
2609     __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
2610     int k = ud_flip ? (txfm_size_row - 1) : 0;
2611     const int step = ud_flip ? -1 : 1;
2612     uint8_t *out = output + 8 * i;
2613     for (int j = 0; j < txfm_size_row; ++j, k += step) {
2614       const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
2615       __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
2616       const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
2617       _mm_storel_epi64((__m128i *)(out), u);
2618       out += stride;
2619     }
2620   }
2621 }
2622 
// 2-D inverse transform where the column (vertical) transform is the
// identity: a real 1-D row transform runs per 8-row strip, then the
// column pass reduces to a scale-and-add (iidentity_col_8xn_ssse3).
static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m128i buf1[64];  // one transposed 8-row strip (up to 8 groups of 8)
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only 8-row strips containing nonzero coefficients are processed.
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];

  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div8; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 8;
    // At most 32 columns of input are nonzero, hence the AOMMIN(4, ...).
    for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
      __m128i *buf0_cur = buf0 + j * 8;
      load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
      transpose_16bit_8x8(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
    // Transpose back; lr_flip reverses both lane order and group order.
    __m128i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        __m128i temp[8];
        flip_buf_sse2(buf0 + 8 * j, temp, 8);
        transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
      }
    }

    // Identity column pass + reconstruction for this 8-row strip.
    for (int j = 0; j < buf_size_w_div8; ++j) {
      iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
                              buf1 + j * 8, shift[1], 8, txh_idx);
    }
  }
}
2679 
2680 // for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_ssse3(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)2681 static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
2682     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2683     TX_SIZE tx_size, int eob) {
2684   switch (tx_type) {
2685     case DCT_DCT:
2686       lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2687                                              tx_size, eob);
2688       break;
2689     case IDTX:
2690       lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2691       break;
2692     case V_DCT:
2693     case V_ADST:
2694     case V_FLIPADST:
2695       lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
2696                                             tx_size, eob);
2697       break;
2698     case H_DCT:
2699     case H_ADST:
2700     case H_FLIPADST:
2701       lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
2702                                             tx_size, eob);
2703       break;
2704     default:
2705       lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
2706                                              tx_size, eob);
2707       break;
2708   }
2709 }
2710 
// 4x8 inverse transform and reconstruction. The 1:2 aspect requires a
// 1/sqrt(2) pre-scale (round_shift_ssse3) before the row transform.
void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size_, int eob) {
  (void)tx_size_;  // size is hard-wired to TX_4X8 below
  (void)eob;       // too small for an early-termination path
  __m128i buf[8];
  const TX_SIZE tx_size = TX_4X8;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  // Rows are 4 wide but there are 8 of them, so the row transform uses the
  // 8-wide table while the column transform uses the 4-wide table.
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_4x8(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf, cos_bit_row);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
  if (lr_flip) {
    __m128i temp[4];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_8x4(temp, buf);
  } else {
    transpose_16bit_8x4(buf, buf);
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2749 
// 8x4 inverse transform and reconstruction. The 2:1 aspect requires a
// 1/sqrt(2) pre-scale (round_shift_ssse3) before the row transform.
void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size_, int eob) {
  (void)tx_size_;  // size is hard-wired to TX_8X4 below
  (void)eob;       // too small for an early-termination path
  __m128i buf[8];
  const TX_SIZE tx_size = TX_8X4;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  // Only 4 rows, so the row transform uses the 4-wide table while the
  // column transform uses the 8-wide table.
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
  transpose_16bit_8x4(buf, buf);
  round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
  row_txfm(buf, buf, cos_bit_row);
  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
  if (lr_flip) {
    __m128i temp[8];
    flip_buf_sse2(buf, temp, txfm_size_col);
    transpose_16bit_4x8(temp, buf);
  } else {
    transpose_16bit_4x8(buf, buf);
  }
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2788 
// 4x16 inverse transform and reconstruction: the 16 rows are processed as
// two strips of 8 for the row pass, then a single 16-point column pass
// runs over the 4-wide columns.
void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
                                     int stride, TX_TYPE tx_type,
                                     TX_SIZE tx_size_, int eob) {
  (void)tx_size_;  // size is hard-wired to TX_4X16 below
  (void)eob;       // too small for an early-termination path
  __m128i buf[16];
  const TX_SIZE tx_size = TX_4X16;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];

  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass over two strips of 8 rows each.
  const int row_one_loop = 8;
  for (int i = 0; i < 2; ++i) {
    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
                                  row_one_loop);
    transpose_16bit_4x8(buf_cur, buf_cur);
    row_txfm(buf_cur, buf_cur, cos_bit_row);
    round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
    if (lr_flip) {
      __m128i temp[8];
      flip_buf_sse2(buf_cur, temp, txfm_size_col);
      transpose_16bit_8x4(temp, buf_cur);
    } else {
      transpose_16bit_8x4(buf_cur, buf_cur);
    }
  }
  // Single 16-point column pass over the whole buffer.
  col_txfm(buf, buf, cos_bit_col);
  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
2833 
// 16x4 low-bitdepth inverse 2-D transform: row transform then column
// transform, with the result added into |output|.
void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
                                     int stride, TX_TYPE tx_type,
                                     TX_SIZE tx_size_, int eob) {
  (void)tx_size_;  // geometry is fixed to TX_16X4 below
  (void)eob;
  __m128i buf[16];
  const TX_SIZE tx_size = TX_16X4;
  const int8_t *shift = inv_txfm_shift_ls[tx_size];  // per-stage round shifts
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];  // 16
  const int txfm_size_row = tx_size_high[tx_size];  // 4
  const int buf_size_w_div8 = txfm_size_col >> 3;   // two 8-column halves

  // Row pass uses the w4 kernel table, column pass the w8 table.
  const transform_1d_ssse3 row_txfm =
      lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_ssse3 col_txfm =
      lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // Load and transpose each 8-column half into row order.
  const int row_one_loop = 8;
  for (int i = 0; i < buf_size_w_div8; ++i) {
    const int32_t *input_cur = input + i * row_one_loop;
    __m128i *buf_cur = buf + i * row_one_loop;
    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
                               txfm_size_row);
    transpose_16bit_8x4(buf_cur, buf_cur);
  }
  // Single row transform across all 16 coefficients per row, then round.
  row_txfm(buf, buf, cos_bit_row);
  round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
  // Transpose back to column order, mirroring horizontally when lr_flip.
  if (lr_flip) {
    __m128i temp[16];
    flip_buf_sse2(buf, temp, 16);
    transpose_16bit_4x8(temp, buf);
    transpose_16bit_4x8(temp + 8, buf + 8);
  } else {
    transpose_16bit_4x8(buf, buf);
    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
  }
  // Column transform and final shift on each 8-wide half independently.
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
  }
  // Write the two 8x4 halves side by side into the destination.
  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
}
2883 
// Dispatch a low-bitdepth inverse 2-D transform by block size: the small
// and narrow sizes have dedicated implementations; every other size is
// handled by the generic "universe" routine.
void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, int eob) {
  if (tx_size == TX_4X4) {
    lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X8) {
    lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_8X4) {
    lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_size == TX_4X16) {
    lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else if (tx_size == TX_16X4) {
    lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
                                    eob);
  } else {
    lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
                                        tx_size, eob);
  }
}
// Entry point for the SSSE3 inverse transform + add. Lossless blocks have
// no SSSE3 path here and fall back to the C implementation.
void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                            const TxfmParam *txfm_param) {
  if (txfm_param->lossless) {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
    return;
  }
  av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, txfm_param->tx_type,
                                 txfm_param->tx_size, txfm_param->eob);
}
2924