/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

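// Clamp each 16-bit lane of u to the valid high-bitdepth pixel range
// [0, (1 << bd) - 1]: lanes above the maximum are replaced by the maximum,
// negative lanes are zeroed.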
static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}

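// Reconstruct 8 pixels: widen the 16-bit prediction to 32 bits, add the two
// 4-lane residual vectors, then pack and clamp back to bd-bit pixels.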
static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));

  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_packus_epi32(x0, x1);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

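// Add an 8-wide block of residuals to the prediction in 'output' and store
// the clamped result, walking the residual rows bottom-up when flipud is
// set. in[j] holds the low 4 lanes of row j, in[j + height] the high 4.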
static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

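// Butterfly step: *out0 = in0 + in1 and *out1 = in0 - in1, with both results
// clamped to [*clamp_lo, *clamp_hi] to keep intermediates in range.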
static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
                                   __m128i *out0, __m128i *out1) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  *out0 = a0;
  *out1 = a1;
}

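// Butterfly with a rounding right-shift: adds (1 << shift) >> 1 to in0 so
// that both the sum and the difference are rounded, shifts right by 'shift',
// then clamps. Used when a stage's output shift is folded into the butterfly.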
static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
                                __m128i *out0, __m128i *out1,
                                const __m128i *clamp_lo,
                                const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(in0, offset);
  __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
  __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

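// The idct32_stage*_sse4_1 helpers below each implement one stage of the
// 32-point inverse DCT butterfly network. half_btf_sse4_1 (from
// highbd_txfm_utility_sse4.h) computes one rotation output:
// (w0 * n0 + w1 * n1 + rounding) >> bit.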
static INLINE void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}

static INLINE void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const int log_range) {
  if (do_cols) {
    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
  }
}

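// Butterfly variant for stages that negate the second input:
// *out0 = (in0 + offset) >> shift and *out1 = (-in1 + offset) >> shift,
// both clamped to [*clamp_lo, *clamp_hi].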
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

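// 4x4 inverse DCT. The four input rows are transposed in registers before
// the 4-point butterflies, so one call serves as either the row or the
// column pass; the row pass (do_cols == 0) clamps its outputs to the
// intermediate range while the final column pass does not.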
static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));

  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  u0 = _mm_unpacklo_epi64(v0, v2);
  u1 = _mm_unpackhi_epi64(v0, v2);
  u2 = _mm_unpacklo_epi64(v1, v3);
  u3 = _mm_unpackhi_epi64(v1, v3);

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  if (do_cols) {
    addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
    addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
  } else {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
  }
}

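// 4x4 inverse ADST using the sinpi-based 4-point kernel. As with the DCT,
// the rows are transposed in registers first, and only the row pass
// (do_cols == 0) clamps the outputs.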
static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  x0 = _mm_unpacklo_epi64(v0, v2);
  x1 = _mm_unpackhi_epi64(v0, v2);
  x2 = _mm_unpacklo_epi64(v1, v3);
  x3 = _mm_unpackhi_epi64(v1, v3);

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  u0 = _mm_add_epi32(u0, rnding);
  u0 = _mm_srai_epi32(u0, bit);

  u1 = _mm_add_epi32(u1, rnding);
  u1 = _mm_srai_epi32(u1, bit);

  u2 = _mm_add_epi32(u2, rnding);
  u2 = _mm_srai_epi32(u2, bit);

  u3 = _mm_add_epi32(u3, rnding);
  u3 = _mm_srai_epi32(u3, bit);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    u0 = _mm_max_epi32(u0, clamp_lo);
    u0 = _mm_min_epi32(u0, clamp_hi);
    u1 = _mm_max_epi32(u1, clamp_lo);
    u1 = _mm_min_epi32(u1, clamp_hi);
    u2 = _mm_max_epi32(u2, clamp_lo);
    u2 = _mm_min_epi32(u2, clamp_hi);
    u3 = _mm_max_epi32(u3, clamp_lo);
    u3 = _mm_min_epi32(u3, clamp_hi);
  }

  in[0] = u0;
  in[1] = u1;
  in[2] = u2;
  in[3] = u3;
}

static INLINE void round_shift_4x4(__m128i *in, int shift) {
  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));

  in[0] = _mm_add_epi32(in[0], rnding);
  in[1] = _mm_add_epi32(in[1], rnding);
  in[2] = _mm_add_epi32(in[2], rnding);
  in[3] = _mm_add_epi32(in[3], rnding);

  in[0] = _mm_srai_epi32(in[0], shift);
  in[1] = _mm_srai_epi32(in[1], shift);
  in[2] = _mm_srai_epi32(in[2], shift);
  in[3] = _mm_srai_epi32(in[3], shift);
}

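// Round the 4x4 residual by 'shift', add it to the predicted pixels in
// 'output' (optionally flipped left/right and/or up/down), clamp to bd bits
// and store the reconstructed rows.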
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}

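// 2-D 4x4 inverse transform plus reconstruction: each case runs the row
// pass, then the column pass, then adds the residual to the prediction.
// The flip flags passed to write_buffer_4x4 realize the FLIPADST variants.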
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

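// 8x8 inverse DCT. Stage 2 forms the odd-index rotations (inputs 1, 3, 5,
// 7), stage 3 the even half (0, 2, 4, 6), and stage 5 produces the outputs,
// applying out_shift on the row pass (do_cols == 0).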
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  //  Even columns: 0, 2, ..., 14
  //  Odd columns: 1, 3, ..., 15
  //  One even column plus one odd column makes up one row (8 coeffs);
  //  in total there are 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    if (do_cols) {
      addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
      addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
      addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
      addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
      addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

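// 8x8 inverse ADST, processed as two interleaved 4-lane halves: the
// even-indexed vectors of in[] first, then the odd-indexed ones. The final
// stage negates every other output; on the row pass neg_shift_sse4_1 folds
// that negation into the rounding shift.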
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

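// Widen one row of 16-bit prediction to 32 bits, add the two residual
// halves (swapped and reversed when fliplr is set), then pack and clamp to
// bd bits.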
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
                             int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);

  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

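// Fast path for an 8x8 IDCT whose only nonzero coefficient is DC: every
// output equals in[0] * cospi32 (rounded), so the butterfly stages collapse
// to a single rotation broadcast to all eight outputs.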
idct8x8_low1_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)1261 static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1262                                 int bd, int out_shift) {
1263   const int32_t *cospi = cospi_arr(bit);
1264   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1265   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1266   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1267   __m128i x;
1268 
1269   // stage 0
1270   // stage 1
1271   // stage 2
1272   // stage 3
1273   x = _mm_mullo_epi32(in[0], cospi32);
1274   x = _mm_add_epi32(x, rnding);
1275   x = _mm_srai_epi32(x, bit);
1276 
1277   // stage 4
1278   // stage 5
1279   if (!do_cols) {
1280     const int log_range_out = AOMMAX(16, bd + 6);
1281     const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1282         -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1283     const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1284         (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1285 
1286     __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1287     x = _mm_add_epi32(x, offset);
1288     x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
1289     x = _mm_max_epi32(x, clamp_lo_out);
1290     x = _mm_min_epi32(x, clamp_hi_out);
1291   }
1292 
1293   out[0] = x;
1294   out[1] = x;
1295   out[2] = x;
1296   out[3] = x;
1297   out[4] = x;
1298   out[5] = x;
1299   out[6] = x;
1300   out[7] = x;
1301 }
1302 
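// Full 8-point inverse DCT over 32-bit lanes; each __m128i holds four
// coefficients, so one call processes four columns in parallel. Stages 2-4
// are rotation butterflies of the form (a * c0 + b * c1 + rnding) >> bit,
// with addsub_sse4_1() providing the clamped add/subtract pairs; stage 5
// emits the mirrored outputs, shifting and re-clamping only in the row pass.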
1303 static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1304                                int bd, int out_shift) {
1305   const int32_t *cospi = cospi_arr(bit);
1306   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1307   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1308   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1309   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1310   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1311   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1312   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1313   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1314   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1315   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1316   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1317   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1318   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1319   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1320   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
1321   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
1322   __m128i x, y;
1323 
1324   // stage 0
1325   // stage 1
1326   // stage 2
1327   u0 = in[0];
1328   u1 = in[4];
1329   u2 = in[2];
1330   u3 = in[6];
1331 
1332   x = _mm_mullo_epi32(in[1], cospi56);
1333   y = _mm_mullo_epi32(in[7], cospim8);
1334   u4 = _mm_add_epi32(x, y);
1335   u4 = _mm_add_epi32(u4, rnding);
1336   u4 = _mm_srai_epi32(u4, bit);
1337 
1338   x = _mm_mullo_epi32(in[1], cospi8);
1339   y = _mm_mullo_epi32(in[7], cospi56);
1340   u7 = _mm_add_epi32(x, y);
1341   u7 = _mm_add_epi32(u7, rnding);
1342   u7 = _mm_srai_epi32(u7, bit);
1343 
1344   x = _mm_mullo_epi32(in[5], cospi24);
1345   y = _mm_mullo_epi32(in[3], cospim40);
1346   u5 = _mm_add_epi32(x, y);
1347   u5 = _mm_add_epi32(u5, rnding);
1348   u5 = _mm_srai_epi32(u5, bit);
1349 
1350   x = _mm_mullo_epi32(in[5], cospi40);
1351   y = _mm_mullo_epi32(in[3], cospi24);
1352   u6 = _mm_add_epi32(x, y);
1353   u6 = _mm_add_epi32(u6, rnding);
1354   u6 = _mm_srai_epi32(u6, bit);
1355 
1356   // stage 3
1357   x = _mm_mullo_epi32(u0, cospi32);
1358   y = _mm_mullo_epi32(u1, cospi32);
1359   v0 = _mm_add_epi32(x, y);
1360   v0 = _mm_add_epi32(v0, rnding);
1361   v0 = _mm_srai_epi32(v0, bit);
1362 
1363   v1 = _mm_sub_epi32(x, y);
1364   v1 = _mm_add_epi32(v1, rnding);
1365   v1 = _mm_srai_epi32(v1, bit);
1366 
1367   x = _mm_mullo_epi32(u2, cospi48);
1368   y = _mm_mullo_epi32(u3, cospim16);
1369   v2 = _mm_add_epi32(x, y);
1370   v2 = _mm_add_epi32(v2, rnding);
1371   v2 = _mm_srai_epi32(v2, bit);
1372 
1373   x = _mm_mullo_epi32(u2, cospi16);
1374   y = _mm_mullo_epi32(u3, cospi48);
1375   v3 = _mm_add_epi32(x, y);
1376   v3 = _mm_add_epi32(v3, rnding);
1377   v3 = _mm_srai_epi32(v3, bit);
1378 
1379   addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1380   addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1381 
1382   // stage 4
1383   addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1384   addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1385   u4 = v4;
1386   u7 = v7;
1387 
1388   x = _mm_mullo_epi32(v5, cospi32);
1389   y = _mm_mullo_epi32(v6, cospi32);
1390   u6 = _mm_add_epi32(y, x);
1391   u6 = _mm_add_epi32(u6, rnding);
1392   u6 = _mm_srai_epi32(u6, bit);
1393 
1394   u5 = _mm_sub_epi32(y, x);
1395   u5 = _mm_add_epi32(u5, rnding);
1396   u5 = _mm_srai_epi32(u5, bit);
1397 
1398   // stage 5
1399   if (do_cols) {
1400     addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
1401     addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
1402     addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
1403     addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
1404   } else {
1405     const int log_range_out = AOMMAX(16, bd + 6);
1406     const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1407         -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1408     const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1409         (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1410     addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
1411                         out_shift);
1412     addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
1413                         out_shift);
1414     addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
1415                         out_shift);
1416     addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
1417                         out_shift);
1418   }
1419 }
1420 
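// 8-point inverse ADST fast path: with only in[0] nonzero, stage 2 produces
// just the u[0]/u[1] pair, which the later stages rotate by cospi16/cospi48
// and then cospi32. Stage 7 applies the ADST output permutation, negating
// every other output (via neg_shift_sse4_1() in the row pass).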
1421 static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1422                                  int do_cols, int bd, int out_shift) {
1423   const int32_t *cospi = cospi_arr(bit);
1424   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1425   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1426   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1427   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1428   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1429   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1430   const __m128i kZero = _mm_setzero_si128();
1431   __m128i u[8], x;
1432 
1433   // stage 0
1434   // stage 1
1435   // stage 2
1436 
1437   x = _mm_mullo_epi32(in[0], cospi60);
1438   u[0] = _mm_add_epi32(x, rnding);
1439   u[0] = _mm_srai_epi32(u[0], bit);
1440 
1441   x = _mm_mullo_epi32(in[0], cospi4);
1442   u[1] = _mm_sub_epi32(kZero, x);
1443   u[1] = _mm_add_epi32(u[1], rnding);
1444   u[1] = _mm_srai_epi32(u[1], bit);
1445 
1446   // stage 3
1447   // stage 4
1448   __m128i temp1, temp2;
1449   temp1 = _mm_mullo_epi32(u[0], cospi16);
1450   x = _mm_mullo_epi32(u[1], cospi48);
1451   temp1 = _mm_add_epi32(temp1, x);
1452   temp1 = _mm_add_epi32(temp1, rnding);
1453   temp1 = _mm_srai_epi32(temp1, bit);
1454   u[4] = temp1;
1455 
1456   temp2 = _mm_mullo_epi32(u[0], cospi48);
1457   x = _mm_mullo_epi32(u[1], cospi16);
1458   u[5] = _mm_sub_epi32(temp2, x);
1459   u[5] = _mm_add_epi32(u[5], rnding);
1460   u[5] = _mm_srai_epi32(u[5], bit);
1461 
1462   // stage 5
1463   // stage 6
1464   temp1 = _mm_mullo_epi32(u[0], cospi32);
1465   x = _mm_mullo_epi32(u[1], cospi32);
1466   u[2] = _mm_add_epi32(temp1, x);
1467   u[2] = _mm_add_epi32(u[2], rnding);
1468   u[2] = _mm_srai_epi32(u[2], bit);
1469 
1470   u[3] = _mm_sub_epi32(temp1, x);
1471   u[3] = _mm_add_epi32(u[3], rnding);
1472   u[3] = _mm_srai_epi32(u[3], bit);
1473 
1474   temp1 = _mm_mullo_epi32(u[4], cospi32);
1475   x = _mm_mullo_epi32(u[5], cospi32);
1476   u[6] = _mm_add_epi32(temp1, x);
1477   u[6] = _mm_add_epi32(u[6], rnding);
1478   u[6] = _mm_srai_epi32(u[6], bit);
1479 
1480   u[7] = _mm_sub_epi32(temp1, x);
1481   u[7] = _mm_add_epi32(u[7], rnding);
1482   u[7] = _mm_srai_epi32(u[7], bit);
1483 
1484   // stage 7
1485   if (do_cols) {
1486     out[0] = u[0];
1487     out[1] = _mm_sub_epi32(kZero, u[4]);
1488     out[2] = u[6];
1489     out[3] = _mm_sub_epi32(kZero, u[2]);
1490     out[4] = u[3];
1491     out[5] = _mm_sub_epi32(kZero, u[7]);
1492     out[6] = u[5];
1493     out[7] = _mm_sub_epi32(kZero, u[1]);
1494   } else {
1495     const int log_range_out = AOMMAX(16, bd + 6);
1496     const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1497     const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1498 
1499     neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1500                      out_shift);
1501     neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1502                      out_shift);
1503     neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1504                      out_shift);
1505     neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1506                      out_shift);
1507   }
1508 }
1509 
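// Full 8-point inverse ADST: stage 2 rotates the input pairs by the odd
// cosines (cospi4/60, cospi20/44, cospi36/28, cospi52/12), stages 3-6
// alternate clamped add/sub butterflies with cospi16/48 and cospi32
// rotations, and stage 7 writes the outputs in ADST order with alternating
// sign flips.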
1510 static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1511                                 int bd, int out_shift) {
1512   const int32_t *cospi = cospi_arr(bit);
1513   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1514   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1515   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1516   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1517   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1518   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1519   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1520   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1521   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1522   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1523   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1524   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1525   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1526   const __m128i kZero = _mm_setzero_si128();
1527   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1528   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1529   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1530   __m128i u[8], v[8], x;
1531 
1532   // stage 0
1533   // stage 1
1534   // stage 2
1535 
1536   u[0] = _mm_mullo_epi32(in[7], cospi4);
1537   x = _mm_mullo_epi32(in[0], cospi60);
1538   u[0] = _mm_add_epi32(u[0], x);
1539   u[0] = _mm_add_epi32(u[0], rnding);
1540   u[0] = _mm_srai_epi32(u[0], bit);
1541 
1542   u[1] = _mm_mullo_epi32(in[7], cospi60);
1543   x = _mm_mullo_epi32(in[0], cospi4);
1544   u[1] = _mm_sub_epi32(u[1], x);
1545   u[1] = _mm_add_epi32(u[1], rnding);
1546   u[1] = _mm_srai_epi32(u[1], bit);
1547 
1548   // pair 2: in[5]/in[2], rotated by cospi20/cospi44
1549   u[2] = _mm_mullo_epi32(in[5], cospi20);
1550   x = _mm_mullo_epi32(in[2], cospi44);
1551   u[2] = _mm_add_epi32(u[2], x);
1552   u[2] = _mm_add_epi32(u[2], rnding);
1553   u[2] = _mm_srai_epi32(u[2], bit);
1554 
1555   u[3] = _mm_mullo_epi32(in[5], cospi44);
1556   x = _mm_mullo_epi32(in[2], cospi20);
1557   u[3] = _mm_sub_epi32(u[3], x);
1558   u[3] = _mm_add_epi32(u[3], rnding);
1559   u[3] = _mm_srai_epi32(u[3], bit);
1560 
1561   // pair 3: in[3]/in[4], rotated by cospi36/cospi28
1562   u[4] = _mm_mullo_epi32(in[3], cospi36);
1563   x = _mm_mullo_epi32(in[4], cospi28);
1564   u[4] = _mm_add_epi32(u[4], x);
1565   u[4] = _mm_add_epi32(u[4], rnding);
1566   u[4] = _mm_srai_epi32(u[4], bit);
1567 
1568   u[5] = _mm_mullo_epi32(in[3], cospi28);
1569   x = _mm_mullo_epi32(in[4], cospi36);
1570   u[5] = _mm_sub_epi32(u[5], x);
1571   u[5] = _mm_add_epi32(u[5], rnding);
1572   u[5] = _mm_srai_epi32(u[5], bit);
1573 
1574   // pair 4: in[1]/in[6], rotated by cospi52/cospi12
1575   u[6] = _mm_mullo_epi32(in[1], cospi52);
1576   x = _mm_mullo_epi32(in[6], cospi12);
1577   u[6] = _mm_add_epi32(u[6], x);
1578   u[6] = _mm_add_epi32(u[6], rnding);
1579   u[6] = _mm_srai_epi32(u[6], bit);
1580 
1581   u[7] = _mm_mullo_epi32(in[1], cospi12);
1582   x = _mm_mullo_epi32(in[6], cospi52);
1583   u[7] = _mm_sub_epi32(u[7], x);
1584   u[7] = _mm_add_epi32(u[7], rnding);
1585   u[7] = _mm_srai_epi32(u[7], bit);
1586 
1587   // stage 3
1588   addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1589   addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1590   addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1591   addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1592 
1593   // stage 4
1594   u[0] = v[0];
1595   u[1] = v[1];
1596   u[2] = v[2];
1597   u[3] = v[3];
1598 
1599   u[4] = _mm_mullo_epi32(v[4], cospi16);
1600   x = _mm_mullo_epi32(v[5], cospi48);
1601   u[4] = _mm_add_epi32(u[4], x);
1602   u[4] = _mm_add_epi32(u[4], rnding);
1603   u[4] = _mm_srai_epi32(u[4], bit);
1604 
1605   u[5] = _mm_mullo_epi32(v[4], cospi48);
1606   x = _mm_mullo_epi32(v[5], cospi16);
1607   u[5] = _mm_sub_epi32(u[5], x);
1608   u[5] = _mm_add_epi32(u[5], rnding);
1609   u[5] = _mm_srai_epi32(u[5], bit);
1610 
1611   u[6] = _mm_mullo_epi32(v[6], cospim48);
1612   x = _mm_mullo_epi32(v[7], cospi16);
1613   u[6] = _mm_add_epi32(u[6], x);
1614   u[6] = _mm_add_epi32(u[6], rnding);
1615   u[6] = _mm_srai_epi32(u[6], bit);
1616 
1617   u[7] = _mm_mullo_epi32(v[6], cospi16);
1618   x = _mm_mullo_epi32(v[7], cospim48);
1619   u[7] = _mm_sub_epi32(u[7], x);
1620   u[7] = _mm_add_epi32(u[7], rnding);
1621   u[7] = _mm_srai_epi32(u[7], bit);
1622 
1623   // stage 5
1624   addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1625   addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1626   addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1627   addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1628 
1629   // stage 6
1630   u[0] = v[0];
1631   u[1] = v[1];
1632   u[4] = v[4];
1633   u[5] = v[5];
1634 
1635   v[0] = _mm_mullo_epi32(v[2], cospi32);
1636   x = _mm_mullo_epi32(v[3], cospi32);
1637   u[2] = _mm_add_epi32(v[0], x);
1638   u[2] = _mm_add_epi32(u[2], rnding);
1639   u[2] = _mm_srai_epi32(u[2], bit);
1640 
1641   u[3] = _mm_sub_epi32(v[0], x);
1642   u[3] = _mm_add_epi32(u[3], rnding);
1643   u[3] = _mm_srai_epi32(u[3], bit);
1644 
1645   v[0] = _mm_mullo_epi32(v[6], cospi32);
1646   x = _mm_mullo_epi32(v[7], cospi32);
1647   u[6] = _mm_add_epi32(v[0], x);
1648   u[6] = _mm_add_epi32(u[6], rnding);
1649   u[6] = _mm_srai_epi32(u[6], bit);
1650 
1651   u[7] = _mm_sub_epi32(v[0], x);
1652   u[7] = _mm_add_epi32(u[7], rnding);
1653   u[7] = _mm_srai_epi32(u[7], bit);
1654 
1655   // stage 7
1656   if (do_cols) {
1657     out[0] = u[0];
1658     out[1] = _mm_sub_epi32(kZero, u[4]);
1659     out[2] = u[6];
1660     out[3] = _mm_sub_epi32(kZero, u[2]);
1661     out[4] = u[3];
1662     out[5] = _mm_sub_epi32(kZero, u[7]);
1663     out[6] = u[5];
1664     out[7] = _mm_sub_epi32(kZero, u[1]);
1665   } else {
1666     const int log_range_out = AOMMAX(16, bd + 6);
1667     const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1668     const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1669 
1670     neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1671                      out_shift);
1672     neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1673                      out_shift);
1674     neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1675                      out_shift);
1676     neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1677                      out_shift);
1678   }
1679 }
1680 
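// DC-only fast path for the 16x16 inverse DCT: in[0] * cospi[32] with
// rounding, clamped (and shifted in the row pass), then replicated across
// all sixteen outputs.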
1681 static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1682                                   int do_cols, int bd, int out_shift) {
1683   const int32_t *cospi = cospi_arr(bit);
1684   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1685   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1686   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1687   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1688   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1689 
1690   {
1691     // stage 0
1692     // stage 1
1693     // stage 2
1694     // stage 3
1695     // stage 4
1696     in[0] = _mm_mullo_epi32(in[0], cospi32);
1697     in[0] = _mm_add_epi32(in[0], rnding);
1698     in[0] = _mm_srai_epi32(in[0], bit);
1699 
1700     // stage 5
1701     // stage 6
1702     // stage 7
1703     if (do_cols) {
1704       in[0] = _mm_max_epi32(in[0], clamp_lo);
1705       in[0] = _mm_min_epi32(in[0], clamp_hi);
1706     } else {
1707       const int log_range_out = AOMMAX(16, bd + 6);
1708       const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1709           -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1710       const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1711           (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1712       __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1713       in[0] = _mm_add_epi32(in[0], offset);
1714       in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1715       in[0] = _mm_max_epi32(in[0], clamp_lo_out);
1716       in[0] = _mm_min_epi32(in[0], clamp_hi_out);
1717     }
1718 
1719     out[0] = in[0];
1720     out[1] = in[0];
1721     out[2] = in[0];
1722     out[3] = in[0];
1723     out[4] = in[0];
1724     out[5] = in[0];
1725     out[6] = in[0];
1726     out[7] = in[0];
1727     out[8] = in[0];
1728     out[9] = in[0];
1729     out[10] = in[0];
1730     out[11] = in[0];
1731     out[12] = in[0];
1732     out[13] = in[0];
1733     out[14] = in[0];
1734     out[15] = in[0];
1735   }
1736 }
1737 
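// 16-point inverse DCT when only the first eight coefficients are nonzero.
// half_btf_0_sse4_1() is the one-input half butterfly ((a * c + rnding) >>
// bit), used wherever the partner coefficient is known to be zero; from
// stage 4 onward the flow matches the full 16-point transform.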
1738 static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
1739                                   int do_cols, int bd, int out_shift) {
1740   const int32_t *cospi = cospi_arr(bit);
1741   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1742   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1743   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1744   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1745   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1746   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1747   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1748   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1749   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1750   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1751   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1752   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1753   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1754   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1755   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1756   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
1757   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
1758   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1759   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1760   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1761   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1762   __m128i u[16], x, y;
1763 
1764   {
1765     // stage 0
1766     // stage 1
1767     u[0] = in[0];
1768     u[2] = in[4];
1769     u[4] = in[2];
1770     u[6] = in[6];
1771     u[8] = in[1];
1772     u[10] = in[5];
1773     u[12] = in[3];
1774     u[14] = in[7];
1775 
1776     // stage 2
1777     u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
1778     u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
1779 
1780     u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
1781     u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
1782 
1783     u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
1784     u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
1785 
1786     u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
1787     u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
1788 
1789     // stage 3
1790     u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
1791     u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
1792     u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
1793     u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
1794 
1795     addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1796     addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1797     addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1798     addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1799 
1800     // stage 4
1801     x = _mm_mullo_epi32(u[0], cospi32);
1802     u[0] = _mm_add_epi32(x, rnding);
1803     u[0] = _mm_srai_epi32(u[0], bit);
1804     u[1] = u[0];
1805 
1806     u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
1807     u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
1808 
1809     addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1810     addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1811 
1812     x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1813     u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1814     u[9] = x;
1815     y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1816     u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1817     u[10] = y;
1818 
1819     // stage 5
1820     addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1821     addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1822 
1823     x = _mm_mullo_epi32(u[5], cospi32);
1824     y = _mm_mullo_epi32(u[6], cospi32);
1825     u[5] = _mm_sub_epi32(y, x);
1826     u[5] = _mm_add_epi32(u[5], rnding);
1827     u[5] = _mm_srai_epi32(u[5], bit);
1828 
1829     u[6] = _mm_add_epi32(y, x);
1830     u[6] = _mm_add_epi32(u[6], rnding);
1831     u[6] = _mm_srai_epi32(u[6], bit);
1832 
1833     addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1834     addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1835     addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1836     addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1837 
1838     // stage 6
1839     addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1840     addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1841     addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1842     addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1843 
1844     x = _mm_mullo_epi32(u[10], cospi32);
1845     y = _mm_mullo_epi32(u[13], cospi32);
1846     u[10] = _mm_sub_epi32(y, x);
1847     u[10] = _mm_add_epi32(u[10], rnding);
1848     u[10] = _mm_srai_epi32(u[10], bit);
1849 
1850     u[13] = _mm_add_epi32(x, y);
1851     u[13] = _mm_add_epi32(u[13], rnding);
1852     u[13] = _mm_srai_epi32(u[13], bit);
1853 
1854     x = _mm_mullo_epi32(u[11], cospi32);
1855     y = _mm_mullo_epi32(u[12], cospi32);
1856     u[11] = _mm_sub_epi32(y, x);
1857     u[11] = _mm_add_epi32(u[11], rnding);
1858     u[11] = _mm_srai_epi32(u[11], bit);
1859 
1860     u[12] = _mm_add_epi32(x, y);
1861     u[12] = _mm_add_epi32(u[12], rnding);
1862     u[12] = _mm_srai_epi32(u[12], bit);
1863     // stage 7
1864     if (do_cols) {
1865       addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
1866       addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
1867       addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
1868       addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
1869       addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
1870       addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
1871       addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
1872       addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
1873     } else {
1874       const int log_range_out = AOMMAX(16, bd + 6);
1875       const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1876           -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1877       const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1878           (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1879 
1880       addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
1881                           &clamp_hi_out, out_shift);
1882       addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
1883                           &clamp_hi_out, out_shift);
1884       addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
1885                           &clamp_hi_out, out_shift);
1886       addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
1887                           &clamp_hi_out, out_shift);
1888       addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
1889                           &clamp_hi_out, out_shift);
1890       addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
1891                           &clamp_hi_out, out_shift);
1892       addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
1893                           &clamp_hi_out, out_shift);
1894       addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
1895                           &clamp_hi_out, out_shift);
1896     }
1897   }
1898 }
1899 
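// 16-point inverse ADST with a single nonzero input: only the v[0]/v[1] pair
// is live, so stages 3-7 merely copy it into the other butterfly slots
// between the cospi8/56, cospi16/48 and cospi32 rotations; stage 9 emits the
// permuted, alternately negated outputs.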
1900 static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1901                                    int do_cols, int bd, int out_shift) {
1902   const int32_t *cospi = cospi_arr(bit);
1903   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1904   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1905   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1906   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1907   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1908   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1909   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1910   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1911   const __m128i zero = _mm_setzero_si128();
1912   __m128i v[16], x, y, temp1, temp2;
1913 
1914   // Process columns 0, 1, 2 and 3 in parallel (one column per 32-bit lane)
1915   {
1916     // stage 0
1917     // stage 1
1918     // stage 2
1919     x = _mm_mullo_epi32(in[0], cospi62);
1920     v[0] = _mm_add_epi32(x, rnding);
1921     v[0] = _mm_srai_epi32(v[0], bit);
1922 
1923     x = _mm_mullo_epi32(in[0], cospi2);
1924     v[1] = _mm_sub_epi32(zero, x);
1925     v[1] = _mm_add_epi32(v[1], rnding);
1926     v[1] = _mm_srai_epi32(v[1], bit);
1927 
1928     // stage 3
1929     v[8] = v[0];
1930     v[9] = v[1];
1931 
1932     // stage 4
1933     temp1 = _mm_mullo_epi32(v[8], cospi8);
1934     x = _mm_mullo_epi32(v[9], cospi56);
1935     temp1 = _mm_add_epi32(temp1, x);
1936     temp1 = _mm_add_epi32(temp1, rnding);
1937     temp1 = _mm_srai_epi32(temp1, bit);
1938 
1939     temp2 = _mm_mullo_epi32(v[8], cospi56);
1940     x = _mm_mullo_epi32(v[9], cospi8);
1941     temp2 = _mm_sub_epi32(temp2, x);
1942     temp2 = _mm_add_epi32(temp2, rnding);
1943     temp2 = _mm_srai_epi32(temp2, bit);
1944     v[8] = temp1;
1945     v[9] = temp2;
1946 
1947     // stage 5
1948     v[4] = v[0];
1949     v[5] = v[1];
1950     v[12] = v[8];
1951     v[13] = v[9];
1952 
1953     // stage 6
1954     temp1 = _mm_mullo_epi32(v[4], cospi16);
1955     x = _mm_mullo_epi32(v[5], cospi48);
1956     temp1 = _mm_add_epi32(temp1, x);
1957     temp1 = _mm_add_epi32(temp1, rnding);
1958     temp1 = _mm_srai_epi32(temp1, bit);
1959 
1960     temp2 = _mm_mullo_epi32(v[4], cospi48);
1961     x = _mm_mullo_epi32(v[5], cospi16);
1962     temp2 = _mm_sub_epi32(temp2, x);
1963     temp2 = _mm_add_epi32(temp2, rnding);
1964     temp2 = _mm_srai_epi32(temp2, bit);
1965     v[4] = temp1;
1966     v[5] = temp2;
1967 
1968     temp1 = _mm_mullo_epi32(v[12], cospi16);
1969     x = _mm_mullo_epi32(v[13], cospi48);
1970     temp1 = _mm_add_epi32(temp1, x);
1971     temp1 = _mm_add_epi32(temp1, rnding);
1972     temp1 = _mm_srai_epi32(temp1, bit);
1973 
1974     temp2 = _mm_mullo_epi32(v[12], cospi48);
1975     x = _mm_mullo_epi32(v[13], cospi16);
1976     temp2 = _mm_sub_epi32(temp2, x);
1977     temp2 = _mm_add_epi32(temp2, rnding);
1978     temp2 = _mm_srai_epi32(temp2, bit);
1979     v[12] = temp1;
1980     v[13] = temp2;
1981 
1982     // stage 7
1983     v[2] = v[0];
1984     v[3] = v[1];
1985     v[6] = v[4];
1986     v[7] = v[5];
1987     v[10] = v[8];
1988     v[11] = v[9];
1989     v[14] = v[12];
1990     v[15] = v[13];
1991 
1992     // stage 8
1993     y = _mm_mullo_epi32(v[2], cospi32);
1994     x = _mm_mullo_epi32(v[3], cospi32);
1995     v[2] = _mm_add_epi32(y, x);
1996     v[2] = _mm_add_epi32(v[2], rnding);
1997     v[2] = _mm_srai_epi32(v[2], bit);
1998 
1999     v[3] = _mm_sub_epi32(y, x);
2000     v[3] = _mm_add_epi32(v[3], rnding);
2001     v[3] = _mm_srai_epi32(v[3], bit);
2002 
2003     y = _mm_mullo_epi32(v[6], cospi32);
2004     x = _mm_mullo_epi32(v[7], cospi32);
2005     v[6] = _mm_add_epi32(y, x);
2006     v[6] = _mm_add_epi32(v[6], rnding);
2007     v[6] = _mm_srai_epi32(v[6], bit);
2008 
2009     v[7] = _mm_sub_epi32(y, x);
2010     v[7] = _mm_add_epi32(v[7], rnding);
2011     v[7] = _mm_srai_epi32(v[7], bit);
2012 
2013     y = _mm_mullo_epi32(v[10], cospi32);
2014     x = _mm_mullo_epi32(v[11], cospi32);
2015     v[10] = _mm_add_epi32(y, x);
2016     v[10] = _mm_add_epi32(v[10], rnding);
2017     v[10] = _mm_srai_epi32(v[10], bit);
2018 
2019     v[11] = _mm_sub_epi32(y, x);
2020     v[11] = _mm_add_epi32(v[11], rnding);
2021     v[11] = _mm_srai_epi32(v[11], bit);
2022 
2023     y = _mm_mullo_epi32(v[14], cospi32);
2024     x = _mm_mullo_epi32(v[15], cospi32);
2025     v[14] = _mm_add_epi32(y, x);
2026     v[14] = _mm_add_epi32(v[14], rnding);
2027     v[14] = _mm_srai_epi32(v[14], bit);
2028 
2029     v[15] = _mm_sub_epi32(y, x);
2030     v[15] = _mm_add_epi32(v[15], rnding);
2031     v[15] = _mm_srai_epi32(v[15], bit);
2032 
2033     // stage 9
2034     if (do_cols) {
2035       out[0] = v[0];
2036       out[1] = _mm_sub_epi32(zero, v[8]);
2037       out[2] = v[12];
2038       out[3] = _mm_sub_epi32(zero, v[4]);
2039       out[4] = v[6];
2040       out[5] = _mm_sub_epi32(zero, v[14]);
2041       out[6] = v[10];
2042       out[7] = _mm_sub_epi32(zero, v[2]);
2043       out[8] = v[3];
2044       out[9] = _mm_sub_epi32(zero, v[11]);
2045       out[10] = v[15];
2046       out[11] = _mm_sub_epi32(zero, v[7]);
2047       out[12] = v[5];
2048       out[13] = _mm_sub_epi32(zero, v[13]);
2049       out[14] = v[9];
2050       out[15] = _mm_sub_epi32(zero, v[1]);
2051     } else {
2052       const int log_range_out = AOMMAX(16, bd + 6);
2053       const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2054       const __m128i clamp_hi_out =
2055           _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2056 
2057       neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
2058                        &clamp_hi_out, out_shift);
2059       neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2060                        &clamp_hi_out, out_shift);
2061       neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2062                        &clamp_hi_out, out_shift);
2063       neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2064                        &clamp_hi_out, out_shift);
2065       neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2066                        &clamp_hi_out, out_shift);
2067       neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2068                        &clamp_hi_out, out_shift);
2069       neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2070                        &clamp_hi_out, out_shift);
2071       neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2072                        &clamp_hi_out, out_shift);
2073     }
2074   }
2075 }
2076 
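// 16-point inverse ADST with only the first eight inputs nonzero: each
// stage-2 output needs a single product because the paired input is zero;
// stages 3-9 are essentially identical to the full transform further below.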
2077 static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
2078                                    int do_cols, int bd, int out_shift) {
2079   const int32_t *cospi = cospi_arr(bit);
2080   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2081   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2082   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2083   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2084   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2085   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2086   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2087   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2088   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2089   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2090   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2091   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2092   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2093   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2094   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2095   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2096   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2097   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2098   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2099   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2100   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2101   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2102   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2103   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2104   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2105   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2106   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2107   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2108   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2109   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2110   __m128i u[16], x, y;
2111 
2112   // Process columns 0, 1, 2 and 3 in parallel (one column per 32-bit lane)
2113   {
2114     // stage 0
2115     // stage 1
2116     // stage 2
2117     __m128i zero = _mm_setzero_si128();
2118     x = _mm_mullo_epi32(in[0], cospi62);
2119     u[0] = _mm_add_epi32(x, rnding);
2120     u[0] = _mm_srai_epi32(u[0], bit);
2121 
2122     x = _mm_mullo_epi32(in[0], cospi2);
2123     u[1] = _mm_sub_epi32(zero, x);
2124     u[1] = _mm_add_epi32(u[1], rnding);
2125     u[1] = _mm_srai_epi32(u[1], bit);
2126 
2127     x = _mm_mullo_epi32(in[2], cospi54);
2128     u[2] = _mm_add_epi32(x, rnding);
2129     u[2] = _mm_srai_epi32(u[2], bit);
2130 
2131     x = _mm_mullo_epi32(in[2], cospi10);
2132     u[3] = _mm_sub_epi32(zero, x);
2133     u[3] = _mm_add_epi32(u[3], rnding);
2134     u[3] = _mm_srai_epi32(u[3], bit);
2135 
2136     x = _mm_mullo_epi32(in[4], cospi46);
2137     u[4] = _mm_add_epi32(x, rnding);
2138     u[4] = _mm_srai_epi32(u[4], bit);
2139 
2140     x = _mm_mullo_epi32(in[4], cospi18);
2141     u[5] = _mm_sub_epi32(zero, x);
2142     u[5] = _mm_add_epi32(u[5], rnding);
2143     u[5] = _mm_srai_epi32(u[5], bit);
2144 
2145     x = _mm_mullo_epi32(in[6], cospi38);
2146     u[6] = _mm_add_epi32(x, rnding);
2147     u[6] = _mm_srai_epi32(u[6], bit);
2148 
2149     x = _mm_mullo_epi32(in[6], cospi26);
2150     u[7] = _mm_sub_epi32(zero, x);
2151     u[7] = _mm_add_epi32(u[7], rnding);
2152     u[7] = _mm_srai_epi32(u[7], bit);
2153 
2154     u[8] = _mm_mullo_epi32(in[7], cospi34);
2155     u[8] = _mm_add_epi32(u[8], rnding);
2156     u[8] = _mm_srai_epi32(u[8], bit);
2157 
2158     u[9] = _mm_mullo_epi32(in[7], cospi30);
2159     u[9] = _mm_add_epi32(u[9], rnding);
2160     u[9] = _mm_srai_epi32(u[9], bit);
2161 
2162     u[10] = _mm_mullo_epi32(in[5], cospi42);
2163     u[10] = _mm_add_epi32(u[10], rnding);
2164     u[10] = _mm_srai_epi32(u[10], bit);
2165 
2166     u[11] = _mm_mullo_epi32(in[5], cospi22);
2167     u[11] = _mm_add_epi32(u[11], rnding);
2168     u[11] = _mm_srai_epi32(u[11], bit);
2169 
2170     u[12] = _mm_mullo_epi32(in[3], cospi50);
2171     u[12] = _mm_add_epi32(u[12], rnding);
2172     u[12] = _mm_srai_epi32(u[12], bit);
2173 
2174     u[13] = _mm_mullo_epi32(in[3], cospi14);
2175     u[13] = _mm_add_epi32(u[13], rnding);
2176     u[13] = _mm_srai_epi32(u[13], bit);
2177 
2178     u[14] = _mm_mullo_epi32(in[1], cospi58);
2179     u[14] = _mm_add_epi32(u[14], rnding);
2180     u[14] = _mm_srai_epi32(u[14], bit);
2181 
2182     u[15] = _mm_mullo_epi32(in[1], cospi6);
2183     u[15] = _mm_add_epi32(u[15], rnding);
2184     u[15] = _mm_srai_epi32(u[15], bit);
2185 
2186     // stage 3
2187     addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2188     addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2189     addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2190     addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2191     addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2192     addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2193     addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2194     addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2195 
2196     // stage 4
2197     y = _mm_mullo_epi32(u[8], cospi56);
2198     x = _mm_mullo_epi32(u[9], cospi56);
2199     u[8] = _mm_mullo_epi32(u[8], cospi8);
2200     u[8] = _mm_add_epi32(u[8], x);
2201     u[8] = _mm_add_epi32(u[8], rnding);
2202     u[8] = _mm_srai_epi32(u[8], bit);
2203 
2204     x = _mm_mullo_epi32(u[9], cospi8);
2205     u[9] = _mm_sub_epi32(y, x);
2206     u[9] = _mm_add_epi32(u[9], rnding);
2207     u[9] = _mm_srai_epi32(u[9], bit);
2208 
2209     x = _mm_mullo_epi32(u[11], cospi24);
2210     y = _mm_mullo_epi32(u[10], cospi24);
2211     u[10] = _mm_mullo_epi32(u[10], cospi40);
2212     u[10] = _mm_add_epi32(u[10], x);
2213     u[10] = _mm_add_epi32(u[10], rnding);
2214     u[10] = _mm_srai_epi32(u[10], bit);
2215 
2216     x = _mm_mullo_epi32(u[11], cospi40);
2217     u[11] = _mm_sub_epi32(y, x);
2218     u[11] = _mm_add_epi32(u[11], rnding);
2219     u[11] = _mm_srai_epi32(u[11], bit);
2220 
2221     x = _mm_mullo_epi32(u[13], cospi8);
2222     y = _mm_mullo_epi32(u[12], cospi8);
2223     u[12] = _mm_mullo_epi32(u[12], cospim56);
2224     u[12] = _mm_add_epi32(u[12], x);
2225     u[12] = _mm_add_epi32(u[12], rnding);
2226     u[12] = _mm_srai_epi32(u[12], bit);
2227 
2228     x = _mm_mullo_epi32(u[13], cospim56);
2229     u[13] = _mm_sub_epi32(y, x);
2230     u[13] = _mm_add_epi32(u[13], rnding);
2231     u[13] = _mm_srai_epi32(u[13], bit);
2232 
2233     x = _mm_mullo_epi32(u[15], cospi40);
2234     y = _mm_mullo_epi32(u[14], cospi40);
2235     u[14] = _mm_mullo_epi32(u[14], cospim24);
2236     u[14] = _mm_add_epi32(u[14], x);
2237     u[14] = _mm_add_epi32(u[14], rnding);
2238     u[14] = _mm_srai_epi32(u[14], bit);
2239 
2240     x = _mm_mullo_epi32(u[15], cospim24);
2241     u[15] = _mm_sub_epi32(y, x);
2242     u[15] = _mm_add_epi32(u[15], rnding);
2243     u[15] = _mm_srai_epi32(u[15], bit);
2244 
2245     // stage 5
2246     addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2247     addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2248     addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2249     addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2250     addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2251     addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2252     addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2253     addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2254 
2255     // stage 6
2256     x = _mm_mullo_epi32(u[5], cospi48);
2257     y = _mm_mullo_epi32(u[4], cospi48);
2258     u[4] = _mm_mullo_epi32(u[4], cospi16);
2259     u[4] = _mm_add_epi32(u[4], x);
2260     u[4] = _mm_add_epi32(u[4], rnding);
2261     u[4] = _mm_srai_epi32(u[4], bit);
2262 
2263     x = _mm_mullo_epi32(u[5], cospi16);
2264     u[5] = _mm_sub_epi32(y, x);
2265     u[5] = _mm_add_epi32(u[5], rnding);
2266     u[5] = _mm_srai_epi32(u[5], bit);
2267 
2268     x = _mm_mullo_epi32(u[7], cospi16);
2269     y = _mm_mullo_epi32(u[6], cospi16);
2270     u[6] = _mm_mullo_epi32(u[6], cospim48);
2271     u[6] = _mm_add_epi32(u[6], x);
2272     u[6] = _mm_add_epi32(u[6], rnding);
2273     u[6] = _mm_srai_epi32(u[6], bit);
2274 
2275     x = _mm_mullo_epi32(u[7], cospim48);
2276     u[7] = _mm_sub_epi32(y, x);
2277     u[7] = _mm_add_epi32(u[7], rnding);
2278     u[7] = _mm_srai_epi32(u[7], bit);
2279 
2280     x = _mm_mullo_epi32(u[13], cospi48);
2281     y = _mm_mullo_epi32(u[12], cospi48);
2282     u[12] = _mm_mullo_epi32(u[12], cospi16);
2283     u[12] = _mm_add_epi32(u[12], x);
2284     u[12] = _mm_add_epi32(u[12], rnding);
2285     u[12] = _mm_srai_epi32(u[12], bit);
2286 
2287     x = _mm_mullo_epi32(u[13], cospi16);
2288     u[13] = _mm_sub_epi32(y, x);
2289     u[13] = _mm_add_epi32(u[13], rnding);
2290     u[13] = _mm_srai_epi32(u[13], bit);
2291 
2292     x = _mm_mullo_epi32(u[15], cospi16);
2293     y = _mm_mullo_epi32(u[14], cospi16);
2294     u[14] = _mm_mullo_epi32(u[14], cospim48);
2295     u[14] = _mm_add_epi32(u[14], x);
2296     u[14] = _mm_add_epi32(u[14], rnding);
2297     u[14] = _mm_srai_epi32(u[14], bit);
2298 
2299     x = _mm_mullo_epi32(u[15], cospim48);
2300     u[15] = _mm_sub_epi32(y, x);
2301     u[15] = _mm_add_epi32(u[15], rnding);
2302     u[15] = _mm_srai_epi32(u[15], bit);
2303 
2304     // stage 7
2305     addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2306     addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2307     addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2308     addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2309     addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2310     addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2311     addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2312     addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2313 
2314     // stage 8
2315     y = _mm_mullo_epi32(u[2], cospi32);
2316     x = _mm_mullo_epi32(u[3], cospi32);
2317     u[2] = _mm_add_epi32(y, x);
2318     u[2] = _mm_add_epi32(u[2], rnding);
2319     u[2] = _mm_srai_epi32(u[2], bit);
2320 
2321     u[3] = _mm_sub_epi32(y, x);
2322     u[3] = _mm_add_epi32(u[3], rnding);
2323     u[3] = _mm_srai_epi32(u[3], bit);
2324     y = _mm_mullo_epi32(u[6], cospi32);
2325     x = _mm_mullo_epi32(u[7], cospi32);
2326     u[6] = _mm_add_epi32(y, x);
2327     u[6] = _mm_add_epi32(u[6], rnding);
2328     u[6] = _mm_srai_epi32(u[6], bit);
2329 
2330     u[7] = _mm_sub_epi32(y, x);
2331     u[7] = _mm_add_epi32(u[7], rnding);
2332     u[7] = _mm_srai_epi32(u[7], bit);
2333 
2334     y = _mm_mullo_epi32(u[10], cospi32);
2335     x = _mm_mullo_epi32(u[11], cospi32);
2336     u[10] = _mm_add_epi32(y, x);
2337     u[10] = _mm_add_epi32(u[10], rnding);
2338     u[10] = _mm_srai_epi32(u[10], bit);
2339 
2340     u[11] = _mm_sub_epi32(y, x);
2341     u[11] = _mm_add_epi32(u[11], rnding);
2342     u[11] = _mm_srai_epi32(u[11], bit);
2343 
2344     y = _mm_mullo_epi32(u[14], cospi32);
2345     x = _mm_mullo_epi32(u[15], cospi32);
2346     u[14] = _mm_add_epi32(y, x);
2347     u[14] = _mm_add_epi32(u[14], rnding);
2348     u[14] = _mm_srai_epi32(u[14], bit);
2349 
2350     u[15] = _mm_sub_epi32(y, x);
2351     u[15] = _mm_add_epi32(u[15], rnding);
2352     u[15] = _mm_srai_epi32(u[15], bit);
2353 
2354     // stage 9
2355     if (do_cols) {
2356       out[0] = u[0];
2357       out[1] = _mm_sub_epi32(zero, u[8]);
2358       out[2] = u[12];
2359       out[3] = _mm_sub_epi32(zero, u[4]);
2360       out[4] = u[6];
2361       out[5] = _mm_sub_epi32(zero, u[14]);
2362       out[6] = u[10];
2363       out[7] = _mm_sub_epi32(zero, u[2]);
2364       out[8] = u[3];
2365       out[9] = _mm_sub_epi32(zero, u[11]);
2366       out[10] = u[15];
2367       out[11] = _mm_sub_epi32(zero, u[7]);
2368       out[12] = u[5];
2369       out[13] = _mm_sub_epi32(zero, u[13]);
2370       out[14] = u[9];
2371       out[15] = _mm_sub_epi32(zero, u[1]);
2372     } else {
2373       const int log_range_out = AOMMAX(16, bd + 6);
2374       const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2375       const __m128i clamp_hi_out =
2376           _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2377 
2378       neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
2379                        &clamp_hi_out, out_shift);
2380       neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2381                        &clamp_hi_out, out_shift);
2382       neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2383                        &clamp_hi_out, out_shift);
2384       neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2385                        &clamp_hi_out, out_shift);
2386       neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2387                        &clamp_hi_out, out_shift);
2388       neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2389                        &clamp_hi_out, out_shift);
2390       neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2391                        &clamp_hi_out, out_shift);
2392       neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2393                        &clamp_hi_out, out_shift);
2394     }
2395   }
2396 }
2397 
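// Full 16-point inverse DCT. half_btf_sse4_1() computes the two-input half
// butterfly ((a * c0 + b * c1 + rnding) >> bit); the stages ping-pong between
// the u[] and v[] arrays, with addsub_sse4_1() clamping intermediates to the
// [-2^(log_range-1), 2^(log_range-1) - 1] range.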
2398 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2399                              int bd, int out_shift) {
2400   const int32_t *cospi = cospi_arr(bit);
2401   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2402   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2403   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2404   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2405   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2406   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2407   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2408   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2409   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2410   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2411   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2412   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2413   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2414   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2415   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2416   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2417   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2418   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2419   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2420   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2421   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2422   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2423   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2424   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2425   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2426   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2427   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2428   __m128i u[16], v[16], x, y;
2429 
2430   {
2431     // stage 0
2432     // stage 1
2433     u[0] = in[0];
2434     u[1] = in[8];
2435     u[2] = in[4];
2436     u[3] = in[12];
2437     u[4] = in[2];
2438     u[5] = in[10];
2439     u[6] = in[6];
2440     u[7] = in[14];
2441     u[8] = in[1];
2442     u[9] = in[9];
2443     u[10] = in[5];
2444     u[11] = in[13];
2445     u[12] = in[3];
2446     u[13] = in[11];
2447     u[14] = in[7];
2448     u[15] = in[15];
2449 
2450     // stage 2
2451     v[0] = u[0];
2452     v[1] = u[1];
2453     v[2] = u[2];
2454     v[3] = u[3];
2455     v[4] = u[4];
2456     v[5] = u[5];
2457     v[6] = u[6];
2458     v[7] = u[7];
2459 
2460     v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
2461     v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
2462     v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
2463     v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
2464     v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
2465     v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
2466     v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
2467     v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
2468 
2469     // stage 3
2470     u[0] = v[0];
2471     u[1] = v[1];
2472     u[2] = v[2];
2473     u[3] = v[3];
2474     u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
2475     u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
2476     u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
2477     u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
2478     addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2479     addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2480     addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2481     addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2482 
2483     // stage 4
2484     x = _mm_mullo_epi32(u[0], cospi32);
2485     y = _mm_mullo_epi32(u[1], cospi32);
2486     v[0] = _mm_add_epi32(x, y);
2487     v[0] = _mm_add_epi32(v[0], rnding);
2488     v[0] = _mm_srai_epi32(v[0], bit);
2489 
2490     v[1] = _mm_sub_epi32(x, y);
2491     v[1] = _mm_add_epi32(v[1], rnding);
2492     v[1] = _mm_srai_epi32(v[1], bit);
2493 
2494     v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
2495     v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
2496     addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2497     addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2498     v[8] = u[8];
2499     v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2500     v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2501     v[11] = u[11];
2502     v[12] = u[12];
2503     v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2504     v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2505     v[15] = u[15];
2506 
2507     // stage 5
2508     addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2509     addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2510     u[4] = v[4];
2511 
2512     x = _mm_mullo_epi32(v[5], cospi32);
2513     y = _mm_mullo_epi32(v[6], cospi32);
2514     u[5] = _mm_sub_epi32(y, x);
2515     u[5] = _mm_add_epi32(u[5], rnding);
2516     u[5] = _mm_srai_epi32(u[5], bit);
2517 
2518     u[6] = _mm_add_epi32(y, x);
2519     u[6] = _mm_add_epi32(u[6], rnding);
2520     u[6] = _mm_srai_epi32(u[6], bit);
2521 
2522     u[7] = v[7];
2523     addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2524     addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2525     addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2526     addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2527 
2528     // stage 6
2529     addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2530     addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2531     addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2532     addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2533     v[8] = u[8];
2534     v[9] = u[9];
2535 
2536     x = _mm_mullo_epi32(u[10], cospi32);
2537     y = _mm_mullo_epi32(u[13], cospi32);
2538     v[10] = _mm_sub_epi32(y, x);
2539     v[10] = _mm_add_epi32(v[10], rnding);
2540     v[10] = _mm_srai_epi32(v[10], bit);
2541 
2542     v[13] = _mm_add_epi32(x, y);
2543     v[13] = _mm_add_epi32(v[13], rnding);
2544     v[13] = _mm_srai_epi32(v[13], bit);
2545 
2546     x = _mm_mullo_epi32(u[11], cospi32);
2547     y = _mm_mullo_epi32(u[12], cospi32);
2548     v[11] = _mm_sub_epi32(y, x);
2549     v[11] = _mm_add_epi32(v[11], rnding);
2550     v[11] = _mm_srai_epi32(v[11], bit);
2551 
2552     v[12] = _mm_add_epi32(x, y);
2553     v[12] = _mm_add_epi32(v[12], rnding);
2554     v[12] = _mm_srai_epi32(v[12], bit);
2555 
2556     v[14] = u[14];
2557     v[15] = u[15];
2558 
2559     // stage 7
2560     if (do_cols) {
2561       addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
2562       addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
2563       addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
2564       addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
2565       addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
2566       addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
2567       addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
2568       addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
2569     } else {
2570       const int log_range_out = AOMMAX(16, bd + 6);
2571       const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2572           -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2573       const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2574           (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2575 
2576       addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
2577                           &clamp_hi_out, out_shift);
2578       addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
2579                           &clamp_hi_out, out_shift);
2580       addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
2581                           &clamp_hi_out, out_shift);
2582       addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
2583                           &clamp_hi_out, out_shift);
2584       addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
2585                           &clamp_hi_out, out_shift);
2586       addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
2587                           &clamp_hi_out, out_shift);
2588       addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
2589                           &clamp_hi_out, out_shift);
2590       addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
2591                           &clamp_hi_out, out_shift);
2592     }
2593   }
2594 }
2595 
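// Full 16-point inverse ADST, structured like iadst16x16_low8_sse4_1() above
// but with every stage-2 rotation computed from both members of each input
// pair.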
2596 static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2597                               int bd, int out_shift) {
2598   const int32_t *cospi = cospi_arr(bit);
2599   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2600   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2601   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2602   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2603   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2604   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2605   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2606   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2607   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2608   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2609   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2610   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2611   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2612   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2613   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2614   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2615   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2616   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2617   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2618   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2619   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2620   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2621   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2622   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2623   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2624   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2625   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
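  // Intermediate values are kept within a signed max(16, bd + 6)-bit range
  // on the final column pass and max(16, bd + 8) bits on the row pass,
  // matching the range assumptions used throughout this file.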
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], v[16], x, y;

  // Calculate the column 0, 1, 2, 3
  {
    // stage 0
    // stage 1
    // stage 2
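    // Each pair below is a fixed-point rotation:
    //   v[2k]     = (a * cospi[j] + b * cospi[64 - j] + (1 << (bit - 1))) >> bit
    //   v[2k + 1] = (a * cospi[64 - j] - b * cospi[j] + (1 << (bit - 1))) >> bit
    // with cospi[i] = round(cos(i * PI / 128) * (1 << bit)) (see cospi_arr),
    // so cospi[64 - j] plays the role of sin(j * PI / 128).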
    v[0] = _mm_mullo_epi32(in[15], cospi2);
    x = _mm_mullo_epi32(in[0], cospi62);
    v[0] = _mm_add_epi32(v[0], x);
    v[0] = _mm_add_epi32(v[0], rnding);
    v[0] = _mm_srai_epi32(v[0], bit);

    v[1] = _mm_mullo_epi32(in[15], cospi62);
    x = _mm_mullo_epi32(in[0], cospi2);
    v[1] = _mm_sub_epi32(v[1], x);
    v[1] = _mm_add_epi32(v[1], rnding);
    v[1] = _mm_srai_epi32(v[1], bit);

    v[2] = _mm_mullo_epi32(in[13], cospi10);
    x = _mm_mullo_epi32(in[2], cospi54);
    v[2] = _mm_add_epi32(v[2], x);
    v[2] = _mm_add_epi32(v[2], rnding);
    v[2] = _mm_srai_epi32(v[2], bit);

    v[3] = _mm_mullo_epi32(in[13], cospi54);
    x = _mm_mullo_epi32(in[2], cospi10);
    v[3] = _mm_sub_epi32(v[3], x);
    v[3] = _mm_add_epi32(v[3], rnding);
    v[3] = _mm_srai_epi32(v[3], bit);

    v[4] = _mm_mullo_epi32(in[11], cospi18);
    x = _mm_mullo_epi32(in[4], cospi46);
    v[4] = _mm_add_epi32(v[4], x);
    v[4] = _mm_add_epi32(v[4], rnding);
    v[4] = _mm_srai_epi32(v[4], bit);

    v[5] = _mm_mullo_epi32(in[11], cospi46);
    x = _mm_mullo_epi32(in[4], cospi18);
    v[5] = _mm_sub_epi32(v[5], x);
    v[5] = _mm_add_epi32(v[5], rnding);
    v[5] = _mm_srai_epi32(v[5], bit);

    v[6] = _mm_mullo_epi32(in[9], cospi26);
    x = _mm_mullo_epi32(in[6], cospi38);
    v[6] = _mm_add_epi32(v[6], x);
    v[6] = _mm_add_epi32(v[6], rnding);
    v[6] = _mm_srai_epi32(v[6], bit);

    v[7] = _mm_mullo_epi32(in[9], cospi38);
    x = _mm_mullo_epi32(in[6], cospi26);
    v[7] = _mm_sub_epi32(v[7], x);
    v[7] = _mm_add_epi32(v[7], rnding);
    v[7] = _mm_srai_epi32(v[7], bit);

    v[8] = _mm_mullo_epi32(in[7], cospi34);
    x = _mm_mullo_epi32(in[8], cospi30);
    v[8] = _mm_add_epi32(v[8], x);
    v[8] = _mm_add_epi32(v[8], rnding);
    v[8] = _mm_srai_epi32(v[8], bit);

    v[9] = _mm_mullo_epi32(in[7], cospi30);
    x = _mm_mullo_epi32(in[8], cospi34);
    v[9] = _mm_sub_epi32(v[9], x);
    v[9] = _mm_add_epi32(v[9], rnding);
    v[9] = _mm_srai_epi32(v[9], bit);

    v[10] = _mm_mullo_epi32(in[5], cospi42);
    x = _mm_mullo_epi32(in[10], cospi22);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[11] = _mm_mullo_epi32(in[5], cospi22);
    x = _mm_mullo_epi32(in[10], cospi42);
    v[11] = _mm_sub_epi32(v[11], x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_mullo_epi32(in[3], cospi50);
    x = _mm_mullo_epi32(in[12], cospi14);
    v[12] = _mm_add_epi32(v[12], x);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    v[13] = _mm_mullo_epi32(in[3], cospi14);
    x = _mm_mullo_epi32(in[12], cospi50);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[14] = _mm_mullo_epi32(in[1], cospi58);
    x = _mm_mullo_epi32(in[14], cospi6);
    v[14] = _mm_add_epi32(v[14], x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_mullo_epi32(in[1], cospi6);
    x = _mm_mullo_epi32(in[14], cospi58);
    v[15] = _mm_sub_epi32(v[15], x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    // stage 3
    addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

    // stage 4
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];
    v[4] = u[4];
    v[5] = u[5];
    v[6] = u[6];
    v[7] = u[7];

    v[8] = _mm_mullo_epi32(u[8], cospi8);
    x = _mm_mullo_epi32(u[9], cospi56);
    v[8] = _mm_add_epi32(v[8], x);
    v[8] = _mm_add_epi32(v[8], rnding);
    v[8] = _mm_srai_epi32(v[8], bit);

    v[9] = _mm_mullo_epi32(u[8], cospi56);
    x = _mm_mullo_epi32(u[9], cospi8);
    v[9] = _mm_sub_epi32(v[9], x);
    v[9] = _mm_add_epi32(v[9], rnding);
    v[9] = _mm_srai_epi32(v[9], bit);

    v[10] = _mm_mullo_epi32(u[10], cospi40);
    x = _mm_mullo_epi32(u[11], cospi24);
    v[10] = _mm_add_epi32(v[10], x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[11] = _mm_mullo_epi32(u[10], cospi24);
    x = _mm_mullo_epi32(u[11], cospi40);
    v[11] = _mm_sub_epi32(v[11], x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_mullo_epi32(u[12], cospim56);
    x = _mm_mullo_epi32(u[13], cospi8);
    v[12] = _mm_add_epi32(v[12], x);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    v[13] = _mm_mullo_epi32(u[12], cospi8);
    x = _mm_mullo_epi32(u[13], cospim56);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[14] = _mm_mullo_epi32(u[14], cospim24);
    x = _mm_mullo_epi32(u[15], cospi40);
    v[14] = _mm_add_epi32(v[14], x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_mullo_epi32(u[14], cospi40);
    x = _mm_mullo_epi32(u[15], cospim24);
    v[15] = _mm_sub_epi32(v[15], x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    // stage 5
    addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

    // stage 6
    v[0] = u[0];
    v[1] = u[1];
    v[2] = u[2];
    v[3] = u[3];

    v[4] = _mm_mullo_epi32(u[4], cospi16);
    x = _mm_mullo_epi32(u[5], cospi48);
    v[4] = _mm_add_epi32(v[4], x);
    v[4] = _mm_add_epi32(v[4], rnding);
    v[4] = _mm_srai_epi32(v[4], bit);

    v[5] = _mm_mullo_epi32(u[4], cospi48);
    x = _mm_mullo_epi32(u[5], cospi16);
    v[5] = _mm_sub_epi32(v[5], x);
    v[5] = _mm_add_epi32(v[5], rnding);
    v[5] = _mm_srai_epi32(v[5], bit);

    v[6] = _mm_mullo_epi32(u[6], cospim48);
    x = _mm_mullo_epi32(u[7], cospi16);
    v[6] = _mm_add_epi32(v[6], x);
    v[6] = _mm_add_epi32(v[6], rnding);
    v[6] = _mm_srai_epi32(v[6], bit);

    v[7] = _mm_mullo_epi32(u[6], cospi16);
    x = _mm_mullo_epi32(u[7], cospim48);
    v[7] = _mm_sub_epi32(v[7], x);
    v[7] = _mm_add_epi32(v[7], rnding);
    v[7] = _mm_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];
    v[10] = u[10];
    v[11] = u[11];

    v[12] = _mm_mullo_epi32(u[12], cospi16);
    x = _mm_mullo_epi32(u[13], cospi48);
    v[12] = _mm_add_epi32(v[12], x);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    v[13] = _mm_mullo_epi32(u[12], cospi48);
    x = _mm_mullo_epi32(u[13], cospi16);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[14] = _mm_mullo_epi32(u[14], cospim48);
    x = _mm_mullo_epi32(u[15], cospi16);
    v[14] = _mm_add_epi32(v[14], x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_mullo_epi32(u[14], cospi16);
    x = _mm_mullo_epi32(u[15], cospim48);
    v[15] = _mm_sub_epi32(v[15], x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    // stage 7
    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

    // stage 8
    v[0] = u[0];
    v[1] = u[1];

    y = _mm_mullo_epi32(u[2], cospi32);
    x = _mm_mullo_epi32(u[3], cospi32);
    v[2] = _mm_add_epi32(y, x);
    v[2] = _mm_add_epi32(v[2], rnding);
    v[2] = _mm_srai_epi32(v[2], bit);

    v[3] = _mm_sub_epi32(y, x);
    v[3] = _mm_add_epi32(v[3], rnding);
    v[3] = _mm_srai_epi32(v[3], bit);

    v[4] = u[4];
    v[5] = u[5];

    y = _mm_mullo_epi32(u[6], cospi32);
    x = _mm_mullo_epi32(u[7], cospi32);
    v[6] = _mm_add_epi32(y, x);
    v[6] = _mm_add_epi32(v[6], rnding);
    v[6] = _mm_srai_epi32(v[6], bit);

    v[7] = _mm_sub_epi32(y, x);
    v[7] = _mm_add_epi32(v[7], rnding);
    v[7] = _mm_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];

    y = _mm_mullo_epi32(u[10], cospi32);
    x = _mm_mullo_epi32(u[11], cospi32);
    v[10] = _mm_add_epi32(y, x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[11] = _mm_sub_epi32(y, x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = u[12];
    v[13] = u[13];

    y = _mm_mullo_epi32(u[14], cospi32);
    x = _mm_mullo_epi32(u[15], cospi32);
    v[14] = _mm_add_epi32(y, x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_sub_epi32(y, x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    // stage 9
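    // The final iadst16 permutation: even-indexed outputs are copied, odd-
    // indexed outputs are negated (and, on the row pass, rounded, shifted and
    // clamped by neg_shift_sse4_1).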
    if (do_cols) {
      out[0] = v[0];
      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
      out[2] = v[12];
      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
      out[4] = v[6];
      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
      out[6] = v[10];
      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
      out[8] = v[3];
      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
      out[10] = v[15];
      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
      out[12] = v[5];
      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
      out[14] = v[9];
      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
    }
  }
}

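// Stage 8 of the 64-point inverse DCT, shared by the idct64 paths below. The
// XOR indexing pairs butterflies symmetrically: for i in [16, 20), i ^ 7
// walks 23..20 while (i ^ 15, i ^ 8) walk 31..28 and 24..27.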
static INLINE void idct64_stage8_sse4_1(
    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
  u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
  u[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
  u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
  u[11] = temp2;

  for (i = 16; i < 20; ++i) {
    addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
                  clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
  temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
  temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
  temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
  u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
  u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
  u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
  u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}

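// Stage 9: u[0..15] are folded with their mirrors, u[20..27] are rotated by
// cospi[32], and the u[32..63] quarter is folded with the same XOR pairing
// trick as stage 8 (i ^ 15 mirrors within a 16-element group).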
static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  for (i = 0; i < 8; ++i) {
    addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
  u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
  u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
  u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
  u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  for (i = 32; i < 40; i++) {
    addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  for (i = 48; i < 56; i++) {
    addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}

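// Stage 10: the first 32 lanes are folded pairwise, and u[40..55] are rotated
// by cospi[32] (cos(PI/4) in fixed point): each half_btf pair below computes
// (b - a) / sqrt(2) and (a + b) / sqrt(2).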
static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
                                         const __m128i *cospi32,
                                         const __m128i *clamp_lo,
                                         const __m128i *clamp_hi,
                                         const __m128i *rnding, int bit) {
  __m128i temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
  u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
  u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
  u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
  u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}

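// Stage 11: the final butterfly producing out[i] / out[63 - i]. On the column
// pass the results are left unclamped; on the row pass they are rounded,
// shifted by out_shift and clamped to the output range.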
static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
                                         int bd, int out_shift,
                                         const int log_range) {
  if (do_cols) {
    for (int i = 0; i < 32; i++) {
      addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
    }
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    for (int i = 0; i < 32; i++) {
      addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

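// 64x64 inverse DCT when only the DC coefficient is nonzero: every stage
// reduces to scaling in[0] by cospi[32], so all 64 outputs share one value.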
static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);

  {
    __m128i x;

    // stage 1
    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6
    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);

    // stage 7
    // stage 8
    // stage 9
    // stage 10
    // stage 11
    if (do_cols) {
      x = _mm_max_epi32(x, clamp_lo);
      x = _mm_min_epi32(x, clamp_hi);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

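      // Round-to-nearest down-shift, i.e. ROUND_POWER_OF_TWO(x, out_shift)
      // applied per 32-bit lane.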
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      x = _mm_add_epi32(x, offset);
      x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));

      x = _mm_max_epi32(x, clamp_lo_out);
      x = _mm_min_epi32(x, clamp_hi_out);
    }

    for (int i = 0; i < 32; ++i) {
      out[i] = x;
      out[63 - i] = x;
    }
  }
}

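// 64x64 inverse DCT when only the top-left 8x8 coefficients are nonzero:
// stage 1 seeds just eight inputs, and since the other lanes are implicitly
// zero, most early-stage butterflies collapse to copies.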
static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);

  {
    __m128i u[64];

    // stage 1
    u[0] = in[0];
    u[8] = in[4];
    u[16] = in[2];
    u[24] = in[6];
    u[32] = in[1];
    u[40] = in[5];
    u[48] = in[3];
    u[56] = in[7];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[38] = u[39];
    u[41] = u[40];
    u[46] = u[47];
    u[49] = u[48];
    u[54] = u[55];
    u[57] = u[56];
    u[62] = u[63];

    // stage 4
    __m128i temp1, temp2;
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[17] = u[16];
    u[22] = u[23];
    u[25] = u[24];
    u[30] = u[31];

    temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = temp1;

    temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = temp2;

    temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = temp1;

    temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[46] = temp2;

    // stage 5
    u[9] = u[8];
    u[14] = u[15];

    temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = temp1;

    temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[22] = temp2;

    u[35] = u[32];
    u[34] = u[33];
    u[36] = u[39];
    u[37] = u[38];
    u[43] = u[40];
    u[42] = u[41];
    u[44] = u[47];
    u[45] = u[46];
    u[51] = u[48];
    u[50] = u[49];
    u[52] = u[55];
    u[53] = u[54];
    u[59] = u[56];
    u[58] = u[57];
    u[60] = u[63];
    u[61] = u[62];

    // stage 6
    temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = temp1;

    temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = temp2;
    u[19] = u[16];
    u[18] = u[17];
    u[20] = u[23];
    u[21] = u[22];
    u[27] = u[24];
    u[26] = u[25];
    u[28] = u[31];
    u[29] = u[30];

    temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = temp1;
    temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[35] = temp2;
    temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[36] = temp1;
    temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[37] = temp2;
    temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = temp1;
    temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[43] = temp2;
    temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[44] = temp1;
    temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[45] = temp2;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    u[11] = u[8];
    u[10] = u[9];
    u[12] = u[15];
    u[13] = u[14];

    temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = temp1;
    temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[19] = temp2;
    temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[20] = temp1;
    temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[21] = temp2;
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    u[7] = u[0];
    u[6] = u[1];
    u[5] = u[2];
    u[4] = u[3];
    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
  }
}

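// 64x64 inverse DCT when only the top-left 16x16 coefficients are nonzero:
// sixteen inputs are seeded in stage 1 and the zero lanes again turn the
// early butterflies into plain copies.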
static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64];
    __m128i tmp1, tmp2, tmp3, tmp4;
    // stage 1
    u[0] = in[0];
    u[32] = in[1];
    u[36] = in[9];
    u[40] = in[5];
    u[44] = in[13];
    u[48] = in[3];
    u[52] = in[11];
    u[56] = in[7];
    u[60] = in[15];
    u[16] = in[2];
    u[20] = in[10];
    u[24] = in[6];
    u[28] = in[14];
    u[4] = in[8];
    u[8] = in[4];
    u[12] = in[12];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[34] = u[35];
    u[37] = u[36];
    u[38] = u[39];
    u[41] = u[40];
    u[42] = u[43];
    u[45] = u[44];
    u[46] = u[47];
    u[49] = u[48];
    u[50] = u[51];
    u[53] = u[52];
    u[54] = u[55];
    u[57] = u[56];
    u[58] = u[59];
    u[61] = u[60];
    u[62] = u[63];

    // stage 4
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

    u[17] = u[16];
    u[18] = u[19];
    u[21] = u[20];
    u[22] = u[23];
    u[25] = u[24];
    u[26] = u[27];
    u[29] = u[28];
    u[30] = u[31];

    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = tmp1;
    u[34] = tmp2;
    u[37] = tmp3;
    u[38] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = tmp1;
    u[42] = tmp2;
    u[45] = tmp3;
    u[46] = tmp4;

    // stage 5
    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = tmp1;
    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[5] = tmp1;
    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
  }
}

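// Full 64x64 inverse DCT over four columns per call. Stage 1 seeds the 32
// lowest-frequency inputs (AV1 keeps at most the top-left 32x32 region of
// nonzero coefficients for 64-point transforms); later stages follow the
// standard half-length decomposition.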
idct64x64_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)3765 static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3766                              int bd, int out_shift) {
3767   int i, j;
3768   const int32_t *cospi = cospi_arr(bit);
3769   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3770   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3771   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3772   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3773 
3774   const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3775   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3776   const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3777   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3778   const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3779   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3780   const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3781   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3782   const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3783   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3784   const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3785   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3786   const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3787   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3788   const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3789   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3790   const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
3791   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
3792   const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
3793   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3794   const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
3795   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
3796   const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
3797   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3798   const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
3799   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
3800   const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
3801   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3802   const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
3803   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
3804   const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
3805   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3806   const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
3807   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3808   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
3809   const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
3810   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3811   const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
3812   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3813   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
3814   const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
3815   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3816   const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3817   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3818   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3819   const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3820   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3821   const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3822   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3823   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3824   const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3825 
3826   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3827   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3828   const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3829   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3830   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3831   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3832   const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3833   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3834   const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
3835   const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
3836   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3837   const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
3838   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3839   const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
3840   const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
3841   const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
3842   const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
3843   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3844   const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
3845   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
3846   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3847   const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
3848   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3849   const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3850   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3851   const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
3852   const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3853 
3854   {
3855     __m128i u[64], v[64];
3856 
3857     // stage 1
3858     u[32] = in[1];
3859     u[34] = in[17];
3860     u[36] = in[9];
3861     u[38] = in[25];
3862     u[40] = in[5];
3863     u[42] = in[21];
3864     u[44] = in[13];
3865     u[46] = in[29];
3866     u[48] = in[3];
3867     u[50] = in[19];
3868     u[52] = in[11];
3869     u[54] = in[27];
3870     u[56] = in[7];
3871     u[58] = in[23];
3872     u[60] = in[15];
3873     u[62] = in[31];
3874 
3875     v[16] = in[2];
3876     v[18] = in[18];
3877     v[20] = in[10];
3878     v[22] = in[26];
3879     v[24] = in[6];
3880     v[26] = in[22];
3881     v[28] = in[14];
3882     v[30] = in[30];
3883 
3884     u[8] = in[4];
3885     u[10] = in[20];
3886     u[12] = in[12];
3887     u[14] = in[28];
3888 
3889     v[4] = in[8];
3890     v[6] = in[24];
3891 
3892     u[0] = in[0];
3893     u[2] = in[16];
3894 
    // stage 2
    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);

    // stage 3
    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
    u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
    u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
    u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
    u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
    u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
    u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
    u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);

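    // addsub_sse4_1(a, b, &s, &d) is assumed to produce the clamped
    // butterfly pair s = clamp(a + b), d = clamp(a - b) on 32-bit lanes;
    // the loops below walk the tree in groups of four.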
    for (i = 32; i < 64; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    // stage 4
    v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
    v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
    v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);

    for (i = 16; i < 32; i += 4) {
      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);

    // stage 5
    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);

    for (i = 8; i < 16; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 16; i < 32; i += 4) {
      u[i + 0] = v[i + 0];
      u[i + 3] = v[i + 3];
    }

    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);

    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);

    for (i = 8; i < 16; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 8) {
      v[i + 0] = u[i + 0];
      v[i + 1] = u[i + 1];
      v[i + 6] = u[i + 6];
      v[i + 7] = u[i + 7];
    }

    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);

    // stage 7
    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    u[4] = v[4];
    u[7] = v[7];
    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);

    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    for (i = 16; i < 32; i += 8) {
      u[i + 0] = v[i + 0];
      u[i + 1] = v[i + 1];
      u[i + 6] = v[i + 6];
      u[i + 7] = v[i + 7];
    }

    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);

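    // XOR index trick: for j in [i, i + 4), j ^ 7 pairs j with its mirror
    // inside the group of eight, while (j ^ 15, j ^ 8) do the same for the
    // upper half of the group of sixteen, so each pass emits all four
    // butterflies of the group.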
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
    }

    v[8] = u[8];
    v[9] = u[9];
    v[14] = u[14];
    v[15] = u[15];

    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);

    for (i = 16; i < 20; ++i) {
      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 36; ++i) {
      v[i] = u[i];
      v[i + 12] = u[i + 12];
      v[i + 16] = u[i + 16];
      v[i + 28] = u[i + 28];
    }

    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);

    // stage 9
    for (i = 0; i < 8; ++i) {
      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 16; i < 20; ++i) {
      u[i] = v[i];
      u[i + 12] = v[i + 12];
    }

    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);

    for (i = 32; i < 40; i++) {
      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    }

    for (i = 48; i < 56; i++) {
      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    }

    // stage 10
    for (i = 0; i < 16; i++) {
      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 32; i < 40; i++) v[i] = u[i];

    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

    // stage 11
    if (do_cols) {
      for (i = 0; i < 32; i++) {
        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
      }
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

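      // Row pass: round each sum/difference, shift right by out_shift and
      // clamp to the widened output range computed above (assumed to be
      // what addsub_shift_sse4_1 implements).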
      for (i = 0; i < 32; i++) {
        addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
                            &clamp_lo_out, &clamp_hi_out, out_shift);
      }
    }
  }
}

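/*
 * 32x32 inverse DCT kernels. A pruned variant is selected from eob via the
 * function-pointer table at the end of this file: low1 handles a DC-only
 * block, low8/low16 assume only the first 8/16 coefficients in each
 * dimension can be nonzero, and idct32x32_sse4_1 covers the general case.
 */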
static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
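  // Stage dynamic range: intermediates fit in bd + 6 bits in the column
  // pass and bd + 8 bits in the row pass; clamp to at least 16-bit range.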
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1;

  // stage 0
  // stage 1
  bf1 = in[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  if (do_cols) {
    bf1 = _mm_max_epi32(bf1, clamp_lo);
    bf1 = _mm_min_epi32(bf1, clamp_hi);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    // Round-shift: add (1 << out_shift) / 2, then arithmetic shift right.
    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    bf1 = _mm_add_epi32(bf1, offset);
    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
    bf1 = _mm_max_epi32(bf1, clamp_lo_out);
    bf1 = _mm_min_epi32(bf1, clamp_hi_out);
  }
  // Broadcast: a DC-only block reconstructs to a constant plane.
  for (int i = 0; i < 32; ++i) out[i] = bf1;
}

static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[4] = in[4];
  bf1[8] = in[2];
  bf1[12] = in[6];
  bf1[16] = in[1];
  bf1[20] = in[5];
  bf1[24] = in[3];
  bf1[28] = in[7];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);

  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
  bf1[17] = bf1[16];
  bf1[18] = bf1[19];
  bf1[21] = bf1[20];
  bf1[22] = bf1[23];
  bf1[25] = bf1[24];
  bf1[26] = bf1[27];
  bf1[29] = bf1[28];
  bf1[30] = bf1[31];

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);

  bf1[9] = bf1[8];
  bf1[10] = bf1[11];
  bf1[13] = bf1[12];
  bf1[14] = bf1[15];

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[5] = bf1[4];
  bf1[6] = bf1[7];

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  bf1[3] = bf1[0];
  bf1[2] = bf1[1];

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
}

static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1

  bf1[0] = in[0];
  bf1[2] = in[8];
  bf1[4] = in[4];
  bf1[6] = in[12];
  bf1[8] = in[2];
  bf1[10] = in[10];
  bf1[12] = in[6];
  bf1[14] = in[14];
  bf1[16] = in[1];
  bf1[18] = in[9];
  bf1[20] = in[5];
  bf1[22] = in[13];
  bf1[24] = in[3];
  bf1[26] = in[11];
  bf1[28] = in[7];
  bf1[30] = in[15];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);

  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);

  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
}

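/*
 * General 32-point kernel: the full nine-stage dataflow, ping-ponging
 * between bf1[] and bf0[], with every add/sub result clamped to the stage
 * range computed below.
 */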
static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32], bf0[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[1] = in[16];
  bf1[2] = in[8];
  bf1[3] = in[24];
  bf1[4] = in[4];
  bf1[5] = in[20];
  bf1[6] = in[12];
  bf1[7] = in[28];
  bf1[8] = in[2];
  bf1[9] = in[18];
  bf1[10] = in[10];
  bf1[11] = in[26];
  bf1[12] = in[6];
  bf1[13] = in[22];
  bf1[14] = in[14];
  bf1[15] = in[30];
  bf1[16] = in[1];
  bf1[17] = in[17];
  bf1[18] = in[9];
  bf1[19] = in[25];
  bf1[20] = in[5];
  bf1[21] = in[21];
  bf1[22] = in[13];
  bf1[23] = in[29];
  bf1[24] = in[3];
  bf1[25] = in[19];
  bf1[26] = in[11];
  bf1[27] = in[27];
  bf1[28] = in[7];
  bf1[29] = in[23];
  bf1[30] = in[15];
  bf1[31] = in[31];

  // stage 2
  for (int i = 0; i < 16; ++i) bf0[i] = bf1[i];
  bf0[16] =
      half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
  bf0[17] =
      half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
  bf0[18] =
      half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
  bf0[30] =
      half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
  bf0[31] =
      half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);

  // stage 3
  for (int i = 0; i < 8; ++i) bf1[i] = bf0[i];
  bf1[8] =
      half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
  bf1[9] =
      half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
  bf1[15] =
      half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);

  addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf0[0] = bf1[0];
  bf0[1] = bf1[1];
  bf0[2] = bf1[2];
  bf0[3] = bf1[3];
  bf0[4] =
      half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
  bf0[5] =
      half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
  bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);

  bf0[16] = bf1[16];
  bf0[17] =
      half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
  bf0[18] =
      half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
  bf0[19] = bf1[19];
  bf0[20] = bf1[20];
  bf0[21] =
      half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] =
      half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
  bf0[27] = bf1[27];
  bf0[28] = bf1[28];
  bf0[29] =
      half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
  bf0[30] =
      half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
  bf0[31] = bf1[31];

  // stage 5
  bf1[0] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
  bf1[1] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
  bf1[2] =
      half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
  bf1[3] =
      half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
  addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] =
      half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] =
      half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
  bf0[4] = bf1[4];
  bf0[5] =
      half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[7] = bf1[7];
  addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] =
      half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
  bf0[22] = bf1[22];
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] = bf1[25];
  bf0[26] =
      half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 7
  addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] =
      half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);

  // stage 8
  addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] = bf1[18];
  bf0[19] = bf1[19];
  bf0[20] =
      half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[28] = bf1[28];
  bf0[29] = bf1[29];
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 9
  if (do_cols) {
    for (int i = 0; i < 16; ++i) {
      addsub_no_clamp_sse4_1(bf0[i], bf0[31 - i], out + i, out + 31 - i);
    }
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    for (int i = 0; i < 16; ++i) {
      addsub_shift_sse4_1(bf0[i], bf0[31 - i], out + i, out + 31 - i,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

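/*
 * External entry points. In high-bitdepth builds dest is a wrapped 16-bit
 * buffer; CONVERT_TO_SHORTPTR() recovers the uint16_t pixel pointer.
 * Transform types without an SSE4.1 kernel fall back to the C paths.
 */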
void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
      // Assembly version doesn't support some transform types, so use C version
      // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                               bd);
      break;
    default:
      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
      // Assembly version doesn't support some transform types, so use C version
      // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
      // Assembly version doesn't support some transform types, so use C version
      // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
                                          uint8_t *dest, int stride,
                                          const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
      // Assembly version doesn't support some transform types, so use C version
      // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 tx_type, bd);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
                                          uint8_t *dest, int stride,
                                          const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  // DCT_DCT and IDTX are the only transform types AV1 allows at 32x32, so
  // the switch below is exhaustive.
  switch (tx_type) {
    case DCT_DCT:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
      // Assembly version doesn't support IDTX, so use C version for it.
    case IDTX:
      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 tx_type, bd);
      break;
    default: assert(0);
  }
}

void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  int eob = txfm_param->eob;
  int bd = txfm_param->bd;
  int lossless = txfm_param->lossless;
  const int32_t *src = cast_to_int32(input);
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (lossless) {
    // Lossless coding uses the 4x4 inverse Walsh-Hadamard transform.
    assert(tx_type == DCT_DCT);
    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
    return;
  }
  switch (tx_type) {
      // Assembly version doesn't support some transform types, so use C version
      // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                               bd);
      break;
    default:
      av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}

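// 1-D inverse transform kernels, indexed as [tx size][1-D type][variant].
// The variant index, derived from the end-of-block position, selects a
// reduced kernel that assumes only the first 1, 8, or 16 input coefficients
// are nonzero, or the full transform. NULL entries are combinations that are
// either handled elsewhere (4x4, identity) or not allowed (e.g. ADST at 32
// points and up).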
static const transform_1d_sse4_1
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
            NULL },
          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
            NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
          idct32x32_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
          idct64x64_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
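// Illustrative example (hypothetical values): for a 16x16 ADST_DCT block with
// eob == 1 (DC only), both eobx and eoby come out as 0, so the lookups in the
// function below would select idct16x16_low1_sse4_1 for the rows and
// iadst16x16_low1_sse4_1 for the columns, skipping most butterfly stages.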

static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  // buf1 holds the full intermediate result: 64 * 16 __m128i is 4096 32-bit
  // values, enough for a 64x64 block stored as 4-wide vectors.
  __m128i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Note: despite the _div8 suffix, this counts 4-wide vector columns.
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick the lightest 1-D kernel that still covers all nonzero coefficients,
  // based on the per-direction end-of-block positions.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

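  // The 2-D inverse transform runs in two passes over 4-wide vectors: the row
  // transform on transposed 4x4 tiles of the input, then the column transform
  // on the intermediate buffer, each followed by its own rounding shift.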
  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
    }
    // Rectangular blocks with a 2:1 aspect ratio are pre-scaled by 1/sqrt(2)
    // to keep the 2-D transform normalization consistent.
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(
          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
    }
    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // Write back: add the residual to the prediction, pack, and clamp to the
  // bit depth.
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                   output + 8 * i, stride, ud_flip,
                                   txfm_size_row, bd);
  }
}

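// Dispatches the 2-D DCT/ADST combinations to the SIMD implementation.
// Identity and 1-D (V_*/H_*) transform types are expected to be routed to
// the C fallbacks by the per-size wrappers above, hence the assert in the
// default branch.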
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    default: assert(0); break;
  }
}

void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_32X32:
      av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X16:
      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X8:
      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
      break;
    case TX_8X16:
      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X8:
      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X32:
      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
      break;
    case TX_32X16:
      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
      break;
    case TX_32X64:
      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
      break;
    case TX_64X32:
      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
      break;
    case TX_4X4:
      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X4:
      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
      break;
    case TX_4X16:
      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
      break;
    case TX_8X32:
      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
      break;
    case TX_32X8:
      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
      break;
    case TX_64X64:
    case TX_16X64:
    case TX_64X16:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(
          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
          txfm_param->eob, txfm_param->bd);
      break;
    default: assert(0 && "Invalid transform size"); break;
  }
}
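
// Usage sketch (illustrative only; the values below are hypothetical, and
// dst8 stands for a CONVERT_TO_BYTEPTR()-wrapped 16-bit frame buffer). In
// practice this function is reached through the RTCD dispatch table rather
// than called directly:
//
//   TxfmParam param;
//   param.tx_type = DCT_DCT;
//   param.tx_size = TX_16X16;
//   param.eob = eob;           // end of block from the coefficient decode
//   param.lossless = 0;
//   param.bd = 10;             // 10-bit content
//   param.tx_set_type = EXT_TX_SET_DCTONLY;
//   av1_highbd_inv_txfm_add_sse4_1(dqcoeff, dst8, stride, &param);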