/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}
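
// A scalar model of the clamp above (an illustrative sketch, not part of the
// build): each 16-bit lane is limited to the valid pixel range [0, 2^bd - 1].
//
//   static int16_t clamp_pixel(int16_t v, int bd) {
//     const int16_t max = (int16_t)((1 << bd) - 1);
//     return v < 0 ? 0 : (v > max ? max : v);
//   }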

static INLINE void round_shift_4x4(__m128i *in, int shift) {
  if (shift != 0) {
    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
    in[0] = _mm_add_epi32(in[0], rnding);
    in[1] = _mm_add_epi32(in[1], rnding);
    in[2] = _mm_add_epi32(in[2], rnding);
    in[3] = _mm_add_epi32(in[3], rnding);

    in[0] = _mm_srai_epi32(in[0], shift);
    in[1] = _mm_srai_epi32(in[1], shift);
    in[2] = _mm_srai_epi32(in[2], shift);
    in[3] = _mm_srai_epi32(in[3], shift);
  }
}
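
// Each lane above gets a rounding right shift:
//   out = (in + 2^(shift-1)) >> shift, i.e. round(in / 2^shift).
// For shift == 2: 7 -> (7 + 2) >> 2 = 2, and -7 -> (-7 + 2) >> 2 = -2.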

static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
                                      const __m128i *clamp_lo,
                                      const __m128i *clamp_hi, int size) {
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_max_epi32(in[i], *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}

static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
  __m128i min_clip_val = _mm_setzero_si128();
  __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_max_epi32(x0, min_clip_val);
  x0 = _mm_min_epi32(x0, max_clip_val);
  x1 = _mm_max_epi32(x1, min_clip_val);
  x1 = _mm_min_epi32(x1, max_clip_val);
  x0 = _mm_packus_epi32(x0, x1);
  return x0;
}

static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
                                                  __m128i res0, const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);

  x0 = _mm_add_epi32(res0, x0);
  x0 = _mm_packus_epi32(x0, x0);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);

    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  }
}

static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}
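
// Layout note: for the 8-wide case the 32-bit residuals of row j are split
// across two registers, the left four columns in in[j] and the right four in
// in[j + height], which is why the 8xn writer reads both halves per row.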

static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  __m128i op[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  load_buffer_4x4(input, op);

  // Shift beforehand.
  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);

  for (int i = 0; i < 2; ++i) {
    transpose_32bit_4x4(op, op);

    __m128i a1 = op[0];
    __m128i c1 = op[1];
    __m128i d1 = op[2];
    __m128i b1 = op[3];
    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
    e1 = _mm_srai_epi32(e1, 1);
    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
    d1 = _mm_add_epi32(d1, c1);  // d1 += c1

    op[0] = a1;
    op[1] = b1;
    op[2] = c1;
    op[3] = d1;
  }

  // Convert to int16_t. The C code checks that we are in range.
  op[0] = _mm_packs_epi32(op[0], op[1]);
  op[1] = _mm_packs_epi32(op[2], op[3]);

  // Load uint16_t.
  __m128i dst[2];
  __m128i tmp[4];
  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);

  // Add to the previous results.
  dst[0] = _mm_add_epi16(dst[0], op[0]);
  dst[1] = _mm_add_epi16(dst[1], op[1]);

  // Clamp.
  dst[0] = highbd_clamp_epi16(dst[0], bd);
  dst[1] = highbd_clamp_epi16(dst[1], bd);

  // Store.
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
  dst[0] = _mm_srli_si128(dst[0], 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
  dst[1] = _mm_srli_si128(dst[1], 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
}
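
// The two passes of the loop above apply the same 1-D inverse WHT shown in
// the inline comments, first along one dimension and then, via the leading
// transpose_32bit_4x4(), along the other, so the 2-D transform stays entirely
// in registers.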

static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}
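
// addsub_sse4_1() is the clamped butterfly used by every transform below:
//   *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1)
// with each 32-bit lane limited to [clamp_lo, clamp_hi].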

static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
                                   const __m128i *clamp_lo,
                                   const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
  __m128i in1_w_offset = _mm_add_epi32(*in1, offset);

  in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
  in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));

  in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
  in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
  in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
  in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);

  *in0 = in0_w_offset;
  *in1 = in1_w_offset;
}

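// The idct32 stage helpers below are built on half_btf_sse4_1() from
// av1_txfm_sse4.h, which (under this file's fixed-point convention) computes
// a half butterfly per 32-bit lane:
//   half_btf(w0, in0, w1, in1) = (w0 * in0 + w1 * in1 + 2^(bit-1)) >> bit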
static INLINE void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}

static INLINE void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi) {
  addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    for (int i = 0; i < 32; i += 8) {
      round_shift_4x4(out + i, out_shift);
      round_shift_4x4(out + i + 4, out_shift);
    }
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}

static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}
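
// neg_shift_sse4_1() performs the same rounding shift as
// shift_and_clamp_sse4_1(), but with the second operand negated first:
//   *out0 = clamp((in0 + 2^(shift-1)) >> shift)
//   *out1 = clamp((-in1 + 2^(shift-1)) >> shift)
// The iadst paths use it for outputs whose sign flips in the final stage.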

static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  // Stage 0
  // Stage 1
  // Stage 2
  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  u0 = _mm_unpacklo_epi64(v0, v2);
  u1 = _mm_unpackhi_epi64(v0, v2);
  u2 = _mm_unpacklo_epi64(v1, v3);
  u3 = _mm_unpackhi_epi64(v1, v3);

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  // Stage 3
  addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
    shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
  }
}
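
// Stage 2 above is the 4-point IDCT butterfly in fixed point. Assuming the
// usual cospi convention (cospi[i] ~ cos(i * pi / 128) * 2^bit), each lane
// computes approximately:
//   v0 ~ (u0 + u2) * cos(pi/4)
//   v1 ~ (u0 - u2) * cos(pi/4)
//   v2 ~ u1 * cos(3*pi/8) - u3 * sin(3*pi/8)
//   v3 ~ u1 * sin(3*pi/8) + u3 * cos(3*pi/8)
// with each scaled sum rounded by (x + 2^(bit-1)) >> bit.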

static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i zero = _mm_set1_epi32(0);
  __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
  rnding = _mm_unpacklo_epi32(rnding, zero);
  const __m128i mul = _mm_set1_epi32(1 << 4);
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;
  __m128i u0_low, u1_low, u2_low, u3_low;
  __m128i u0_high, u1_high, u2_high, u3_high;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  x0 = _mm_unpacklo_epi64(v0, v2);
  x1 = _mm_unpackhi_epi64(v0, v2);
  x2 = _mm_unpacklo_epi64(v1, v3);
  x3 = _mm_unpackhi_epi64(v1, v3);

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  // u0
  u0_low = _mm_mul_epi32(u0, mul);
  u0_low = _mm_add_epi64(u0_low, rnding);

  u0 = _mm_srli_si128(u0, 4);
  u0_high = _mm_mul_epi32(u0, mul);
  u0_high = _mm_add_epi64(u0_high, rnding);

  u0_low = _mm_srli_si128(u0_low, 2);
  u0_high = _mm_srli_si128(u0_high, 2);

  u0 = _mm_unpacklo_epi32(u0_low, u0_high);
  u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
  u0 = _mm_unpacklo_epi64(u0, u0_high);

  // u1
  u1_low = _mm_mul_epi32(u1, mul);
  u1_low = _mm_add_epi64(u1_low, rnding);

  u1 = _mm_srli_si128(u1, 4);
  u1_high = _mm_mul_epi32(u1, mul);
  u1_high = _mm_add_epi64(u1_high, rnding);

  u1_low = _mm_srli_si128(u1_low, 2);
  u1_high = _mm_srli_si128(u1_high, 2);

  u1 = _mm_unpacklo_epi32(u1_low, u1_high);
  u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
  u1 = _mm_unpacklo_epi64(u1, u1_high);

  // u2
  u2_low = _mm_mul_epi32(u2, mul);
  u2_low = _mm_add_epi64(u2_low, rnding);

  u2 = _mm_srli_si128(u2, 4);
  u2_high = _mm_mul_epi32(u2, mul);
  u2_high = _mm_add_epi64(u2_high, rnding);

  u2_low = _mm_srli_si128(u2_low, 2);
  u2_high = _mm_srli_si128(u2_high, 2);

  u2 = _mm_unpacklo_epi32(u2_low, u2_high);
  u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
  u2 = _mm_unpacklo_epi64(u2, u2_high);

  // u3
  u3_low = _mm_mul_epi32(u3, mul);
  u3_low = _mm_add_epi64(u3_low, rnding);

  u3 = _mm_srli_si128(u3, 4);
  u3_high = _mm_mul_epi32(u3, mul);
  u3_high = _mm_add_epi64(u3_high, rnding);

  u3_low = _mm_srli_si128(u3_low, 2);
  u3_high = _mm_srli_si128(u3_high, 2);

  u3 = _mm_unpacklo_epi32(u3_low, u3_high);
  u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
  u3 = _mm_unpacklo_epi64(u3, u3_high);

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }
}
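
// The u0..u3 blocks above implement a rounding shift by `bit` using 64-bit
// intermediates: _mm_mul_epi32 widens each lane while scaling by 2^4, the
// offset adds 2^(bit+3), and the 2-byte _mm_srli_si128 drops 16 fractional
// bits. That equals (u + 2^(bit-1)) >> bit when bit == 12 (INV_COS_BIT), and
// keeps the oversized sinpi sums from overflowing 32 bits.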

static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}

static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  __m128i v[4];
  __m128i zero = _mm_set1_epi32(0);
  __m128i fact = _mm_set1_epi32(NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0_low, a1_low;
  __m128i a0_high, a1_high;

  offset = _mm_unpacklo_epi32(offset, zero);

  for (int i = 0; i < 4; i++) {
    a0_low = _mm_mul_epi32(in[i], fact);
    a0_low = _mm_add_epi32(a0_low, offset);
    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);

    a0_high = _mm_srli_si128(in[i], 4);
    a0_high = _mm_mul_epi32(a0_high, fact);
    a0_high = _mm_add_epi32(a0_high, offset);
    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);

    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  }

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }

  // Transpose for 4x4
  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
  v[3] = _mm_unpackhi_epi32(out[2], out[3]);

  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
}
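
// iidentity4 scales each coefficient by sqrt(2) in fixed point:
//   out = (in * NewSqrt2 + 2^(NewSqrt2Bits - 1)) >> NewSqrt2Bits
// with NewSqrt2 / 2^NewSqrt2Bits ~ sqrt(2) (5793 / 4096 per av1_txfm.h). The
// 64-bit _mm_mul_epi32 path mirrors the widening trick used in iadst4x4.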
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case IDTX:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                        0);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
                        0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_DCT:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                        0);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
                        0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_ADST:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                        0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
                        0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                        0);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
      iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd,
                        0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    default: assert(0);
  }
}
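
// Hypothetical usage sketch (values invented for illustration; in practice
// this entry point is reached through the av1_inv_txfm2d_add_4x4 RTCD hook):
//
//   DECLARE_ALIGNED(16, int32_t, coeffs[4 * 4]) = { /* dequantized coeffs */ };
//   uint16_t *dst = /* high-bitdepth reconstruction buffer */;
//   av1_inv_txfm2d_add_4x4_sse4_1(coeffs, dst, dst_stride, DCT_DCT, /*bd=*/10);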

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  //  Even columns: 0, 2, ..., 14
  //  Odd columns: 1, 3, ..., 15
  //  One even column plus one odd column makes up one row (8 coeffs);
  //  in total there are 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
                  &clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}
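
// Range note for idct8x8: the row pass (do_cols == 0) clamps intermediates to
// max(16, bd + 8) bits and narrows its shifted outputs to max(16, bd + 6)
// bits; the column pass (do_cols == 1) clamps to max(16, bd + 6) throughout.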

static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
1321 
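// Identity (IDTX) 8-point transform: every output is the input scaled by 2.
// For the row pass (!do_cols) the results are additionally round-shifted and
// clamped to the intermediate range.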
static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  out[0] = _mm_add_epi32(in[0], in[0]);
  out[1] = _mm_add_epi32(in[1], in[1]);
  out[2] = _mm_add_epi32(in[2], in[2]);
  out[3] = _mm_add_epi32(in[3], in[3]);
  out[4] = _mm_add_epi32(in[4], in[4]);
  out[5] = _mm_add_epi32(in[5], in[5]);
  out[6] = _mm_add_epi32(in[6], in[6]);
  out[7] = _mm_add_epi32(in[7], in[7]);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
  }
}

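// Adds a row of 32-bit residuals (res_lo, res_hi) to eight 16-bit predictor
// samples, optionally reversing the residual order for left/right-flipped
// transforms, then packs and clamps the result to the target bit depth.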
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
                             int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);
  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

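// Rounds and shifts the 8x8 residual block, adds it to the prediction already
// in |output| (honoring left/right and up/down flips), and stores the
// bit-depth-clamped reconstruction back to |output|.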
static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

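// 2-D 8x8 high-bitdepth inverse transform and reconstruction. Every tx_type
// follows the same pattern: row transform, transpose, column transform, then
// add the residual to the prediction. The FLIPADST variants reuse the ADST
// kernels and apply the flips at write-back time.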
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

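// 8-point IDCT fast path for blocks where only the DC coefficient is nonzero:
// the butterfly network collapses to a single multiply by cospi[32], and the
// result is broadcast to all eight outputs.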
static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm_mullo_epi32(in[0], cospi32);
  x = _mm_add_epi32(x, rnding);
  x = _mm_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    x = _mm_add_epi32(x, offset);
    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }

  x = _mm_max_epi32(x, clamp_lo);
  x = _mm_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}

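// Full 8-point IDCT on 32-bit lanes. Each butterfly multiplies by the cospi
// constants, adds the rounding term and arithmetic-shifts by |bit|, while
// addsub_sse4_1() keeps intermediates inside the clamp range.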
static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm_mullo_epi32(in[1], cospi56);
  y = _mm_mullo_epi32(in[7], cospim8);
  u4 = _mm_add_epi32(x, y);
  u4 = _mm_add_epi32(u4, rnding);
  u4 = _mm_srai_epi32(u4, bit);

  x = _mm_mullo_epi32(in[1], cospi8);
  y = _mm_mullo_epi32(in[7], cospi56);
  u7 = _mm_add_epi32(x, y);
  u7 = _mm_add_epi32(u7, rnding);
  u7 = _mm_srai_epi32(u7, bit);

  x = _mm_mullo_epi32(in[5], cospi24);
  y = _mm_mullo_epi32(in[3], cospim40);
  u5 = _mm_add_epi32(x, y);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  x = _mm_mullo_epi32(in[5], cospi40);
  y = _mm_mullo_epi32(in[3], cospi24);
  u6 = _mm_add_epi32(x, y);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  // stage 3
  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u1, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u2, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u2, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm_mullo_epi32(v5, cospi32);
  y = _mm_mullo_epi32(v6, cospi32);
  u6 = _mm_add_epi32(y, x);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  u5 = _mm_sub_epi32(y, x);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  // stage 5
  addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}

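// 8-point IADST fast path for blocks where only the first coefficient is
// nonzero, so stage 2 reduces to the cospi4/cospi60 pair.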
static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                 int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  __m128i u[8], x;

  // stage 0
  // stage 1
  // stage 2

  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(kZero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // stage 3
  // stage 4
  __m128i temp1, temp2;
  temp1 = _mm_mullo_epi32(u[0], cospi16);
  x = _mm_mullo_epi32(u[1], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);
  u[4] = temp1;

  temp2 = _mm_mullo_epi32(u[0], cospi48);
  x = _mm_mullo_epi32(u[1], cospi16);
  u[5] = _mm_sub_epi32(temp2, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // stage 5
  // stage 6
  temp1 = _mm_mullo_epi32(u[0], cospi32);
  x = _mm_mullo_epi32(u[1], cospi32);
  u[2] = _mm_add_epi32(temp1, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(temp1, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  temp1 = _mm_mullo_epi32(u[4], cospi32);
  x = _mm_mullo_epi32(u[5], cospi32);
  u[6] = _mm_add_epi32(temp1, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(temp1, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}

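// Full 8-point IADST on 32-bit lanes. Stage 7 negates the odd outputs:
// directly for the column pass, or via neg_shift_sse4_1() (negate plus
// round-shift) for the row pass.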
static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // stage 0
  // stage 1
  // stage 2

  u[0] = _mm_mullo_epi32(in[7], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[7], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[5], cospi20);
  x = _mm_mullo_epi32(in[2], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[5], cospi44);
  x = _mm_mullo_epi32(in[2], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[3], cospi36);
  x = _mm_mullo_epi32(in[4], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[3], cospi28);
  x = _mm_mullo_epi32(in[4], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[1], cospi52);
  x = _mm_mullo_epi32(in[6], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[1], cospi12);
  x = _mm_mullo_epi32(in[6], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}

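// 16-point IDCT fast path for DC-only blocks.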
static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  // stage 0
  // stage 1
  // stage 2
  // stage 3
  // stage 4
  in[0] = _mm_mullo_epi32(in[0], cospi32);
  in[0] = _mm_add_epi32(in[0], rnding);
  in[0] = _mm_srai_epi32(in[0], bit);

  // stage 5
  // stage 6
  // stage 7
  if (!do_cols) {
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    if (out_shift != 0) {
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      in[0] = _mm_add_epi32(in[0], offset);
      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
    }
  }

  in[0] = _mm_max_epi32(in[0], clamp_lo);
  in[0] = _mm_min_epi32(in[0], clamp_hi);
  out[0] = in[0];
  out[1] = in[0];
  out[2] = in[0];
  out[3] = in[0];
  out[4] = in[0];
  out[5] = in[0];
  out[6] = in[0];
  out[7] = in[0];
  out[8] = in[0];
  out[9] = in[0];
  out[10] = in[0];
  out[11] = in[0];
  out[12] = in[0];
  out[13] = in[0];
  out[14] = in[0];
  out[15] = in[0];
}

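// 16-point IDCT for blocks whose nonzero coefficients all lie within the
// first eight inputs; half_btf_0_sse4_1() handles the butterflies in which
// one operand is known to be zero.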
static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], x, y;
  // stage 0
  // stage 1
  u[0] = in[0];
  u[2] = in[4];
  u[4] = in[2];
  u[6] = in[6];
  u[8] = in[1];
  u[10] = in[5];
  u[12] = in[3];
  u[14] = in[7];

  // stage 2
  u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
  u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);

  u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
  u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);

  u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
  u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);

  u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
  u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

  // stage 3
  u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
  u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
  u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
  u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);

  addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

  // stage 4
  x = _mm_mullo_epi32(u[0], cospi32);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);
  u[1] = u[0];

  u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
  u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);

  addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);

  x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
  u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
  u[9] = x;
  y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
  u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
  u[10] = y;

  // stage 5
  addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

  x = _mm_mullo_epi32(u[5], cospi32);
  y = _mm_mullo_epi32(u[6], cospi32);
  u[5] = _mm_sub_epi32(y, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_add_epi32(y, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);

  x = _mm_mullo_epi32(u[10], cospi32);
  y = _mm_mullo_epi32(u[13], cospi32);
  u[10] = _mm_sub_epi32(y, x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[13] = _mm_add_epi32(x, y);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[11], cospi32);
  y = _mm_mullo_epi32(u[12], cospi32);
  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  u[12] = _mm_add_epi32(x, y);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  // stage 7
  addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}

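// 16-point IADST fast path for blocks where only the first coefficient is
// nonzero.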
static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i v[16], x, y, temp1, temp2;
  // stage 0
  // stage 1
  // stage 2
  x = _mm_mullo_epi32(in[0], cospi62);
  v[0] = _mm_add_epi32(x, rnding);
  v[0] = _mm_srai_epi32(v[0], bit);

  x = _mm_mullo_epi32(in[0], cospi2);
  v[1] = _mm_sub_epi32(zero, x);
  v[1] = _mm_add_epi32(v[1], rnding);
  v[1] = _mm_srai_epi32(v[1], bit);

  // stage 3
  v[8] = v[0];
  v[9] = v[1];

  // stage 4
  temp1 = _mm_mullo_epi32(v[8], cospi8);
  x = _mm_mullo_epi32(v[9], cospi56);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[8], cospi56);
  x = _mm_mullo_epi32(v[9], cospi8);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[8] = temp1;
  v[9] = temp2;

  // stage 5
  v[4] = v[0];
  v[5] = v[1];
  v[12] = v[8];
  v[13] = v[9];

  // stage 6
  temp1 = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[4] = temp1;
  v[5] = temp2;

  temp1 = _mm_mullo_epi32(v[12], cospi16);
  x = _mm_mullo_epi32(v[13], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[12], cospi48);
  x = _mm_mullo_epi32(v[13], cospi16);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[12] = temp1;
  v[13] = temp2;

  // stage 7
  v[2] = v[0];
  v[3] = v[1];
  v[6] = v[4];
  v[7] = v[5];
  v[10] = v[8];
  v[11] = v[9];
  v[14] = v[12];
  v[15] = v[13];

  // stage 8
  y = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  v[2] = _mm_add_epi32(y, x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_sub_epi32(y, x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  y = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  v[6] = _mm_add_epi32(y, x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_sub_epi32(y, x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  y = _mm_mullo_epi32(v[10], cospi32);
  x = _mm_mullo_epi32(v[11], cospi32);
  v[10] = _mm_add_epi32(y, x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_sub_epi32(y, x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  y = _mm_mullo_epi32(v[14], cospi32);
  x = _mm_mullo_epi32(v[15], cospi32);
  v[14] = _mm_add_epi32(y, x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_sub_epi32(y, x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = v[0];
    out[1] = _mm_sub_epi32(zero, v[8]);
    out[2] = v[12];
    out[3] = _mm_sub_epi32(zero, v[4]);
    out[4] = v[6];
    out[5] = _mm_sub_epi32(zero, v[14]);
    out[6] = v[10];
    out[7] = _mm_sub_epi32(zero, v[2]);
    out[8] = v[3];
    out[9] = _mm_sub_epi32(zero, v[11]);
    out[10] = v[15];
    out[11] = _mm_sub_epi32(zero, v[7]);
    out[12] = v[5];
    out[13] = _mm_sub_epi32(zero, v[13]);
    out[14] = v[9];
    out[15] = _mm_sub_epi32(zero, v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

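// 16-point IADST for blocks whose nonzero coefficients all lie within the
// first eight inputs.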
static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i zero = _mm_setzero_si128();
  __m128i u[16], x, y;

  // stage 0
  // stage 1
  // stage 2
  x = _mm_mullo_epi32(in[0], cospi62);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi2);
  u[1] = _mm_sub_epi32(zero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  x = _mm_mullo_epi32(in[2], cospi54);
  u[2] = _mm_add_epi32(x, rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  x = _mm_mullo_epi32(in[2], cospi10);
  u[3] = _mm_sub_epi32(zero, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  x = _mm_mullo_epi32(in[4], cospi46);
  u[4] = _mm_add_epi32(x, rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  x = _mm_mullo_epi32(in[4], cospi18);
  u[5] = _mm_sub_epi32(zero, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  x = _mm_mullo_epi32(in[6], cospi38);
  u[6] = _mm_add_epi32(x, rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  x = _mm_mullo_epi32(in[6], cospi26);
  u[7] = _mm_sub_epi32(zero, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  u[8] = _mm_mullo_epi32(in[7], cospi34);
  u[8] = _mm_add_epi32(u[8], rnding);
  u[8] = _mm_srai_epi32(u[8], bit);

  u[9] = _mm_mullo_epi32(in[7], cospi30);
  u[9] = _mm_add_epi32(u[9], rnding);
  u[9] = _mm_srai_epi32(u[9], bit);

  u[10] = _mm_mullo_epi32(in[5], cospi42);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[11] = _mm_mullo_epi32(in[5], cospi22);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  u[12] = _mm_mullo_epi32(in[3], cospi50);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  u[13] = _mm_mullo_epi32(in[3], cospi14);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  u[14] = _mm_mullo_epi32(in[1], cospi58);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  u[15] = _mm_mullo_epi32(in[1], cospi6);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 3
  addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

  // stage 4
  y = _mm_mullo_epi32(u[8], cospi56);
  x = _mm_mullo_epi32(u[9], cospi56);
  u[8] = _mm_mullo_epi32(u[8], cospi8);
  u[8] = _mm_add_epi32(u[8], x);
  u[8] = _mm_add_epi32(u[8], rnding);
  u[8] = _mm_srai_epi32(u[8], bit);

  x = _mm_mullo_epi32(u[9], cospi8);
  u[9] = _mm_sub_epi32(y, x);
  u[9] = _mm_add_epi32(u[9], rnding);
  u[9] = _mm_srai_epi32(u[9], bit);

  x = _mm_mullo_epi32(u[11], cospi24);
  y = _mm_mullo_epi32(u[10], cospi24);
  u[10] = _mm_mullo_epi32(u[10], cospi40);
  u[10] = _mm_add_epi32(u[10], x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  x = _mm_mullo_epi32(u[11], cospi40);
  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  x = _mm_mullo_epi32(u[13], cospi8);
  y = _mm_mullo_epi32(u[12], cospi8);
  u[12] = _mm_mullo_epi32(u[12], cospim56);
  u[12] = _mm_add_epi32(u[12], x);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  x = _mm_mullo_epi32(u[13], cospim56);
  u[13] = _mm_sub_epi32(y, x);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[15], cospi40);
  y = _mm_mullo_epi32(u[14], cospi40);
  u[14] = _mm_mullo_epi32(u[14], cospim24);
  u[14] = _mm_add_epi32(u[14], x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  x = _mm_mullo_epi32(u[15], cospim24);
  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 5
  addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

  // stage 6
  x = _mm_mullo_epi32(u[5], cospi48);
  y = _mm_mullo_epi32(u[4], cospi48);
  u[4] = _mm_mullo_epi32(u[4], cospi16);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  x = _mm_mullo_epi32(u[5], cospi16);
  u[5] = _mm_sub_epi32(y, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  x = _mm_mullo_epi32(u[7], cospi16);
  y = _mm_mullo_epi32(u[6], cospi16);
  u[6] = _mm_mullo_epi32(u[6], cospim48);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  x = _mm_mullo_epi32(u[7], cospim48);
  u[7] = _mm_sub_epi32(y, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  x = _mm_mullo_epi32(u[13], cospi48);
  y = _mm_mullo_epi32(u[12], cospi48);
  u[12] = _mm_mullo_epi32(u[12], cospi16);
  u[12] = _mm_add_epi32(u[12], x);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  x = _mm_mullo_epi32(u[13], cospi16);
  u[13] = _mm_sub_epi32(y, x);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[15], cospi16);
  y = _mm_mullo_epi32(u[14], cospi16);
  u[14] = _mm_mullo_epi32(u[14], cospim48);
  u[14] = _mm_add_epi32(u[14], x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  x = _mm_mullo_epi32(u[15], cospim48);
  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 7
  addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

  // stage 8
  y = _mm_mullo_epi32(u[2], cospi32);
  x = _mm_mullo_epi32(u[3], cospi32);
  u[2] = _mm_add_epi32(y, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(y, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  y = _mm_mullo_epi32(u[6], cospi32);
  x = _mm_mullo_epi32(u[7], cospi32);
  u[6] = _mm_add_epi32(y, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(y, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  y = _mm_mullo_epi32(u[10], cospi32);
  x = _mm_mullo_epi32(u[11], cospi32);
  u[10] = _mm_add_epi32(y, x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  y = _mm_mullo_epi32(u[14], cospi32);
  x = _mm_mullo_epi32(u[15], cospi32);
  u[14] = _mm_add_epi32(y, x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(zero, u[8]);
    out[2] = u[12];
    out[3] = _mm_sub_epi32(zero, u[4]);
    out[4] = u[6];
    out[5] = _mm_sub_epi32(zero, u[14]);
    out[6] = u[10];
    out[7] = _mm_sub_epi32(zero, u[2]);
    out[8] = u[3];
    out[9] = _mm_sub_epi32(zero, u[11]);
    out[10] = u[15];
    out[11] = _mm_sub_epi32(zero, u[7]);
    out[12] = u[5];
    out[13] = _mm_sub_epi32(zero, u[13]);
    out[14] = u[9];
    out[15] = _mm_sub_epi32(zero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

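// Full 16-point IDCT on 32-bit lanes.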
idct16x16_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)2602 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2603                              int bd, int out_shift) {
2604   const int32_t *cospi = cospi_arr(bit);
2605   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2606   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2607   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2608   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2609   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2610   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2611   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2612   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2613   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2614   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2615   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2616   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2617   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2618   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2619   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2620   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2621   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2622   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2623   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2624   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2625   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2626   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2627   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2628   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2629   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2630   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2631   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2632   __m128i u[16], v[16], x, y;
2633 
2634   {
2635     // stage 0
2636     // stage 1
2637     u[0] = in[0];
2638     u[1] = in[8];
2639     u[2] = in[4];
2640     u[3] = in[12];
2641     u[4] = in[2];
2642     u[5] = in[10];
2643     u[6] = in[6];
2644     u[7] = in[14];
2645     u[8] = in[1];
2646     u[9] = in[9];
2647     u[10] = in[5];
2648     u[11] = in[13];
2649     u[12] = in[3];
2650     u[13] = in[11];
2651     u[14] = in[7];
2652     u[15] = in[15];
2653 
2654     // stage 2
2655     v[0] = u[0];
2656     v[1] = u[1];
2657     v[2] = u[2];
2658     v[3] = u[3];
2659     v[4] = u[4];
2660     v[5] = u[5];
2661     v[6] = u[6];
2662     v[7] = u[7];
2663 
    v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
    v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
    v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);

    // stage 3
    u[0] = v[0];
    u[1] = v[1];
    u[2] = v[2];
    u[3] = v[3];
    u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
    u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
    addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

    // stage 4
    x = _mm_mullo_epi32(u[0], cospi32);
    y = _mm_mullo_epi32(u[1], cospi32);
    v[0] = _mm_add_epi32(x, y);
    v[0] = _mm_add_epi32(v[0], rnding);
    v[0] = _mm_srai_epi32(v[0], bit);

    v[1] = _mm_sub_epi32(x, y);
    v[1] = _mm_add_epi32(v[1], rnding);
    v[1] = _mm_srai_epi32(v[1], bit);

    v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
    v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[11] = u[11];
    v[12] = u[12];
    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    v[15] = u[15];

    // stage 5
    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
    u[4] = v[4];

    x = _mm_mullo_epi32(v[5], cospi32);
    y = _mm_mullo_epi32(v[6], cospi32);
    u[5] = _mm_sub_epi32(y, x);
    u[5] = _mm_add_epi32(u[5], rnding);
    u[5] = _mm_srai_epi32(u[5], bit);

    u[6] = _mm_add_epi32(y, x);
    u[6] = _mm_add_epi32(u[6], rnding);
    u[6] = _mm_srai_epi32(u[6], bit);

    u[7] = v[7];
    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    // stage 6
    addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
    v[8] = u[8];
    v[9] = u[9];

    x = _mm_mullo_epi32(u[10], cospi32);
    y = _mm_mullo_epi32(u[13], cospi32);
    v[10] = _mm_sub_epi32(y, x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[13] = _mm_add_epi32(x, y);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    x = _mm_mullo_epi32(u[11], cospi32);
    y = _mm_mullo_epi32(u[12], cospi32);
    v[11] = _mm_sub_epi32(y, x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = _mm_add_epi32(x, y);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    v[14] = u[14];
    v[15] = u[15];

    // stage 7
    addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
      round_shift_8x8(out, out_shift);
      highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
    }
  }
}

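// 16-point inverse ADST, processing four columns of 32-bit coefficients per
// call. The butterfly network matches the C reference av1_iadst16: every
// add/sub is clamped to the bd-derived intermediate range, and in the row
// pass (do_cols == 0) the outputs are additionally round-shifted by
// out_shift, with the sign flips of the final stage folded into
// neg_shift_sse4_1, then clamped to AOMMAX(16, bd + 6) bits.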
static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  const __m128i zero = _mm_setzero_si128();
  __m128i u[16], v[16], x, y;
  // Calculate columns 0, 1, 2, 3
  // stage 0
  // stage 1
  // stage 2
  v[0] = _mm_mullo_epi32(in[15], cospi2);
  x = _mm_mullo_epi32(in[0], cospi62);
  v[0] = _mm_add_epi32(v[0], x);
  v[0] = _mm_add_epi32(v[0], rnding);
  v[0] = _mm_srai_epi32(v[0], bit);

  v[1] = _mm_mullo_epi32(in[15], cospi62);
  x = _mm_mullo_epi32(in[0], cospi2);
  v[1] = _mm_sub_epi32(v[1], x);
  v[1] = _mm_add_epi32(v[1], rnding);
  v[1] = _mm_srai_epi32(v[1], bit);

  v[2] = _mm_mullo_epi32(in[13], cospi10);
  x = _mm_mullo_epi32(in[2], cospi54);
  v[2] = _mm_add_epi32(v[2], x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_mullo_epi32(in[13], cospi54);
  x = _mm_mullo_epi32(in[2], cospi10);
  v[3] = _mm_sub_epi32(v[3], x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  v[4] = _mm_mullo_epi32(in[11], cospi18);
  x = _mm_mullo_epi32(in[4], cospi46);
  v[4] = _mm_add_epi32(v[4], x);
  v[4] = _mm_add_epi32(v[4], rnding);
  v[4] = _mm_srai_epi32(v[4], bit);

  v[5] = _mm_mullo_epi32(in[11], cospi46);
  x = _mm_mullo_epi32(in[4], cospi18);
  v[5] = _mm_sub_epi32(v[5], x);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  v[6] = _mm_mullo_epi32(in[9], cospi26);
  x = _mm_mullo_epi32(in[6], cospi38);
  v[6] = _mm_add_epi32(v[6], x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_mullo_epi32(in[9], cospi38);
  x = _mm_mullo_epi32(in[6], cospi26);
  v[7] = _mm_sub_epi32(v[7], x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  v[8] = _mm_mullo_epi32(in[7], cospi34);
  x = _mm_mullo_epi32(in[8], cospi30);
  v[8] = _mm_add_epi32(v[8], x);
  v[8] = _mm_add_epi32(v[8], rnding);
  v[8] = _mm_srai_epi32(v[8], bit);

  v[9] = _mm_mullo_epi32(in[7], cospi30);
  x = _mm_mullo_epi32(in[8], cospi34);
  v[9] = _mm_sub_epi32(v[9], x);
  v[9] = _mm_add_epi32(v[9], rnding);
  v[9] = _mm_srai_epi32(v[9], bit);

  v[10] = _mm_mullo_epi32(in[5], cospi42);
  x = _mm_mullo_epi32(in[10], cospi22);
  v[10] = _mm_add_epi32(v[10], x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_mullo_epi32(in[5], cospi22);
  x = _mm_mullo_epi32(in[10], cospi42);
  v[11] = _mm_sub_epi32(v[11], x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  v[12] = _mm_mullo_epi32(in[3], cospi50);
  x = _mm_mullo_epi32(in[12], cospi14);
  v[12] = _mm_add_epi32(v[12], x);
  v[12] = _mm_add_epi32(v[12], rnding);
  v[12] = _mm_srai_epi32(v[12], bit);

  v[13] = _mm_mullo_epi32(in[3], cospi14);
  x = _mm_mullo_epi32(in[12], cospi50);
  v[13] = _mm_sub_epi32(v[13], x);
  v[13] = _mm_add_epi32(v[13], rnding);
  v[13] = _mm_srai_epi32(v[13], bit);

  v[14] = _mm_mullo_epi32(in[1], cospi58);
  x = _mm_mullo_epi32(in[14], cospi6);
  v[14] = _mm_add_epi32(v[14], x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_mullo_epi32(in[1], cospi6);
  x = _mm_mullo_epi32(in[14], cospi58);
  v[15] = _mm_sub_epi32(v[15], x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 3
  addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

  // stage 4
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];
  v[4] = u[4];
  v[5] = u[5];
  v[6] = u[6];
  v[7] = u[7];

  v[8] = _mm_mullo_epi32(u[8], cospi8);
  x = _mm_mullo_epi32(u[9], cospi56);
  v[8] = _mm_add_epi32(v[8], x);
  v[8] = _mm_add_epi32(v[8], rnding);
  v[8] = _mm_srai_epi32(v[8], bit);

  v[9] = _mm_mullo_epi32(u[8], cospi56);
  x = _mm_mullo_epi32(u[9], cospi8);
  v[9] = _mm_sub_epi32(v[9], x);
  v[9] = _mm_add_epi32(v[9], rnding);
  v[9] = _mm_srai_epi32(v[9], bit);

  v[10] = _mm_mullo_epi32(u[10], cospi40);
  x = _mm_mullo_epi32(u[11], cospi24);
  v[10] = _mm_add_epi32(v[10], x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_mullo_epi32(u[10], cospi24);
  x = _mm_mullo_epi32(u[11], cospi40);
  v[11] = _mm_sub_epi32(v[11], x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  v[12] = _mm_mullo_epi32(u[12], cospim56);
  x = _mm_mullo_epi32(u[13], cospi8);
  v[12] = _mm_add_epi32(v[12], x);
  v[12] = _mm_add_epi32(v[12], rnding);
  v[12] = _mm_srai_epi32(v[12], bit);

  v[13] = _mm_mullo_epi32(u[12], cospi8);
  x = _mm_mullo_epi32(u[13], cospim56);
  v[13] = _mm_sub_epi32(v[13], x);
  v[13] = _mm_add_epi32(v[13], rnding);
  v[13] = _mm_srai_epi32(v[13], bit);

  v[14] = _mm_mullo_epi32(u[14], cospim24);
  x = _mm_mullo_epi32(u[15], cospi40);
  v[14] = _mm_add_epi32(v[14], x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_mullo_epi32(u[14], cospi40);
  x = _mm_mullo_epi32(u[15], cospim24);
  v[15] = _mm_sub_epi32(v[15], x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 5
  addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

  // stage 6
  v[0] = u[0];
  v[1] = u[1];
  v[2] = u[2];
  v[3] = u[3];

  v[4] = _mm_mullo_epi32(u[4], cospi16);
  x = _mm_mullo_epi32(u[5], cospi48);
  v[4] = _mm_add_epi32(v[4], x);
  v[4] = _mm_add_epi32(v[4], rnding);
  v[4] = _mm_srai_epi32(v[4], bit);

  v[5] = _mm_mullo_epi32(u[4], cospi48);
  x = _mm_mullo_epi32(u[5], cospi16);
  v[5] = _mm_sub_epi32(v[5], x);
  v[5] = _mm_add_epi32(v[5], rnding);
  v[5] = _mm_srai_epi32(v[5], bit);

  v[6] = _mm_mullo_epi32(u[6], cospim48);
  x = _mm_mullo_epi32(u[7], cospi16);
  v[6] = _mm_add_epi32(v[6], x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_mullo_epi32(u[6], cospi16);
  x = _mm_mullo_epi32(u[7], cospim48);
  v[7] = _mm_sub_epi32(v[7], x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  v[8] = u[8];
  v[9] = u[9];
  v[10] = u[10];
  v[11] = u[11];

  v[12] = _mm_mullo_epi32(u[12], cospi16);
  x = _mm_mullo_epi32(u[13], cospi48);
  v[12] = _mm_add_epi32(v[12], x);
  v[12] = _mm_add_epi32(v[12], rnding);
  v[12] = _mm_srai_epi32(v[12], bit);

  v[13] = _mm_mullo_epi32(u[12], cospi48);
  x = _mm_mullo_epi32(u[13], cospi16);
  v[13] = _mm_sub_epi32(v[13], x);
  v[13] = _mm_add_epi32(v[13], rnding);
  v[13] = _mm_srai_epi32(v[13], bit);

  v[14] = _mm_mullo_epi32(u[14], cospim48);
  x = _mm_mullo_epi32(u[15], cospi16);
  v[14] = _mm_add_epi32(v[14], x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_mullo_epi32(u[14], cospi16);
  x = _mm_mullo_epi32(u[15], cospim48);
  v[15] = _mm_sub_epi32(v[15], x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 7
  addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

  // stage 8
  v[0] = u[0];
  v[1] = u[1];

  y = _mm_mullo_epi32(u[2], cospi32);
  x = _mm_mullo_epi32(u[3], cospi32);
  v[2] = _mm_add_epi32(y, x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_sub_epi32(y, x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  v[4] = u[4];
  v[5] = u[5];

  y = _mm_mullo_epi32(u[6], cospi32);
  x = _mm_mullo_epi32(u[7], cospi32);
  v[6] = _mm_add_epi32(y, x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_sub_epi32(y, x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  v[8] = u[8];
  v[9] = u[9];

  y = _mm_mullo_epi32(u[10], cospi32);
  x = _mm_mullo_epi32(u[11], cospi32);
  v[10] = _mm_add_epi32(y, x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_sub_epi32(y, x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  v[12] = u[12];
  v[13] = u[13];

  y = _mm_mullo_epi32(u[14], cospi32);
  x = _mm_mullo_epi32(u[15], cospi32);
  v[14] = _mm_add_epi32(y, x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_sub_epi32(y, x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = v[0];
    out[1] = _mm_sub_epi32(zero, v[8]);
    out[2] = v[12];
    out[3] = _mm_sub_epi32(zero, v[4]);
    out[4] = v[6];
    out[5] = _mm_sub_epi32(zero, v[14]);
    out[6] = v[10];
    out[7] = _mm_sub_epi32(zero, v[2]);
    out[8] = v[3];
    out[9] = _mm_sub_epi32(zero, v[11]);
    out[10] = v[15];
    out[11] = _mm_sub_epi32(zero, v[7]);
    out[12] = v[5];
    out[13] = _mm_sub_epi32(zero, v[13]);
    out[14] = v[9];
    out[15] = _mm_sub_epi32(zero, v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}
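// 16-point identity transform (IDTX): each coefficient is scaled by
// 2 * sqrt(2), stored as 2 * NewSqrt2 in Q(NewSqrt2Bits) fixed point.
// _mm_mul_epi32 only multiplies the even 32-bit lanes, so the odd lanes are
// shifted down first and the two 64-bit product vectors are interleaved back
// together after the shift by NewSqrt2Bits.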
static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  (void)bit;
  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0_low, a0_high, a1_low, a1_high;
  __m128i zero = _mm_setzero_si128();
  offset = _mm_unpacklo_epi32(offset, zero);

  for (int i = 0; i < 16; i++) {
    a0_low = _mm_mul_epi32(in[i], fact);
    a0_low = _mm_add_epi32(a0_low, offset);
    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);

    a0_high = _mm_srli_si128(in[i], 4);
    a0_high = _mm_mul_epi32(a0_high, fact);
    a0_high = _mm_add_epi32(a0_high, offset);
    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);

    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  }

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
  }
}
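// Stage 8 of the 64-point inverse DCT, shared by the low8/low16/full paths:
// cospi32 rotations on the u[10..13] pairs, add/sub butterflies across
// u[16..31], and (16, 48) rotations pairing u[36..43] with u[52..59].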
static INLINE void idct64_stage8_sse4_1(
    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
  u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
  u[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
  u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
  u[11] = temp2;

  for (i = 16; i < 20; ++i) {
    addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
                  clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
  temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
  temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
  temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
  u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
  u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
  u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
  u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}

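// Stage 9 of the 64-point inverse DCT: butterflies across u[0..15], cospi32
// rotations pairing u[20..23] with u[27..24], and butterflies pairing
// u[32..39] with u[47..40] and u[48..55] with u[63..56].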
static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  for (i = 0; i < 8; ++i) {
    addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
  u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
  u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
  u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
  u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  for (i = 32; i < 40; i++) {
    addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  for (i = 48; i < 56; i++) {
    addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}

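// Stage 10 of the 64-point inverse DCT: butterflies pairing u[0..15] with
// u[31..16], then cospi32 rotations pairing u[40..47] with u[55..48].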
static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
                                         const __m128i *cospi32,
                                         const __m128i *clamp_lo,
                                         const __m128i *clamp_hi,
                                         const __m128i *rnding, int bit) {
  __m128i temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
  u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
  u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
  u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
  u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}

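// Stage 11 (final stage) of the 64-point inverse DCT: butterflies pairing
// u[0..31] with u[63..32] directly into out[], plus the row-pass round shift
// and clamp when do_cols is 0.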
static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
                                         int bd, int out_shift,
                                         const __m128i *clamp_lo,
                                         const __m128i *clamp_hi) {
  for (int i = 0; i < 32; i++) {
    addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    for (int i = 0; i < 64; i += 4) {
      round_shift_4x4(out + i, out_shift);
      highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
                                4);
    }
  }
}

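// 64x64 inverse DCT with only the DC coefficient non-zero: every stage
// reduces to scaling in[0] by cospi32, so the single rounded (and, in the
// row pass, shifted) value is clamped and broadcast to all 64 output
// vectors.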
static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);

  {
    __m128i x;

    // stage 1
    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6
    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);

    // stage 7
    // stage 8
    // stage 9
    // stage 10
    // stage 11
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
      if (out_shift != 0) {
        __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
        x = _mm_add_epi32(x, offset);
        x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
      }
    }
    x = _mm_max_epi32(x, clamp_lo);
    x = _mm_min_epi32(x, clamp_hi);
    for (int i = 0; i < 64; ++i) {
      out[i] = x;
    }
  }
}

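// 64x64 inverse DCT with the non-zero coefficients confined to the top-left
// 8x8 block (in[0..7]); stages 1-7 are specialized for those inputs and
// stages 8-11 reuse the shared helpers above.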
static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);

  {
    __m128i u[64];

    // stage 1
    u[0] = in[0];
    u[8] = in[4];
    u[16] = in[2];
    u[24] = in[6];
    u[32] = in[1];
    u[40] = in[5];
    u[48] = in[3];
    u[56] = in[7];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[38] = u[39];
    u[41] = u[40];
    u[46] = u[47];
    u[49] = u[48];
    u[54] = u[55];
    u[57] = u[56];
    u[62] = u[63];

    // stage 4
    __m128i temp1, temp2;
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[17] = u[16];
    u[22] = u[23];
    u[25] = u[24];
    u[30] = u[31];

    temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = temp1;

    temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = temp2;

    temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = temp1;

    temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[46] = temp2;

    // stage 5
    u[9] = u[8];
    u[14] = u[15];

    temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = temp1;

    temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[22] = temp2;

    u[35] = u[32];
    u[34] = u[33];
    u[36] = u[39];
    u[37] = u[38];
    u[43] = u[40];
    u[42] = u[41];
    u[44] = u[47];
    u[45] = u[46];
    u[51] = u[48];
    u[50] = u[49];
    u[52] = u[55];
    u[53] = u[54];
    u[59] = u[56];
    u[58] = u[57];
    u[60] = u[63];
    u[61] = u[62];

    // stage 6
    temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = temp1;

    temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = temp2;
    u[19] = u[16];
    u[18] = u[17];
    u[20] = u[23];
    u[21] = u[22];
    u[27] = u[24];
    u[26] = u[25];
    u[28] = u[31];
    u[29] = u[30];

    temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = temp1;
    temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[35] = temp2;
    temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[36] = temp1;
    temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[37] = temp2;
    temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = temp1;
    temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[43] = temp2;
    temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[44] = temp1;
    temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[45] = temp2;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    u[11] = u[8];
    u[10] = u[9];
    u[12] = u[15];
    u[13] = u[14];

    temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = temp1;
    temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[19] = temp2;
    temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[20] = temp1;
    temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[21] = temp2;
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    u[7] = u[0];
    u[6] = u[1];
    u[5] = u[2];
    u[4] = u[3];

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}

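// 64x64 inverse DCT with the non-zero coefficients confined to the top-left
// 16x16 block (in[0..15]); as in the low8 path, stages 8-11 use the shared
// helpers.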
static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64];
    __m128i tmp1, tmp2, tmp3, tmp4;
    // stage 1
    u[0] = in[0];
    u[32] = in[1];
    u[36] = in[9];
    u[40] = in[5];
    u[44] = in[13];
    u[48] = in[3];
    u[52] = in[11];
    u[56] = in[7];
    u[60] = in[15];
    u[16] = in[2];
    u[20] = in[10];
    u[24] = in[6];
    u[28] = in[14];
    u[4] = in[8];
    u[8] = in[4];
    u[12] = in[12];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[34] = u[35];
    u[37] = u[36];
    u[38] = u[39];
    u[41] = u[40];
    u[42] = u[43];
    u[45] = u[44];
    u[46] = u[47];
    u[49] = u[48];
    u[50] = u[51];
    u[53] = u[52];
    u[54] = u[55];
    u[57] = u[56];
    u[58] = u[59];
    u[61] = u[60];
    u[62] = u[63];

    // stage 4
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

    u[17] = u[16];
    u[18] = u[19];
    u[21] = u[20];
    u[22] = u[23];
    u[25] = u[24];
    u[26] = u[27];
    u[29] = u[28];
    u[30] = u[31];

    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = tmp1;
    u[34] = tmp2;
    u[37] = tmp3;
    u[38] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = tmp1;
    u[42] = tmp2;
    u[45] = tmp3;
    u[46] = tmp4;

    // stage 5
    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = tmp1;
    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[5] = tmp1;
    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
  }
}

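// Full 64-point inverse DCT over all 64 input coefficients.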
idct64x64_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)3974 static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3975                              int bd, int out_shift) {
3976   int i, j;
3977   const int32_t *cospi = cospi_arr(bit);
3978   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64], v[64];

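    // Stage 1 scatters the nonzero inputs to their butterfly slots: the
    // odd coefficients land in u[32..62] in bit-reversed order, those
    // congruent to 2 mod 4 in v[16..30], and so on down to DC in u[0].
    // Only even slots are written; the odd slots are filled by the
    // stage 2/3 rotations.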
    // stage 1
    u[32] = in[1];
    u[34] = in[17];
    u[36] = in[9];
    u[38] = in[25];
    u[40] = in[5];
    u[42] = in[21];
    u[44] = in[13];
    u[46] = in[29];
    u[48] = in[3];
    u[50] = in[19];
    u[52] = in[11];
    u[54] = in[27];
    u[56] = in[7];
    u[58] = in[23];
    u[60] = in[15];
    u[62] = in[31];

    v[16] = in[2];
    v[18] = in[18];
    v[20] = in[10];
    v[22] = in[26];
    v[24] = in[6];
    v[26] = in[22];
    v[28] = in[14];
    v[30] = in[30];

    u[8] = in[4];
    u[10] = in[20];
    u[12] = in[12];
    u[14] = in[28];

    v[4] = in[8];
    v[6] = in[24];

    u[0] = in[0];
    u[2] = in[16];

    // stage 2
    v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
    v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
    v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
    v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
    v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
    v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
    v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
    v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
    v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
    v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
    v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
    v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
    v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
    v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
    v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
    v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
    v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
    v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);

    // stage 3
    u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
    u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
    u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
    u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
    u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
    u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
    u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
    u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
    u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
    u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);

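    // Within each group of four, form (a + b, a - b) pairs from the
    // stage 2 outputs; addsub_sse4_1 clamps both results to
    // [clamp_lo, clamp_hi].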
    for (i = 32; i < 64; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    // stage 4
    v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
    v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
    v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
    v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
    v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
    v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);

    for (i = 16; i < 32; i += 4) {
      addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);

    // stage 5
    u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
    u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
    u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
    u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);

    for (i = 8; i < 16; i += 4) {
      addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 16; i < 32; i += 4) {
      u[i + 0] = v[i + 0];
      u[i + 3] = v[i + 3];
    }

    u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
    u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
    v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);

    addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);

    for (i = 8; i < 16; i += 4) {
      v[i + 0] = u[i + 0];
      v[i + 3] = u[i + 3];
    }

    v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 64; i += 8) {
      v[i + 0] = u[i + 0];
      v[i + 1] = u[i + 1];
      v[i + 6] = u[i + 6];
      v[i + 7] = u[i + 7];
    }

    v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);

    // stage 7
    addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

    u[4] = v[4];
    u[7] = v[7];
    u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);

    addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    for (i = 16; i < 32; i += 8) {
      u[i + 0] = v[i + 0];
      u[i + 1] = v[i + 1];
      u[i + 6] = v[i + 6];
      u[i + 7] = v[i + 7];
    }

    u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
    u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
    u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
    }

    v[8] = u[8];
    v[9] = u[9];
    v[14] = u[14];
    v[15] = u[15];

    v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
    v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
    v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);

    for (i = 16; i < 20; ++i) {
      addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
      addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
                    &clamp_hi);
    }

    for (i = 32; i < 36; ++i) {
      v[i] = u[i];
      v[i + 12] = u[i + 12];
      v[i + 16] = u[i + 16];
      v[i + 28] = u[i + 28];
    }

    v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
    v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
    v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
    v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
    v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
    v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
    v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
    v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
    v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);

    // stage 9
    for (i = 0; i < 8; ++i) {
      addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 16; i < 20; ++i) {
      u[i] = v[i];
      u[i + 12] = v[i + 12];
    }

    u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
    u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);

    for (i = 32; i < 40; i++) {
      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    }

    for (i = 48; i < 56; i++) {
      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    }

    // stage 10
    for (i = 0; i < 16; i++) {
      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 32; i < 40; i++) v[i] = u[i];

    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

    // stage 11
    for (i = 0; i < 32; i++) {
      addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
                    &clamp_hi);
    }

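    // Row-pass epilogue. Per coefficient this is (a scalar sketch,
    // assuming out_shift > 0):
    //   r = (x + (1 << (out_shift - 1))) >> out_shift;
    //   r = clamp(r, -(1 << 15), (1 << 15) - 1);  // bd == 8 case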
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
      for (i = 0; i < 64; i += 4) {
        round_shift_4x4(out + i, out_shift);
        highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
                                  &clamp_hi_out, 4);
      }
    }
  }
}

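/*
 * DC-only fast path for the 32-point inverse DCT, selected when eob says
 * in[0] is the only nonzero coefficient. The whole flow graph collapses
 * to the single cospi[32] scaling of stage 5, and every output column
 * repeats that value.
 */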
static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1;

  // stage 0
  // stage 1
  bf1 = in[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  if (do_cols) {
    bf1 = _mm_max_epi32(bf1, clamp_lo);
    bf1 = _mm_min_epi32(bf1, clamp_hi);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    if (out_shift != 0) {
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      bf1 = _mm_add_epi32(bf1, offset);
      bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
    }
  }

  bf1 = _mm_max_epi32(bf1, clamp_lo);
  bf1 = _mm_min_epi32(bf1, clamp_hi);
  for (int i = 0; i < 32; ++i) out[i] = bf1;
}

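/*
 * Pruned 32-point inverse DCT assuming only the first 8 scan positions
 * carry nonzero coefficients, so stages 1-4 seed just those lanes; the
 * remaining stages are shared with the full kernel through the
 * idct32_stageN_sse4_1 helpers.
 */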
static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[4] = in[4];
  bf1[8] = in[2];
  bf1[12] = in[6];
  bf1[16] = in[1];
  bf1[20] = in[5];
  bf1[24] = in[3];
  bf1[28] = in[7];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);

  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
  bf1[17] = bf1[16];
  bf1[18] = bf1[19];
  bf1[21] = bf1[20];
  bf1[22] = bf1[23];
  bf1[25] = bf1[24];
  bf1[26] = bf1[27];
  bf1[29] = bf1[28];
  bf1[30] = bf1[31];

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);

  bf1[9] = bf1[8];
  bf1[10] = bf1[11];
  bf1[13] = bf1[12];
  bf1[14] = bf1[15];

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[5] = bf1[4];
  bf1[6] = bf1[7];

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  bf1[3] = bf1[0];
  bf1[2] = bf1[1];

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}

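/*
 * Pruned 32-point inverse DCT for at most 16 leading nonzero scan
 * positions; stage 1 seeds only the even lanes, after which the flow
 * matches the full kernel.
 */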
static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1

  bf1[0] = in[0];
  bf1[2] = in[8];
  bf1[4] = in[4];
  bf1[6] = in[12];
  bf1[8] = in[2];
  bf1[10] = in[10];
  bf1[12] = in[6];
  bf1[14] = in[14];
  bf1[16] = in[1];
  bf1[18] = in[9];
  bf1[20] = in[5];
  bf1[22] = in[13];
  bf1[24] = in[3];
  bf1[26] = in[11];
  bf1[28] = in[7];
  bf1[30] = in[15];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);

  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);

  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);
  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
}

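/*
 * Full 32-point inverse DCT with all inputs live. The nine butterfly
 * stages ping-pong between bf1[] and bf0[] so each stage reads one buffer
 * and writes the other; stage 9 writes directly to out[].
 */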
static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32], bf0[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[1] = in[16];
  bf1[2] = in[8];
  bf1[3] = in[24];
  bf1[4] = in[4];
  bf1[5] = in[20];
  bf1[6] = in[12];
  bf1[7] = in[28];
  bf1[8] = in[2];
  bf1[9] = in[18];
  bf1[10] = in[10];
  bf1[11] = in[26];
  bf1[12] = in[6];
  bf1[13] = in[22];
  bf1[14] = in[14];
  bf1[15] = in[30];
  bf1[16] = in[1];
  bf1[17] = in[17];
  bf1[18] = in[9];
  bf1[19] = in[25];
  bf1[20] = in[5];
  bf1[21] = in[21];
  bf1[22] = in[13];
  bf1[23] = in[29];
  bf1[24] = in[3];
  bf1[25] = in[19];
  bf1[26] = in[11];
  bf1[27] = in[27];
  bf1[28] = in[7];
  bf1[29] = in[23];
  bf1[30] = in[15];
  bf1[31] = in[31];

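  // Each stage-2 pair (k, 31 - k) is a planar rotation; e.g. lanes 16/31:
  //   bf0[16] = (cospi[62] * bf1[16] - cospi[2] * bf1[31] + rnd) >> bit
  //   bf0[31] = (cospi[2] * bf1[16] + cospi[62] * bf1[31] + rnd) >> bit
  // which is what half_btf_sse4_1 evaluates with the broadcast constants.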
4878   // stage 2
4879   bf0[0] = bf1[0];
4880   bf0[1] = bf1[1];
4881   bf0[2] = bf1[2];
4882   bf0[3] = bf1[3];
4883   bf0[4] = bf1[4];
4884   bf0[5] = bf1[5];
4885   bf0[6] = bf1[6];
4886   bf0[7] = bf1[7];
4887   bf0[8] = bf1[8];
4888   bf0[9] = bf1[9];
4889   bf0[10] = bf1[10];
4890   bf0[11] = bf1[11];
4891   bf0[12] = bf1[12];
4892   bf0[13] = bf1[13];
4893   bf0[14] = bf1[14];
4894   bf0[15] = bf1[15];
4895   bf0[16] =
4896       half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
4897   bf0[17] =
4898       half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
4899   bf0[18] =
4900       half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
4901   bf0[19] =
4902       half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
4903   bf0[20] =
4904       half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
4905   bf0[21] =
4906       half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
4907   bf0[22] =
4908       half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
4909   bf0[23] =
4910       half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
4911   bf0[24] =
4912       half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
4913   bf0[25] =
4914       half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
4915   bf0[26] =
4916       half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
4917   bf0[27] =
4918       half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
4919   bf0[28] =
4920       half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
4921   bf0[29] =
4922       half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
4923   bf0[30] =
4924       half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
4925   bf0[31] =
4926       half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
4927 
4928   // stage 3
4929   bf1[0] = bf0[0];
4930   bf1[1] = bf0[1];
4931   bf1[2] = bf0[2];
4932   bf1[3] = bf0[3];
4933   bf1[4] = bf0[4];
4934   bf1[5] = bf0[5];
4935   bf1[6] = bf0[6];
4936   bf1[7] = bf0[7];
4937   bf1[8] =
4938       half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
4939   bf1[9] =
4940       half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
4941   bf1[10] =
4942       half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
4943   bf1[11] =
4944       half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
4945   bf1[12] =
4946       half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
4947   bf1[13] =
4948       half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
4949   bf1[14] =
4950       half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
4951   bf1[15] =
4952       half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
4953 
4954   addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4955   addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4956   addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4957   addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4958   addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4959   addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4960   addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4961   addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4962 
4963   // stage 4
4964   bf0[0] = bf1[0];
4965   bf0[1] = bf1[1];
4966   bf0[2] = bf1[2];
4967   bf0[3] = bf1[3];
4968   bf0[4] =
4969       half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
4970   bf0[5] =
4971       half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
4972   bf0[6] =
4973       half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
4974   bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
4975 
4976   addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
4977   addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
4978   addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
4979   addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
4980 
4981   bf0[16] = bf1[16];
4982   bf0[17] =
4983       half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
4984   bf0[18] =
4985       half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
4986   bf0[19] = bf1[19];
4987   bf0[20] = bf1[20];
4988   bf0[21] =
4989       half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
4990   bf0[22] =
4991       half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
4992   bf0[23] = bf1[23];
4993   bf0[24] = bf1[24];
4994   bf0[25] =
4995       half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
4996   bf0[26] =
4997       half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
4998   bf0[27] = bf1[27];
4999   bf0[28] = bf1[28];
5000   bf0[29] =
5001       half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
5002   bf0[30] =
5003       half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
5004   bf0[31] = bf1[31];
5005 
5006   // stage 5
5007   bf1[0] =
5008       half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
5009   bf1[1] =
5010       half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
5011   bf1[2] =
5012       half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
5013   bf1[3] =
5014       half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
5015   addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
5016   addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
5017   bf1[8] = bf0[8];
5018   bf1[9] =
5019       half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
5020   bf1[10] =
5021       half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
5022   bf1[11] = bf0[11];
5023   bf1[12] = bf0[12];
5024   bf1[13] =
5025       half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
5026   bf1[14] =
5027       half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
5028   bf1[15] = bf0[15];
5029   addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
5030   addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
5031   addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
5032   addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
5033   addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
5034   addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
5035   addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
5036   addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
5037 
5038   // stage 6
5039   addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
5040   addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
5041   bf0[4] = bf1[4];
5042   bf0[5] =
5043       half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5044   bf0[6] =
5045       half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5046   bf0[7] = bf1[7];
5047   addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
5048   addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
5049   addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
5050   addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
5051   bf0[16] = bf1[16];
5052   bf0[17] = bf1[17];
5053   bf0[18] =
5054       half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
5055   bf0[19] =
5056       half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
5057   bf0[20] =
5058       half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
5059   bf0[21] =
5060       half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
5061   bf0[22] = bf1[22];
5062   bf0[23] = bf1[23];
5063   bf0[24] = bf1[24];
5064   bf0[25] = bf1[25];
5065   bf0[26] =
5066       half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
5067   bf0[27] =
5068       half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
5069   bf0[28] =
5070       half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
5071   bf0[29] =
5072       half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
5073   bf0[30] = bf1[30];
5074   bf0[31] = bf1[31];
5075 
5076   // stage 7
5077   addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
5078   addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
5079   addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
5080   addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
5081   bf1[8] = bf0[8];
5082   bf1[9] = bf0[9];
5083   bf1[10] =
5084       half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5085   bf1[11] =
5086       half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5087   bf1[12] =
5088       half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5089   bf1[13] =
5090       half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5091   bf1[14] = bf0[14];
5092   bf1[15] = bf0[15];
5093   addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
5094   addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
5095   addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
5096   addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
5097   addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
5098   addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
5099   addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
5100   addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
5101 
5102   // stage 8
5103   addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
5104   addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
5105   addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
5106   addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
5107   addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
5108   addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
5109   addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
5110   addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
5111   bf0[16] = bf1[16];
5112   bf0[17] = bf1[17];
5113   bf0[18] = bf1[18];
5114   bf0[19] = bf1[19];
5115   bf0[20] =
5116       half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5117   bf0[21] =
5118       half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5119   bf0[22] =
5120       half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5121   bf0[23] =
5122       half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5123   bf0[24] =
5124       half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5125   bf0[25] =
5126       half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5127   bf0[26] =
5128       half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5129   bf0[27] =
5130       half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5131   bf0[28] = bf1[28];
5132   bf0[29] = bf1[29];
5133   bf0[30] = bf1[30];
5134   bf0[31] = bf1[31];
5135 
5136   // stage 9
5137   addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
5138   addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
5139   addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
5140   addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
5141   addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
5142   addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
5143   addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
5144   addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
5145   addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
5146   addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
5147   addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
5148   addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
5149   addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
5150   addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
5151   addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
5152   addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
5153 
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    round_shift_8x8(out + 16, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}

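// 8x8 high-bitdepth inverse transform and reconstruction. Identity-bearing
// tx types are routed through the universe dispatcher; the remaining types
// use the dedicated 8x8 2-D kernel.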
void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    case IDTX:
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
    default:
      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}
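
// 4x4 high-bitdepth inverse transform and reconstruction. Lossless blocks
// use the inverse Walsh-Hadamard transform; all other blocks use the 4x4
// 2-D kernel.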
void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  int eob = txfm_param->eob;
  int bd = txfm_param->bd;
  int lossless = txfm_param->lossless;
  const int32_t *src = cast_to_int32(input);
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (lossless) {
    assert(tx_type == DCT_DCT);
    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
    return;
  }
  av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
}
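
// 32-point 1-D identity "transform": every coefficient is scaled by 4 (the
// identity scale for N = 32), implemented as a left shift by 2. The loop
// body is unrolled 16x, so two iterations cover all 32 lanes.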
static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  (void)bit;
  for (int i = 0; i < 32; i += 16) {
    out[i] = _mm_slli_epi32(in[i], 2);
    out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
    out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
    out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
    out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
    out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
    out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
    out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
    out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
    out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
    out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
    out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
    out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
    out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
    out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
    out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    round_shift_8x8(out + 16, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}
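
// 1-D kernel table indexed by [tx size][1-D tx type][eob bucket]. The eob
// bucket selects a reduced-input variant (at most 1, 8, or 16 nonzero
// inputs) or the full kernel; NULL marks combinations that are never
// dispatched (e.g. AV1 has no 32- or 64-point ADST).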
static const transform_1d_sse4_1
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4x4_sse4_1, NULL, NULL, NULL },
          { iadst4x4_sse4_1, NULL, NULL, NULL },
          { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
      },
      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
        { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
      {
          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
            NULL },
          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
            NULL },
          { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
      },
      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
          idct32x32_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { iidentity32_sse4_1, NULL, NULL, NULL } },
      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
          idct64x64_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
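
// V_DCT/V_ADST/V_FLIPADST: the horizontal (row) 1-D transform is the
// identity, so only the column kernel is chosen by the eob bucket.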
static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div4 = input_stride >> 2;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
    __m128i buf0[16];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < buf_size_w_div4; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
             -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;

    for (int j = 0; j < buf_size_w_div4; ++j) {
      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
    }
  }
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                   stride, ud_flip, txfm_size_row, bd);
  }
}
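
// H_DCT/H_ADST/H_FLIPADST: the vertical (column) 1-D transform is the
// identity, so only the row kernel is chosen by the eob bucket.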
static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div8 = input_stride >> 2;
  const int row_max = AOMMIN(32, txfm_size_row);
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[16];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(
          buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
    }
    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
             -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}
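
// IDTX: both 1-D transforms are the identity, so eob is ignored and every
// loaded coefficient is simply scaled, shifted, and written back.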
static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[64 * 4];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int row_max = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[32];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < (input_stride >> 2); ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
             -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    for (int j = 0; j < (input_stride >> 2); ++j) {
      _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
      _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
      _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
      _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
    }
  }
  for (int i = 0; i < (input_stride >> 2); i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, 0, txfm_size_row,
                                     bd);
    }
  }
}
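
// General path for the nine DCT/ADST/FLIPADST combinations. The eob
// position selects reduced-input row and column kernels so that all-zero
// regions of the coefficient block are never transformed.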
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(
          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
    }
    row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
             -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

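// 4x8: a rectangular (1:2) transform, so inputs are pre-scaled by 1/sqrt(2)
// (NewInvSqrt2) before the row transform.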
static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
  const int input_stride = AOMMIN(32, txfm_size_col);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  __m128i *buf0_cur = buf0;
  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
  row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
           -shift[0]);

  if (lr_flip) {
    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  } else {
    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

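// 8x4: the transposed counterpart of 4x8, with the same 1/sqrt(2)
// rectangular pre-scaling.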
static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2],
                buf1[3]);
  TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6],
                buf1[7]);

  av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < 2; i++) {
    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
  // write to buffer
  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
                                 txfm_size_row, bd);
}

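// 4x16: a 1:4 aspect ratio needs no sqrt(2) rectangular scaling; the row
// transform is applied once per group of four rows.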
static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_h_div8 = txfm_size_row >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
  const int input_stride = AOMMIN(32, txfm_size_col);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  __m128i *buf0_cur = buf0;
  load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
  for (int i = 0; i < (txfm_size_row >> 2); i++) {
    row_txfm(buf0 + (i << 2), buf0 + (i << 2),
             av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
  }

  if (lr_flip) {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
                    buf1[4 * j + 3]);
    }
  } else {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
                    buf1[4 * j + 2], buf1[4 * j + 3]);
    }
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

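// 16x4: the transposed counterpart of 4x16; again no rectangular scaling.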
static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  for (int j = 0; j < buf_size_w_div8; j++) {
    TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j],
                  buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]);
  }
  row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row,
             av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
                                   output + 8 * i, stride, ud_flip,
                                   txfm_size_row, bd);
  }
}

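// Dispatch on tx_type: DCT/ADST combinations take the general path, mixed
// identity types take the matching half-identity path, and IDTX takes the
// all-identity path.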
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      highbd_inv_txfm2d_add_h_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      highbd_inv_txfm2d_add_v_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case IDTX:
      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
                                        stride, tx_type, tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}

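// Thin wrappers that unpack TxfmParam and forward to the size-specific 2-D
// implementations above.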
void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                         int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

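// Top-level SSE4.1 entry point: dispatch on tx_size, with the sizes that
// have dedicated kernels handled explicitly and all remaining sizes routed
// to the universe implementation. A minimal usage sketch (hypothetical
// caller; `coeffs`, `rec16`, `stride`, and `eob` are assumed to exist, and
// for high bitdepth `dest` must be a CONVERT_TO_BYTEPTR'd 16-bit buffer):
//   TxfmParam param = { 0 };
//   param.tx_type = DCT_DCT;
//   param.tx_size = TX_16X16;
//   param.eob = eob;
//   param.bd = 10;
//   av1_highbd_inv_txfm_add_sse4_1(coeffs, CONVERT_TO_BYTEPTR(rec16), stride,
//                                  &param);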
void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_8X8:
      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X4:
      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X4:
      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X16:
      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(
          input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
          txfm_param->bd);
      break;
  }
}