/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

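// Clamp each signed 16-bit lane of u to the valid pixel range
// [0, (1 << bd) - 1] for bit depth bd.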
static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}

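// Add one row of 32-bit residuals (res0 = low four lanes, res1 = high four
// lanes) to eight 16-bit prediction samples, then clamp to the pixel range.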
static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));

  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_packus_epi32(x0, x1);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

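// Reconstruct an 8-wide block of `height` rows: for each row, add the
// residuals in `in` to the prediction already stored in `output`, walking
// the residual rows bottom-up when flipud is set.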
static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

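// Load out_size groups of four 32-bit coefficients, `stride` elements apart.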
static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

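// Butterfly: *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1).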
static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
                                   __m128i *out0, __m128i *out1) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  *out0 = a0;
  *out1 = a1;
}

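// Butterfly with a rounding right-shift: the rounding offset is added to in0
// once so that it reaches both the sum and the difference before the
// arithmetic shift and the final clamp.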
static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
                                __m128i *out0, __m128i *out1,
                                const __m128i *clamp_lo,
                                const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(in0, offset);
  __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
  __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

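// Stages 4 through 9 of the 32-point IDCT are factored into the helpers
// below; each stage applies half-butterfly rotations (half_btf_sse4_1)
// and/or clamped add/sub butterflies to the 32-entry bf1 array.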
static INLINE void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}

static INLINE void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

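// Final butterfly stage: folds bf1[k] with bf1[31 - k]. The column pass
// (do_cols != 0) adds without clamping; the row pass also applies the
// rounding shift and the narrower output clamp.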
static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const int log_range) {
  if (do_cols) {
    addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
    addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
    addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
    addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
    addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
    addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
    addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
    addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
    addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
    addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
    addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
    addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
    addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
    addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
    addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
    addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
  }
}

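// Rounding shift with negation: *out0 = (offset + in0) >> shift and
// *out1 = (offset - in1) >> shift, both clamped. Used for IADST outputs
// that carry a sign flip.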
static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}

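// 4x4 inverse DCT. The unpacklo/unpackhi sequence transposes the block
// in-register first, so the butterflies operate on columns.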
static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));

  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  u0 = _mm_unpacklo_epi64(v0, v2);
  u1 = _mm_unpackhi_epi64(v0, v2);
  u2 = _mm_unpacklo_epi64(v1, v3);
  u3 = _mm_unpackhi_epi64(v1, v3);

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  if (do_cols) {
    addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
    addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
  } else {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
  }
}

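// 4x4 inverse ADST, built from the sinpi constants returned by sinpi_arr().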
static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  v0 = _mm_unpacklo_epi32(in[0], in[1]);
  v1 = _mm_unpackhi_epi32(in[0], in[1]);
  v2 = _mm_unpacklo_epi32(in[2], in[3]);
  v3 = _mm_unpackhi_epi32(in[2], in[3]);

  x0 = _mm_unpacklo_epi64(v0, v2);
  x1 = _mm_unpackhi_epi64(v0, v2);
  x2 = _mm_unpacklo_epi64(v1, v3);
  x3 = _mm_unpackhi_epi64(v1, v3);

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  u0 = _mm_add_epi32(u0, rnding);
  u0 = _mm_srai_epi32(u0, bit);

  u1 = _mm_add_epi32(u1, rnding);
  u1 = _mm_srai_epi32(u1, bit);

  u2 = _mm_add_epi32(u2, rnding);
  u2 = _mm_srai_epi32(u2, bit);

  u3 = _mm_add_epi32(u3, rnding);
  u3 = _mm_srai_epi32(u3, bit);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    u0 = _mm_max_epi32(u0, clamp_lo);
    u0 = _mm_min_epi32(u0, clamp_hi);
    u1 = _mm_max_epi32(u1, clamp_lo);
    u1 = _mm_min_epi32(u1, clamp_hi);
    u2 = _mm_max_epi32(u2, clamp_lo);
    u2 = _mm_min_epi32(u2, clamp_hi);
    u3 = _mm_max_epi32(u3, clamp_lo);
    u3 = _mm_min_epi32(u3, clamp_hi);
  }

  in[0] = u0;
  in[1] = u1;
  in[2] = u2;
  in[3] = u3;
}

static INLINE void round_shift_4x4(__m128i *in, int shift) {
  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));

  in[0] = _mm_add_epi32(in[0], rnding);
  in[1] = _mm_add_epi32(in[1], rnding);
  in[2] = _mm_add_epi32(in[2], rnding);
  in[3] = _mm_add_epi32(in[3], rnding);

  in[0] = _mm_srai_epi32(in[0], shift);
  in[1] = _mm_srai_epi32(in[1], shift);
  in[2] = _mm_srai_epi32(in[2], shift);
  in[3] = _mm_srai_epi32(in[3], shift);
}

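// Round-shift the 4x4 residual, apply any horizontal/vertical flip, add it
// to the prediction in `output`, clamp to bit depth, and store.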
static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}

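// 2D 4x4 inverse transform plus reconstruction. The row transform runs
// first (do_cols = 0), then the column transform (do_cols = 1); FLIPADST
// variants are handled by the fliplr/flipud flags at write-out.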
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(coeff, in);
      idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(coeff, in);
      iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
      iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

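// 8x8 inverse DCT. Each 8-coefficient row spans two __m128i vectors (see
// the layout note below), so the loop makes two 4-wide column passes.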
static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  // Even columns: 0, 2, ..., 14
  // Odd columns: 1, 3, ..., 15
  // One even column plus one odd column make up one row (8 coefficients);
  // in total there are 8 rows (8x8).
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    if (do_cols) {
      addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
      addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
      addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
      addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
      addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
      addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

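// 8x8 inverse ADST. The 16 input vectors are processed as two interleaved
// 8-point transforms: even vector indices first, then odd.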
static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

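// Add one 8-sample row of 32-bit residuals (res_lo/res_hi) to the 16-bit
// prediction, optionally flipping the row horizontally, and clamp to bd.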
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
                             int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);

  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

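// Round-shift the 8x8 residual and reconstruct it into `output`, honoring
// the fliplr/flipud flags.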
static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

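// 2D 8x8 inverse transform plus reconstruction: transpose, row transform,
// transpose, column transform, then write-out with the appropriate flips.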
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                     -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(coeff, in);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
                      -shift[0]);
      transpose_8x8(in, out);
      iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
      write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

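// Fast path for an 8-point IDCT when only the DC coefficient is nonzero:
// every output equals in[0] scaled by cospi[32] and rounded (shifted and
// clamped as well in the row pass).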
static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm_mullo_epi32(in[0], cospi32);
  x = _mm_add_epi32(x, rnding);
  x = _mm_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    x = _mm_add_epi32(x, offset);
    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
    x = _mm_max_epi32(x, clamp_lo_out);
    x = _mm_min_epi32(x, clamp_hi_out);
  }

  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}

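// Full 8-point IDCT over a single group of 8 vectors (one 4-wide column
// slice), used by the rectangular and low-coefficient paths.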
static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm_mullo_epi32(in[1], cospi56);
  y = _mm_mullo_epi32(in[7], cospim8);
  u4 = _mm_add_epi32(x, y);
  u4 = _mm_add_epi32(u4, rnding);
  u4 = _mm_srai_epi32(u4, bit);

  x = _mm_mullo_epi32(in[1], cospi8);
  y = _mm_mullo_epi32(in[7], cospi56);
  u7 = _mm_add_epi32(x, y);
  u7 = _mm_add_epi32(u7, rnding);
  u7 = _mm_srai_epi32(u7, bit);

  x = _mm_mullo_epi32(in[5], cospi24);
  y = _mm_mullo_epi32(in[3], cospim40);
  u5 = _mm_add_epi32(x, y);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  x = _mm_mullo_epi32(in[5], cospi40);
  y = _mm_mullo_epi32(in[3], cospi24);
  u6 = _mm_add_epi32(x, y);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  // stage 3
  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u1, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u2, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u2, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm_mullo_epi32(v5, cospi32);
  y = _mm_mullo_epi32(v6, cospi32);
  u6 = _mm_add_epi32(y, x);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  u5 = _mm_sub_epi32(y, x);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  // stage 5
  if (do_cols) {
    addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
    addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
    addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
    addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
    addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
    addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
    addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
    addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
                        out_shift);
  }
}

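// IADST fast path when only in[0] is nonzero; the later stages collapse to
// a handful of rotations of the two stage-2 outputs.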
iadst8x8_low1_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)1421 static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1422 int do_cols, int bd, int out_shift) {
1423 const int32_t *cospi = cospi_arr(bit);
1424 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1425 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1426 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1427 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1428 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1429 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1430 const __m128i kZero = _mm_setzero_si128();
1431 __m128i u[8], x;
1432
1433 // stage 0
1434 // stage 1
1435 // stage 2
1436
1437 x = _mm_mullo_epi32(in[0], cospi60);
1438 u[0] = _mm_add_epi32(x, rnding);
1439 u[0] = _mm_srai_epi32(u[0], bit);
1440
1441 x = _mm_mullo_epi32(in[0], cospi4);
1442 u[1] = _mm_sub_epi32(kZero, x);
1443 u[1] = _mm_add_epi32(u[1], rnding);
1444 u[1] = _mm_srai_epi32(u[1], bit);
1445
1446 // stage 3
1447 // stage 4
1448 __m128i temp1, temp2;
1449 temp1 = _mm_mullo_epi32(u[0], cospi16);
1450 x = _mm_mullo_epi32(u[1], cospi48);
1451 temp1 = _mm_add_epi32(temp1, x);
1452 temp1 = _mm_add_epi32(temp1, rnding);
1453 temp1 = _mm_srai_epi32(temp1, bit);
1454 u[4] = temp1;
1455
1456 temp2 = _mm_mullo_epi32(u[0], cospi48);
1457 x = _mm_mullo_epi32(u[1], cospi16);
1458 u[5] = _mm_sub_epi32(temp2, x);
1459 u[5] = _mm_add_epi32(u[5], rnding);
1460 u[5] = _mm_srai_epi32(u[5], bit);
1461
1462 // stage 5
1463 // stage 6
1464 temp1 = _mm_mullo_epi32(u[0], cospi32);
1465 x = _mm_mullo_epi32(u[1], cospi32);
1466 u[2] = _mm_add_epi32(temp1, x);
1467 u[2] = _mm_add_epi32(u[2], rnding);
1468 u[2] = _mm_srai_epi32(u[2], bit);
1469
1470 u[3] = _mm_sub_epi32(temp1, x);
1471 u[3] = _mm_add_epi32(u[3], rnding);
1472 u[3] = _mm_srai_epi32(u[3], bit);
1473
1474 temp1 = _mm_mullo_epi32(u[4], cospi32);
1475 x = _mm_mullo_epi32(u[5], cospi32);
1476 u[6] = _mm_add_epi32(temp1, x);
1477 u[6] = _mm_add_epi32(u[6], rnding);
1478 u[6] = _mm_srai_epi32(u[6], bit);
1479
1480 u[7] = _mm_sub_epi32(temp1, x);
1481 u[7] = _mm_add_epi32(u[7], rnding);
1482 u[7] = _mm_srai_epi32(u[7], bit);
1483
1484 // stage 7
1485 if (do_cols) {
1486 out[0] = u[0];
1487 out[1] = _mm_sub_epi32(kZero, u[4]);
1488 out[2] = u[6];
1489 out[3] = _mm_sub_epi32(kZero, u[2]);
1490 out[4] = u[3];
1491 out[5] = _mm_sub_epi32(kZero, u[7]);
1492 out[6] = u[5];
1493 out[7] = _mm_sub_epi32(kZero, u[1]);
1494 } else {
1495 const int log_range_out = AOMMAX(16, bd + 6);
1496 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1497 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1498
1499 neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1500 out_shift);
1501 neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1502 out_shift);
1503 neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1504 out_shift);
1505 neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1506 out_shift);
1507 }
1508 }
1509
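// Full 8-point inverse ADST over four 32-bit lanes (an 8x4 strip of
// coefficients).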
1510 static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1511 int bd, int out_shift) {
1512 const int32_t *cospi = cospi_arr(bit);
1513 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1514 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1515 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1516 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1517 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1518 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1519 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1520 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1521 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1522 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1523 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1524 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1525 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1526 const __m128i kZero = _mm_setzero_si128();
1527 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1528 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1529 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1530 __m128i u[8], v[8], x;
1531
1532 // stage 0
1533 // stage 1
1534 // stage 2
1535
1536 u[0] = _mm_mullo_epi32(in[7], cospi4);
1537 x = _mm_mullo_epi32(in[0], cospi60);
1538 u[0] = _mm_add_epi32(u[0], x);
1539 u[0] = _mm_add_epi32(u[0], rnding);
1540 u[0] = _mm_srai_epi32(u[0], bit);
1541
1542 u[1] = _mm_mullo_epi32(in[7], cospi60);
1543 x = _mm_mullo_epi32(in[0], cospi4);
1544 u[1] = _mm_sub_epi32(u[1], x);
1545 u[1] = _mm_add_epi32(u[1], rnding);
1546 u[1] = _mm_srai_epi32(u[1], bit);
1547
1548 // (2)
1549 u[2] = _mm_mullo_epi32(in[5], cospi20);
1550 x = _mm_mullo_epi32(in[2], cospi44);
1551 u[2] = _mm_add_epi32(u[2], x);
1552 u[2] = _mm_add_epi32(u[2], rnding);
1553 u[2] = _mm_srai_epi32(u[2], bit);
1554
1555 u[3] = _mm_mullo_epi32(in[5], cospi44);
1556 x = _mm_mullo_epi32(in[2], cospi20);
1557 u[3] = _mm_sub_epi32(u[3], x);
1558 u[3] = _mm_add_epi32(u[3], rnding);
1559 u[3] = _mm_srai_epi32(u[3], bit);
1560
1561 // (3)
1562 u[4] = _mm_mullo_epi32(in[3], cospi36);
1563 x = _mm_mullo_epi32(in[4], cospi28);
1564 u[4] = _mm_add_epi32(u[4], x);
1565 u[4] = _mm_add_epi32(u[4], rnding);
1566 u[4] = _mm_srai_epi32(u[4], bit);
1567
1568 u[5] = _mm_mullo_epi32(in[3], cospi28);
1569 x = _mm_mullo_epi32(in[4], cospi36);
1570 u[5] = _mm_sub_epi32(u[5], x);
1571 u[5] = _mm_add_epi32(u[5], rnding);
1572 u[5] = _mm_srai_epi32(u[5], bit);
1573
1574 // (4)
1575 u[6] = _mm_mullo_epi32(in[1], cospi52);
1576 x = _mm_mullo_epi32(in[6], cospi12);
1577 u[6] = _mm_add_epi32(u[6], x);
1578 u[6] = _mm_add_epi32(u[6], rnding);
1579 u[6] = _mm_srai_epi32(u[6], bit);
1580
1581 u[7] = _mm_mullo_epi32(in[1], cospi12);
1582 x = _mm_mullo_epi32(in[6], cospi52);
1583 u[7] = _mm_sub_epi32(u[7], x);
1584 u[7] = _mm_add_epi32(u[7], rnding);
1585 u[7] = _mm_srai_epi32(u[7], bit);
1586
1587 // stage 3
1588 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1589 addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1590 addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1591 addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1592
1593 // stage 4
1594 u[0] = v[0];
1595 u[1] = v[1];
1596 u[2] = v[2];
1597 u[3] = v[3];
1598
1599 u[4] = _mm_mullo_epi32(v[4], cospi16);
1600 x = _mm_mullo_epi32(v[5], cospi48);
1601 u[4] = _mm_add_epi32(u[4], x);
1602 u[4] = _mm_add_epi32(u[4], rnding);
1603 u[4] = _mm_srai_epi32(u[4], bit);
1604
1605 u[5] = _mm_mullo_epi32(v[4], cospi48);
1606 x = _mm_mullo_epi32(v[5], cospi16);
1607 u[5] = _mm_sub_epi32(u[5], x);
1608 u[5] = _mm_add_epi32(u[5], rnding);
1609 u[5] = _mm_srai_epi32(u[5], bit);
1610
1611 u[6] = _mm_mullo_epi32(v[6], cospim48);
1612 x = _mm_mullo_epi32(v[7], cospi16);
1613 u[6] = _mm_add_epi32(u[6], x);
1614 u[6] = _mm_add_epi32(u[6], rnding);
1615 u[6] = _mm_srai_epi32(u[6], bit);
1616
1617 u[7] = _mm_mullo_epi32(v[6], cospi16);
1618 x = _mm_mullo_epi32(v[7], cospim48);
1619 u[7] = _mm_sub_epi32(u[7], x);
1620 u[7] = _mm_add_epi32(u[7], rnding);
1621 u[7] = _mm_srai_epi32(u[7], bit);
1622
1623 // stage 5
1624 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1625 addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1626 addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1627 addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1628
1629 // stage 6
1630 u[0] = v[0];
1631 u[1] = v[1];
1632 u[4] = v[4];
1633 u[5] = v[5];
1634
1635 v[0] = _mm_mullo_epi32(v[2], cospi32);
1636 x = _mm_mullo_epi32(v[3], cospi32);
1637 u[2] = _mm_add_epi32(v[0], x);
1638 u[2] = _mm_add_epi32(u[2], rnding);
1639 u[2] = _mm_srai_epi32(u[2], bit);
1640
1641 u[3] = _mm_sub_epi32(v[0], x);
1642 u[3] = _mm_add_epi32(u[3], rnding);
1643 u[3] = _mm_srai_epi32(u[3], bit);
1644
1645 v[0] = _mm_mullo_epi32(v[6], cospi32);
1646 x = _mm_mullo_epi32(v[7], cospi32);
1647 u[6] = _mm_add_epi32(v[0], x);
1648 u[6] = _mm_add_epi32(u[6], rnding);
1649 u[6] = _mm_srai_epi32(u[6], bit);
1650
1651 u[7] = _mm_sub_epi32(v[0], x);
1652 u[7] = _mm_add_epi32(u[7], rnding);
1653 u[7] = _mm_srai_epi32(u[7], bit);
1654
1655 // stage 7
1656 if (do_cols) {
1657 out[0] = u[0];
1658 out[1] = _mm_sub_epi32(kZero, u[4]);
1659 out[2] = u[6];
1660 out[3] = _mm_sub_epi32(kZero, u[2]);
1661 out[4] = u[3];
1662 out[5] = _mm_sub_epi32(kZero, u[7]);
1663 out[6] = u[5];
1664 out[7] = _mm_sub_epi32(kZero, u[1]);
1665 } else {
1666 const int log_range_out = AOMMAX(16, bd + 6);
1667 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1668 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1669
1670 neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1671 out_shift);
1672 neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1673 out_shift);
1674 neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1675 out_shift);
1676 neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1677 out_shift);
1678 }
1679 }
1680
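// idct16x16 DC-only path: with just in[0] nonzero, the transform reduces to
// a single multiply by cospi32 (~1/sqrt(2) in fixed point), a rounding
// shift, and replication of the result.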
1681 static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1682 int do_cols, int bd, int out_shift) {
1683 const int32_t *cospi = cospi_arr(bit);
1684 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1685 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1686 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1687 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1688 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1689
1690 {
1691 // stage 0
1692 // stage 1
1693 // stage 2
1694 // stage 3
1695 // stage 4
1696 in[0] = _mm_mullo_epi32(in[0], cospi32);
1697 in[0] = _mm_add_epi32(in[0], rnding);
1698 in[0] = _mm_srai_epi32(in[0], bit);
1699
1700 // stage 5
1701 // stage 6
1702 // stage 7
1703 if (do_cols) {
1704 in[0] = _mm_max_epi32(in[0], clamp_lo);
1705 in[0] = _mm_min_epi32(in[0], clamp_hi);
1706 } else {
1707 const int log_range_out = AOMMAX(16, bd + 6);
1708 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1709 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1710 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1711 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1712 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1713 in[0] = _mm_add_epi32(in[0], offset);
1714 in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1715 in[0] = _mm_max_epi32(in[0], clamp_lo_out);
1716 in[0] = _mm_min_epi32(in[0], clamp_hi_out);
1717 }
1718
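// Only the DC term survives, so all 16 output rows are the same vector.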
1719 out[0] = in[0];
1720 out[1] = in[0];
1721 out[2] = in[0];
1722 out[3] = in[0];
1723 out[4] = in[0];
1724 out[5] = in[0];
1725 out[6] = in[0];
1726 out[7] = in[0];
1727 out[8] = in[0];
1728 out[9] = in[0];
1729 out[10] = in[0];
1730 out[11] = in[0];
1731 out[12] = in[0];
1732 out[13] = in[0];
1733 out[14] = in[0];
1734 out[15] = in[0];
1735 }
1736 }
1737
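// idct16x16 for at most the first 8 nonzero coefficients per column; stages
// that would read the zeroed inputs use the single-operand half_btf_0
// butterflies instead of full rotations.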
1738 static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
1739 int do_cols, int bd, int out_shift) {
1740 const int32_t *cospi = cospi_arr(bit);
1741 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1742 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1743 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1744 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1745 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1746 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1747 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1748 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1749 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1750 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1751 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1752 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1753 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1754 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1755 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1756 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
1757 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
1758 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1759 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1760 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1761 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1762 __m128i u[16], x, y;
1763
1764 {
1765 // stage 0
1766 // stage 1
1767 u[0] = in[0];
1768 u[2] = in[4];
1769 u[4] = in[2];
1770 u[6] = in[6];
1771 u[8] = in[1];
1772 u[10] = in[5];
1773 u[12] = in[3];
1774 u[14] = in[7];
1775
1776 // stage 2
1777 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
1778 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
1779
1780 u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
1781 u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
1782
1783 u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
1784 u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
1785
1786 u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
1787 u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
1788
1789 // stage 3
1790 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
1791 u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
1792 u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
1793 u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
1794
1795 addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1796 addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1797 addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1798 addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1799
1800 // stage 4
1801 x = _mm_mullo_epi32(u[0], cospi32);
1802 u[0] = _mm_add_epi32(x, rnding);
1803 u[0] = _mm_srai_epi32(u[0], bit);
1804 u[1] = u[0];
1805
1806 u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
1807 u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
1808
1809 addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1810 addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1811
1812 x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1813 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1814 u[9] = x;
1815 y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1816 u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1817 u[10] = y;
1818
1819 // stage 5
1820 addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1821 addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1822
1823 x = _mm_mullo_epi32(u[5], cospi32);
1824 y = _mm_mullo_epi32(u[6], cospi32);
1825 u[5] = _mm_sub_epi32(y, x);
1826 u[5] = _mm_add_epi32(u[5], rnding);
1827 u[5] = _mm_srai_epi32(u[5], bit);
1828
1829 u[6] = _mm_add_epi32(y, x);
1830 u[6] = _mm_add_epi32(u[6], rnding);
1831 u[6] = _mm_srai_epi32(u[6], bit);
1832
1833 addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1834 addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1835 addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1836 addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1837
1838 // stage 6
1839 addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1840 addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1841 addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1842 addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1843
1844 x = _mm_mullo_epi32(u[10], cospi32);
1845 y = _mm_mullo_epi32(u[13], cospi32);
1846 u[10] = _mm_sub_epi32(y, x);
1847 u[10] = _mm_add_epi32(u[10], rnding);
1848 u[10] = _mm_srai_epi32(u[10], bit);
1849
1850 u[13] = _mm_add_epi32(x, y);
1851 u[13] = _mm_add_epi32(u[13], rnding);
1852 u[13] = _mm_srai_epi32(u[13], bit);
1853
1854 x = _mm_mullo_epi32(u[11], cospi32);
1855 y = _mm_mullo_epi32(u[12], cospi32);
1856 u[11] = _mm_sub_epi32(y, x);
1857 u[11] = _mm_add_epi32(u[11], rnding);
1858 u[11] = _mm_srai_epi32(u[11], bit);
1859
1860 u[12] = _mm_add_epi32(x, y);
1861 u[12] = _mm_add_epi32(u[12], rnding);
1862 u[12] = _mm_srai_epi32(u[12], bit);
1863 // stage 7
1864 if (do_cols) {
1865 addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
1866 addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
1867 addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
1868 addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
1869 addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
1870 addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
1871 addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
1872 addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
1873 } else {
1874 const int log_range_out = AOMMAX(16, bd + 6);
1875 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
1876 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
1877 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
1878 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
1879
1880 addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
1881 &clamp_hi_out, out_shift);
1882 addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
1883 &clamp_hi_out, out_shift);
1884 addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
1885 &clamp_hi_out, out_shift);
1886 addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
1887 &clamp_hi_out, out_shift);
1888 addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
1889 &clamp_hi_out, out_shift);
1890 addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
1891 &clamp_hi_out, out_shift);
1892 addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
1893 &clamp_hi_out, out_shift);
1894 addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
1895 &clamp_hi_out, out_shift);
1896 }
1897 }
1898 }
1899
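// iadst16x16 specialized for only in[0] nonzero; later stages copy values
// outright rather than recompute butterflies with a zero operand.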
1900 static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1901 int do_cols, int bd, int out_shift) {
1902 const int32_t *cospi = cospi_arr(bit);
1903 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
1904 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
1905 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1906 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1907 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1908 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1909 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1910 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1911 const __m128i zero = _mm_setzero_si128();
1912 __m128i v[16], x, y, temp1, temp2;
1913
1914 // Calculate columns 0, 1, 2, 3
1915 {
1916 // stage 0
1917 // stage 1
1918 // stage 2
1919 x = _mm_mullo_epi32(in[0], cospi62);
1920 v[0] = _mm_add_epi32(x, rnding);
1921 v[0] = _mm_srai_epi32(v[0], bit);
1922
1923 x = _mm_mullo_epi32(in[0], cospi2);
1924 v[1] = _mm_sub_epi32(zero, x);
1925 v[1] = _mm_add_epi32(v[1], rnding);
1926 v[1] = _mm_srai_epi32(v[1], bit);
1927
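// With a single nonzero input, these butterflies are add/sub against zero,
// so both outputs are plain copies.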
1928 // stage 3
1929 v[8] = v[0];
1930 v[9] = v[1];
1931
1932 // stage 4
1933 temp1 = _mm_mullo_epi32(v[8], cospi8);
1934 x = _mm_mullo_epi32(v[9], cospi56);
1935 temp1 = _mm_add_epi32(temp1, x);
1936 temp1 = _mm_add_epi32(temp1, rnding);
1937 temp1 = _mm_srai_epi32(temp1, bit);
1938
1939 temp2 = _mm_mullo_epi32(v[8], cospi56);
1940 x = _mm_mullo_epi32(v[9], cospi8);
1941 temp2 = _mm_sub_epi32(temp2, x);
1942 temp2 = _mm_add_epi32(temp2, rnding);
1943 temp2 = _mm_srai_epi32(temp2, bit);
1944 v[8] = temp1;
1945 v[9] = temp2;
1946
1947 // stage 5
1948 v[4] = v[0];
1949 v[5] = v[1];
1950 v[12] = v[8];
1951 v[13] = v[9];
1952
1953 // stage 6
1954 temp1 = _mm_mullo_epi32(v[4], cospi16);
1955 x = _mm_mullo_epi32(v[5], cospi48);
1956 temp1 = _mm_add_epi32(temp1, x);
1957 temp1 = _mm_add_epi32(temp1, rnding);
1958 temp1 = _mm_srai_epi32(temp1, bit);
1959
1960 temp2 = _mm_mullo_epi32(v[4], cospi48);
1961 x = _mm_mullo_epi32(v[5], cospi16);
1962 temp2 = _mm_sub_epi32(temp2, x);
1963 temp2 = _mm_add_epi32(temp2, rnding);
1964 temp2 = _mm_srai_epi32(temp2, bit);
1965 v[4] = temp1;
1966 v[5] = temp2;
1967
1968 temp1 = _mm_mullo_epi32(v[12], cospi16);
1969 x = _mm_mullo_epi32(v[13], cospi48);
1970 temp1 = _mm_add_epi32(temp1, x);
1971 temp1 = _mm_add_epi32(temp1, rnding);
1972 temp1 = _mm_srai_epi32(temp1, bit);
1973
1974 temp2 = _mm_mullo_epi32(v[12], cospi48);
1975 x = _mm_mullo_epi32(v[13], cospi16);
1976 temp2 = _mm_sub_epi32(temp2, x);
1977 temp2 = _mm_add_epi32(temp2, rnding);
1978 temp2 = _mm_srai_epi32(temp2, bit);
1979 v[12] = temp1;
1980 v[13] = temp2;
1981
1982 // stage 7
1983 v[2] = v[0];
1984 v[3] = v[1];
1985 v[6] = v[4];
1986 v[7] = v[5];
1987 v[10] = v[8];
1988 v[11] = v[9];
1989 v[14] = v[12];
1990 v[15] = v[13];
1991
1992 // stage 8
1993 y = _mm_mullo_epi32(v[2], cospi32);
1994 x = _mm_mullo_epi32(v[3], cospi32);
1995 v[2] = _mm_add_epi32(y, x);
1996 v[2] = _mm_add_epi32(v[2], rnding);
1997 v[2] = _mm_srai_epi32(v[2], bit);
1998
1999 v[3] = _mm_sub_epi32(y, x);
2000 v[3] = _mm_add_epi32(v[3], rnding);
2001 v[3] = _mm_srai_epi32(v[3], bit);
2002
2003 y = _mm_mullo_epi32(v[6], cospi32);
2004 x = _mm_mullo_epi32(v[7], cospi32);
2005 v[6] = _mm_add_epi32(y, x);
2006 v[6] = _mm_add_epi32(v[6], rnding);
2007 v[6] = _mm_srai_epi32(v[6], bit);
2008
2009 v[7] = _mm_sub_epi32(y, x);
2010 v[7] = _mm_add_epi32(v[7], rnding);
2011 v[7] = _mm_srai_epi32(v[7], bit);
2012
2013 y = _mm_mullo_epi32(v[10], cospi32);
2014 x = _mm_mullo_epi32(v[11], cospi32);
2015 v[10] = _mm_add_epi32(y, x);
2016 v[10] = _mm_add_epi32(v[10], rnding);
2017 v[10] = _mm_srai_epi32(v[10], bit);
2018
2019 v[11] = _mm_sub_epi32(y, x);
2020 v[11] = _mm_add_epi32(v[11], rnding);
2021 v[11] = _mm_srai_epi32(v[11], bit);
2022
2023 y = _mm_mullo_epi32(v[14], cospi32);
2024 x = _mm_mullo_epi32(v[15], cospi32);
2025 v[14] = _mm_add_epi32(y, x);
2026 v[14] = _mm_add_epi32(v[14], rnding);
2027 v[14] = _mm_srai_epi32(v[14], bit);
2028
2029 v[15] = _mm_sub_epi32(y, x);
2030 v[15] = _mm_add_epi32(v[15], rnding);
2031 v[15] = _mm_srai_epi32(v[15], bit);
2032
2033 // stage 9
2034 if (do_cols) {
2035 out[0] = v[0];
2036 out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
2037 out[2] = v[12];
2038 out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
2039 out[4] = v[6];
2040 out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
2041 out[6] = v[10];
2042 out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
2043 out[8] = v[3];
2044 out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
2045 out[10] = v[15];
2046 out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
2047 out[12] = v[5];
2048 out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
2049 out[14] = v[9];
2050 out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
2051 } else {
2052 const int log_range_out = AOMMAX(16, bd + 6);
2053 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2054 const __m128i clamp_hi_out =
2055 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2056
2057 neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
2058 &clamp_hi_out, out_shift);
2059 neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2060 &clamp_hi_out, out_shift);
2061 neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2062 &clamp_hi_out, out_shift);
2063 neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2064 &clamp_hi_out, out_shift);
2065 neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2066 &clamp_hi_out, out_shift);
2067 neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2068 &clamp_hi_out, out_shift);
2069 neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2070 &clamp_hi_out, out_shift);
2071 neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2072 &clamp_hi_out, out_shift);
2073 }
2074 }
2075 }
2076
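// iadst16x16 for at most 8 nonzero inputs per column; stage 2 degenerates to
// single multiplies because the paired operand of each rotation is zero.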
2077 static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
2078 int do_cols, int bd, int out_shift) {
2079 const int32_t *cospi = cospi_arr(bit);
2080 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2081 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2082 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2083 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2084 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2085 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2086 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2087 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2088 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2089 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2090 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2091 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2092 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2093 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2094 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2095 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2096 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2097 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2098 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2099 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2100 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2101 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2102 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2103 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2104 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2105 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2106 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2107 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2108 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2109 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2110 __m128i u[16], x, y;
2111
2112 // Calculate columns 0, 1, 2, 3
2113 {
2114 // stage 0
2115 // stage 1
2116 // stage 2
2117 __m128i zero = _mm_setzero_si128();
2118 x = _mm_mullo_epi32(in[0], cospi62);
2119 u[0] = _mm_add_epi32(x, rnding);
2120 u[0] = _mm_srai_epi32(u[0], bit);
2121
2122 x = _mm_mullo_epi32(in[0], cospi2);
2123 u[1] = _mm_sub_epi32(zero, x);
2124 u[1] = _mm_add_epi32(u[1], rnding);
2125 u[1] = _mm_srai_epi32(u[1], bit);
2126
2127 x = _mm_mullo_epi32(in[2], cospi54);
2128 u[2] = _mm_add_epi32(x, rnding);
2129 u[2] = _mm_srai_epi32(u[2], bit);
2130
2131 x = _mm_mullo_epi32(in[2], cospi10);
2132 u[3] = _mm_sub_epi32(zero, x);
2133 u[3] = _mm_add_epi32(u[3], rnding);
2134 u[3] = _mm_srai_epi32(u[3], bit);
2135
2136 x = _mm_mullo_epi32(in[4], cospi46);
2137 u[4] = _mm_add_epi32(x, rnding);
2138 u[4] = _mm_srai_epi32(u[4], bit);
2139
2140 x = _mm_mullo_epi32(in[4], cospi18);
2141 u[5] = _mm_sub_epi32(zero, x);
2142 u[5] = _mm_add_epi32(u[5], rnding);
2143 u[5] = _mm_srai_epi32(u[5], bit);
2144
2145 x = _mm_mullo_epi32(in[6], cospi38);
2146 u[6] = _mm_add_epi32(x, rnding);
2147 u[6] = _mm_srai_epi32(u[6], bit);
2148
2149 x = _mm_mullo_epi32(in[6], cospi26);
2150 u[7] = _mm_sub_epi32(zero, x);
2151 u[7] = _mm_add_epi32(u[7], rnding);
2152 u[7] = _mm_srai_epi32(u[7], bit);
2153
2154 u[8] = _mm_mullo_epi32(in[7], cospi34);
2155 u[8] = _mm_add_epi32(u[8], rnding);
2156 u[8] = _mm_srai_epi32(u[8], bit);
2157
2158 u[9] = _mm_mullo_epi32(in[7], cospi30);
2159 u[9] = _mm_add_epi32(u[9], rnding);
2160 u[9] = _mm_srai_epi32(u[9], bit);
2161
2162 u[10] = _mm_mullo_epi32(in[5], cospi42);
2163 u[10] = _mm_add_epi32(u[10], rnding);
2164 u[10] = _mm_srai_epi32(u[10], bit);
2165
2166 u[11] = _mm_mullo_epi32(in[5], cospi22);
2167 u[11] = _mm_add_epi32(u[11], rnding);
2168 u[11] = _mm_srai_epi32(u[11], bit);
2169
2170 u[12] = _mm_mullo_epi32(in[3], cospi50);
2171 u[12] = _mm_add_epi32(u[12], rnding);
2172 u[12] = _mm_srai_epi32(u[12], bit);
2173
2174 u[13] = _mm_mullo_epi32(in[3], cospi14);
2175 u[13] = _mm_add_epi32(u[13], rnding);
2176 u[13] = _mm_srai_epi32(u[13], bit);
2177
2178 u[14] = _mm_mullo_epi32(in[1], cospi58);
2179 u[14] = _mm_add_epi32(u[14], rnding);
2180 u[14] = _mm_srai_epi32(u[14], bit);
2181
2182 u[15] = _mm_mullo_epi32(in[1], cospi6);
2183 u[15] = _mm_add_epi32(u[15], rnding);
2184 u[15] = _mm_srai_epi32(u[15], bit);
2185
2186 // stage 3
2187 addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2188 addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2189 addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2190 addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2191 addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2192 addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2193 addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2194 addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2195
2196 // stage 4
2197 y = _mm_mullo_epi32(u[8], cospi56);
2198 x = _mm_mullo_epi32(u[9], cospi56);
2199 u[8] = _mm_mullo_epi32(u[8], cospi8);
2200 u[8] = _mm_add_epi32(u[8], x);
2201 u[8] = _mm_add_epi32(u[8], rnding);
2202 u[8] = _mm_srai_epi32(u[8], bit);
2203
2204 x = _mm_mullo_epi32(u[9], cospi8);
2205 u[9] = _mm_sub_epi32(y, x);
2206 u[9] = _mm_add_epi32(u[9], rnding);
2207 u[9] = _mm_srai_epi32(u[9], bit);
2208
2209 x = _mm_mullo_epi32(u[11], cospi24);
2210 y = _mm_mullo_epi32(u[10], cospi24);
2211 u[10] = _mm_mullo_epi32(u[10], cospi40);
2212 u[10] = _mm_add_epi32(u[10], x);
2213 u[10] = _mm_add_epi32(u[10], rnding);
2214 u[10] = _mm_srai_epi32(u[10], bit);
2215
2216 x = _mm_mullo_epi32(u[11], cospi40);
2217 u[11] = _mm_sub_epi32(y, x);
2218 u[11] = _mm_add_epi32(u[11], rnding);
2219 u[11] = _mm_srai_epi32(u[11], bit);
2220
2221 x = _mm_mullo_epi32(u[13], cospi8);
2222 y = _mm_mullo_epi32(u[12], cospi8);
2223 u[12] = _mm_mullo_epi32(u[12], cospim56);
2224 u[12] = _mm_add_epi32(u[12], x);
2225 u[12] = _mm_add_epi32(u[12], rnding);
2226 u[12] = _mm_srai_epi32(u[12], bit);
2227
2228 x = _mm_mullo_epi32(u[13], cospim56);
2229 u[13] = _mm_sub_epi32(y, x);
2230 u[13] = _mm_add_epi32(u[13], rnding);
2231 u[13] = _mm_srai_epi32(u[13], bit);
2232
2233 x = _mm_mullo_epi32(u[15], cospi40);
2234 y = _mm_mullo_epi32(u[14], cospi40);
2235 u[14] = _mm_mullo_epi32(u[14], cospim24);
2236 u[14] = _mm_add_epi32(u[14], x);
2237 u[14] = _mm_add_epi32(u[14], rnding);
2238 u[14] = _mm_srai_epi32(u[14], bit);
2239
2240 x = _mm_mullo_epi32(u[15], cospim24);
2241 u[15] = _mm_sub_epi32(y, x);
2242 u[15] = _mm_add_epi32(u[15], rnding);
2243 u[15] = _mm_srai_epi32(u[15], bit);
2244
2245 // stage 5
2246 addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2247 addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2248 addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2249 addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2250 addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2251 addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2252 addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2253 addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2254
2255 // stage 6
2256 x = _mm_mullo_epi32(u[5], cospi48);
2257 y = _mm_mullo_epi32(u[4], cospi48);
2258 u[4] = _mm_mullo_epi32(u[4], cospi16);
2259 u[4] = _mm_add_epi32(u[4], x);
2260 u[4] = _mm_add_epi32(u[4], rnding);
2261 u[4] = _mm_srai_epi32(u[4], bit);
2262
2263 x = _mm_mullo_epi32(u[5], cospi16);
2264 u[5] = _mm_sub_epi32(y, x);
2265 u[5] = _mm_add_epi32(u[5], rnding);
2266 u[5] = _mm_srai_epi32(u[5], bit);
2267
2268 x = _mm_mullo_epi32(u[7], cospi16);
2269 y = _mm_mullo_epi32(u[6], cospi16);
2270 u[6] = _mm_mullo_epi32(u[6], cospim48);
2271 u[6] = _mm_add_epi32(u[6], x);
2272 u[6] = _mm_add_epi32(u[6], rnding);
2273 u[6] = _mm_srai_epi32(u[6], bit);
2274
2275 x = _mm_mullo_epi32(u[7], cospim48);
2276 u[7] = _mm_sub_epi32(y, x);
2277 u[7] = _mm_add_epi32(u[7], rnding);
2278 u[7] = _mm_srai_epi32(u[7], bit);
2279
2280 x = _mm_mullo_epi32(u[13], cospi48);
2281 y = _mm_mullo_epi32(u[12], cospi48);
2282 u[12] = _mm_mullo_epi32(u[12], cospi16);
2283 u[12] = _mm_add_epi32(u[12], x);
2284 u[12] = _mm_add_epi32(u[12], rnding);
2285 u[12] = _mm_srai_epi32(u[12], bit);
2286
2287 x = _mm_mullo_epi32(u[13], cospi16);
2288 u[13] = _mm_sub_epi32(y, x);
2289 u[13] = _mm_add_epi32(u[13], rnding);
2290 u[13] = _mm_srai_epi32(u[13], bit);
2291
2292 x = _mm_mullo_epi32(u[15], cospi16);
2293 y = _mm_mullo_epi32(u[14], cospi16);
2294 u[14] = _mm_mullo_epi32(u[14], cospim48);
2295 u[14] = _mm_add_epi32(u[14], x);
2296 u[14] = _mm_add_epi32(u[14], rnding);
2297 u[14] = _mm_srai_epi32(u[14], bit);
2298
2299 x = _mm_mullo_epi32(u[15], cospim48);
2300 u[15] = _mm_sub_epi32(y, x);
2301 u[15] = _mm_add_epi32(u[15], rnding);
2302 u[15] = _mm_srai_epi32(u[15], bit);
2303
2304 // stage 7
2305 addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2306 addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2307 addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2308 addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2309 addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2310 addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2311 addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2312 addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2313
2314 // stage 8
2315 y = _mm_mullo_epi32(u[2], cospi32);
2316 x = _mm_mullo_epi32(u[3], cospi32);
2317 u[2] = _mm_add_epi32(y, x);
2318 u[2] = _mm_add_epi32(u[2], rnding);
2319 u[2] = _mm_srai_epi32(u[2], bit);
2320
2321 u[3] = _mm_sub_epi32(y, x);
2322 u[3] = _mm_add_epi32(u[3], rnding);
2323 u[3] = _mm_srai_epi32(u[3], bit);
2324 y = _mm_mullo_epi32(u[6], cospi32);
2325 x = _mm_mullo_epi32(u[7], cospi32);
2326 u[6] = _mm_add_epi32(y, x);
2327 u[6] = _mm_add_epi32(u[6], rnding);
2328 u[6] = _mm_srai_epi32(u[6], bit);
2329
2330 u[7] = _mm_sub_epi32(y, x);
2331 u[7] = _mm_add_epi32(u[7], rnding);
2332 u[7] = _mm_srai_epi32(u[7], bit);
2333
2334 y = _mm_mullo_epi32(u[10], cospi32);
2335 x = _mm_mullo_epi32(u[11], cospi32);
2336 u[10] = _mm_add_epi32(y, x);
2337 u[10] = _mm_add_epi32(u[10], rnding);
2338 u[10] = _mm_srai_epi32(u[10], bit);
2339
2340 u[11] = _mm_sub_epi32(y, x);
2341 u[11] = _mm_add_epi32(u[11], rnding);
2342 u[11] = _mm_srai_epi32(u[11], bit);
2343
2344 y = _mm_mullo_epi32(u[14], cospi32);
2345 x = _mm_mullo_epi32(u[15], cospi32);
2346 u[14] = _mm_add_epi32(y, x);
2347 u[14] = _mm_add_epi32(u[14], rnding);
2348 u[14] = _mm_srai_epi32(u[14], bit);
2349
2350 u[15] = _mm_sub_epi32(y, x);
2351 u[15] = _mm_add_epi32(u[15], rnding);
2352 u[15] = _mm_srai_epi32(u[15], bit);
2353
2354 // stage 9
2355 if (do_cols) {
2356 out[0] = u[0];
2357 out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
2358 out[2] = u[12];
2359 out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
2360 out[4] = u[6];
2361 out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
2362 out[6] = u[10];
2363 out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
2364 out[8] = u[3];
2365 out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
2366 out[10] = u[15];
2367 out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
2368 out[12] = u[5];
2369 out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
2370 out[14] = u[9];
2371 out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
2372 } else {
2373 const int log_range_out = AOMMAX(16, bd + 6);
2374 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2375 const __m128i clamp_hi_out =
2376 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2377
2378 neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
2379 &clamp_hi_out, out_shift);
2380 neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2381 &clamp_hi_out, out_shift);
2382 neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2383 &clamp_hi_out, out_shift);
2384 neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2385 &clamp_hi_out, out_shift);
2386 neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2387 &clamp_hi_out, out_shift);
2388 neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2389 &clamp_hi_out, out_shift);
2390 neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2391 &clamp_hi_out, out_shift);
2392 neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2393 &clamp_hi_out, out_shift);
2394 }
2395 }
2396 }
2397
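// Full 16-point inverse DCT on a 16x4 strip of 32-bit coefficients.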
2398 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2399 int bd, int out_shift) {
2400 const int32_t *cospi = cospi_arr(bit);
2401 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2402 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2403 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2404 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2405 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2406 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2407 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2408 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2409 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2410 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2411 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2412 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2413 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2414 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2415 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2416 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2417 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2418 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2419 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2420 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2421 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2422 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2423 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2424 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2425 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2426 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2427 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2428 __m128i u[16], v[16], x, y;
2429
2430 {
2431 // stage 0
2432 // stage 1
2433 u[0] = in[0];
2434 u[1] = in[8];
2435 u[2] = in[4];
2436 u[3] = in[12];
2437 u[4] = in[2];
2438 u[5] = in[10];
2439 u[6] = in[6];
2440 u[7] = in[14];
2441 u[8] = in[1];
2442 u[9] = in[9];
2443 u[10] = in[5];
2444 u[11] = in[13];
2445 u[12] = in[3];
2446 u[13] = in[11];
2447 u[14] = in[7];
2448 u[15] = in[15];
2449
2450 // stage 2
2451 v[0] = u[0];
2452 v[1] = u[1];
2453 v[2] = u[2];
2454 v[3] = u[3];
2455 v[4] = u[4];
2456 v[5] = u[5];
2457 v[6] = u[6];
2458 v[7] = u[7];
2459
2460 v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
2461 v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
2462 v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
2463 v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
2464 v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
2465 v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
2466 v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
2467 v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
2468
2469 // stage 3
2470 u[0] = v[0];
2471 u[1] = v[1];
2472 u[2] = v[2];
2473 u[3] = v[3];
2474 u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
2475 u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
2476 u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
2477 u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
2478 addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2479 addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2480 addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2481 addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2482
2483 // stage 4
2484 x = _mm_mullo_epi32(u[0], cospi32);
2485 y = _mm_mullo_epi32(u[1], cospi32);
2486 v[0] = _mm_add_epi32(x, y);
2487 v[0] = _mm_add_epi32(v[0], rnding);
2488 v[0] = _mm_srai_epi32(v[0], bit);
2489
2490 v[1] = _mm_sub_epi32(x, y);
2491 v[1] = _mm_add_epi32(v[1], rnding);
2492 v[1] = _mm_srai_epi32(v[1], bit);
2493
2494 v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
2495 v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
2496 addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2497 addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2498 v[8] = u[8];
2499 v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2500 v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2501 v[11] = u[11];
2502 v[12] = u[12];
2503 v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2504 v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2505 v[15] = u[15];
2506
2507 // stage 5
2508 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2509 addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2510 u[4] = v[4];
2511
2512 x = _mm_mullo_epi32(v[5], cospi32);
2513 y = _mm_mullo_epi32(v[6], cospi32);
2514 u[5] = _mm_sub_epi32(y, x);
2515 u[5] = _mm_add_epi32(u[5], rnding);
2516 u[5] = _mm_srai_epi32(u[5], bit);
2517
2518 u[6] = _mm_add_epi32(y, x);
2519 u[6] = _mm_add_epi32(u[6], rnding);
2520 u[6] = _mm_srai_epi32(u[6], bit);
2521
2522 u[7] = v[7];
2523 addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2524 addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2525 addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2526 addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2527
2528 // stage 6
2529 addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2530 addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2531 addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2532 addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2533 v[8] = u[8];
2534 v[9] = u[9];
2535
2536 x = _mm_mullo_epi32(u[10], cospi32);
2537 y = _mm_mullo_epi32(u[13], cospi32);
2538 v[10] = _mm_sub_epi32(y, x);
2539 v[10] = _mm_add_epi32(v[10], rnding);
2540 v[10] = _mm_srai_epi32(v[10], bit);
2541
2542 v[13] = _mm_add_epi32(x, y);
2543 v[13] = _mm_add_epi32(v[13], rnding);
2544 v[13] = _mm_srai_epi32(v[13], bit);
2545
2546 x = _mm_mullo_epi32(u[11], cospi32);
2547 y = _mm_mullo_epi32(u[12], cospi32);
2548 v[11] = _mm_sub_epi32(y, x);
2549 v[11] = _mm_add_epi32(v[11], rnding);
2550 v[11] = _mm_srai_epi32(v[11], bit);
2551
2552 v[12] = _mm_add_epi32(x, y);
2553 v[12] = _mm_add_epi32(v[12], rnding);
2554 v[12] = _mm_srai_epi32(v[12], bit);
2555
2556 v[14] = u[14];
2557 v[15] = u[15];
2558
2559 // stage 7
2560 if (do_cols) {
2561 addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
2562 addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
2563 addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
2564 addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
2565 addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
2566 addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
2567 addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
2568 addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
2569 } else {
2570 const int log_range_out = AOMMAX(16, bd + 6);
2571 const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
2572 -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
2573 const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
2574 (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
2575
2576 addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
2577 &clamp_hi_out, out_shift);
2578 addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
2579 &clamp_hi_out, out_shift);
2580 addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
2581 &clamp_hi_out, out_shift);
2582 addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
2583 &clamp_hi_out, out_shift);
2584 addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
2585 &clamp_hi_out, out_shift);
2586 addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
2587 &clamp_hi_out, out_shift);
2588 addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
2589 &clamp_hi_out, out_shift);
2590 addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
2591 &clamp_hi_out, out_shift);
2592 }
2593 }
2594 }
2595
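// Full 16-point inverse ADST on a 16x4 strip of 32-bit coefficients.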
2596 static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2597 int bd, int out_shift) {
2598 const int32_t *cospi = cospi_arr(bit);
2599 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2600 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2601 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2602 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2603 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2604 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2605 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2606 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2607 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2608 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2609 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2610 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2611 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2612 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2613 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2614 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2615 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2616 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2617 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2618 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2619 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2620 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2621 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2622 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2623 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2624 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2625 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2626 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2627 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2628 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2629 __m128i u[16], v[16], x, y;
2630
2631 // Calculate columns 0, 1, 2, 3
2632 {
2633 // stage 0
2634 // stage 1
2635 // stage 2
2636 v[0] = _mm_mullo_epi32(in[15], cospi2);
2637 x = _mm_mullo_epi32(in[0], cospi62);
2638 v[0] = _mm_add_epi32(v[0], x);
2639 v[0] = _mm_add_epi32(v[0], rnding);
2640 v[0] = _mm_srai_epi32(v[0], bit);
2641
2642 v[1] = _mm_mullo_epi32(in[15], cospi62);
2643 x = _mm_mullo_epi32(in[0], cospi2);
2644 v[1] = _mm_sub_epi32(v[1], x);
2645 v[1] = _mm_add_epi32(v[1], rnding);
2646 v[1] = _mm_srai_epi32(v[1], bit);
2647
2648 v[2] = _mm_mullo_epi32(in[13], cospi10);
2649 x = _mm_mullo_epi32(in[2], cospi54);
2650 v[2] = _mm_add_epi32(v[2], x);
2651 v[2] = _mm_add_epi32(v[2], rnding);
2652 v[2] = _mm_srai_epi32(v[2], bit);
2653
2654 v[3] = _mm_mullo_epi32(in[13], cospi54);
2655 x = _mm_mullo_epi32(in[2], cospi10);
2656 v[3] = _mm_sub_epi32(v[3], x);
2657 v[3] = _mm_add_epi32(v[3], rnding);
2658 v[3] = _mm_srai_epi32(v[3], bit);
2659
2660 v[4] = _mm_mullo_epi32(in[11], cospi18);
2661 x = _mm_mullo_epi32(in[4], cospi46);
2662 v[4] = _mm_add_epi32(v[4], x);
2663 v[4] = _mm_add_epi32(v[4], rnding);
2664 v[4] = _mm_srai_epi32(v[4], bit);
2665
2666 v[5] = _mm_mullo_epi32(in[11], cospi46);
2667 x = _mm_mullo_epi32(in[4], cospi18);
2668 v[5] = _mm_sub_epi32(v[5], x);
2669 v[5] = _mm_add_epi32(v[5], rnding);
2670 v[5] = _mm_srai_epi32(v[5], bit);
2671
2672 v[6] = _mm_mullo_epi32(in[9], cospi26);
2673 x = _mm_mullo_epi32(in[6], cospi38);
2674 v[6] = _mm_add_epi32(v[6], x);
2675 v[6] = _mm_add_epi32(v[6], rnding);
2676 v[6] = _mm_srai_epi32(v[6], bit);
2677
2678 v[7] = _mm_mullo_epi32(in[9], cospi38);
2679 x = _mm_mullo_epi32(in[6], cospi26);
2680 v[7] = _mm_sub_epi32(v[7], x);
2681 v[7] = _mm_add_epi32(v[7], rnding);
2682 v[7] = _mm_srai_epi32(v[7], bit);
2683
2684 v[8] = _mm_mullo_epi32(in[7], cospi34);
2685 x = _mm_mullo_epi32(in[8], cospi30);
2686 v[8] = _mm_add_epi32(v[8], x);
2687 v[8] = _mm_add_epi32(v[8], rnding);
2688 v[8] = _mm_srai_epi32(v[8], bit);
2689
2690 v[9] = _mm_mullo_epi32(in[7], cospi30);
2691 x = _mm_mullo_epi32(in[8], cospi34);
2692 v[9] = _mm_sub_epi32(v[9], x);
2693 v[9] = _mm_add_epi32(v[9], rnding);
2694 v[9] = _mm_srai_epi32(v[9], bit);
2695
2696 v[10] = _mm_mullo_epi32(in[5], cospi42);
2697 x = _mm_mullo_epi32(in[10], cospi22);
2698 v[10] = _mm_add_epi32(v[10], x);
2699 v[10] = _mm_add_epi32(v[10], rnding);
2700 v[10] = _mm_srai_epi32(v[10], bit);
2701
2702 v[11] = _mm_mullo_epi32(in[5], cospi22);
2703 x = _mm_mullo_epi32(in[10], cospi42);
2704 v[11] = _mm_sub_epi32(v[11], x);
2705 v[11] = _mm_add_epi32(v[11], rnding);
2706 v[11] = _mm_srai_epi32(v[11], bit);
2707
2708 v[12] = _mm_mullo_epi32(in[3], cospi50);
2709 x = _mm_mullo_epi32(in[12], cospi14);
2710 v[12] = _mm_add_epi32(v[12], x);
2711 v[12] = _mm_add_epi32(v[12], rnding);
2712 v[12] = _mm_srai_epi32(v[12], bit);
2713
2714 v[13] = _mm_mullo_epi32(in[3], cospi14);
2715 x = _mm_mullo_epi32(in[12], cospi50);
2716 v[13] = _mm_sub_epi32(v[13], x);
2717 v[13] = _mm_add_epi32(v[13], rnding);
2718 v[13] = _mm_srai_epi32(v[13], bit);
2719
2720 v[14] = _mm_mullo_epi32(in[1], cospi58);
2721 x = _mm_mullo_epi32(in[14], cospi6);
2722 v[14] = _mm_add_epi32(v[14], x);
2723 v[14] = _mm_add_epi32(v[14], rnding);
2724 v[14] = _mm_srai_epi32(v[14], bit);
2725
2726 v[15] = _mm_mullo_epi32(in[1], cospi6);
2727 x = _mm_mullo_epi32(in[14], cospi58);
2728 v[15] = _mm_sub_epi32(v[15], x);
2729 v[15] = _mm_add_epi32(v[15], rnding);
2730 v[15] = _mm_srai_epi32(v[15], bit);
2731
2732 // stage 3
2733 addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2734 addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2735 addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2736 addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2737 addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2738 addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2739 addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2740 addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2741
2742 // stage 4
2743 v[0] = u[0];
2744 v[1] = u[1];
2745 v[2] = u[2];
2746 v[3] = u[3];
2747 v[4] = u[4];
2748 v[5] = u[5];
2749 v[6] = u[6];
2750 v[7] = u[7];
2751
2752 v[8] = _mm_mullo_epi32(u[8], cospi8);
2753 x = _mm_mullo_epi32(u[9], cospi56);
2754 v[8] = _mm_add_epi32(v[8], x);
2755 v[8] = _mm_add_epi32(v[8], rnding);
2756 v[8] = _mm_srai_epi32(v[8], bit);
2757
2758 v[9] = _mm_mullo_epi32(u[8], cospi56);
2759 x = _mm_mullo_epi32(u[9], cospi8);
2760 v[9] = _mm_sub_epi32(v[9], x);
2761 v[9] = _mm_add_epi32(v[9], rnding);
2762 v[9] = _mm_srai_epi32(v[9], bit);
2763
2764 v[10] = _mm_mullo_epi32(u[10], cospi40);
2765 x = _mm_mullo_epi32(u[11], cospi24);
2766 v[10] = _mm_add_epi32(v[10], x);
2767 v[10] = _mm_add_epi32(v[10], rnding);
2768 v[10] = _mm_srai_epi32(v[10], bit);
2769
2770 v[11] = _mm_mullo_epi32(u[10], cospi24);
2771 x = _mm_mullo_epi32(u[11], cospi40);
2772 v[11] = _mm_sub_epi32(v[11], x);
2773 v[11] = _mm_add_epi32(v[11], rnding);
2774 v[11] = _mm_srai_epi32(v[11], bit);
2775
2776 v[12] = _mm_mullo_epi32(u[12], cospim56);
2777 x = _mm_mullo_epi32(u[13], cospi8);
2778 v[12] = _mm_add_epi32(v[12], x);
2779 v[12] = _mm_add_epi32(v[12], rnding);
2780 v[12] = _mm_srai_epi32(v[12], bit);
2781
2782 v[13] = _mm_mullo_epi32(u[12], cospi8);
2783 x = _mm_mullo_epi32(u[13], cospim56);
2784 v[13] = _mm_sub_epi32(v[13], x);
2785 v[13] = _mm_add_epi32(v[13], rnding);
2786 v[13] = _mm_srai_epi32(v[13], bit);
2787
2788 v[14] = _mm_mullo_epi32(u[14], cospim24);
2789 x = _mm_mullo_epi32(u[15], cospi40);
2790 v[14] = _mm_add_epi32(v[14], x);
2791 v[14] = _mm_add_epi32(v[14], rnding);
2792 v[14] = _mm_srai_epi32(v[14], bit);
2793
2794 v[15] = _mm_mullo_epi32(u[14], cospi40);
2795 x = _mm_mullo_epi32(u[15], cospim24);
2796 v[15] = _mm_sub_epi32(v[15], x);
2797 v[15] = _mm_add_epi32(v[15], rnding);
2798 v[15] = _mm_srai_epi32(v[15], bit);
2799
2800 // stage 5
2801 addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2802 addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2803 addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2804 addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2805 addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2806 addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2807 addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2808 addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2809
2810 // stage 6
2811 v[0] = u[0];
2812 v[1] = u[1];
2813 v[2] = u[2];
2814 v[3] = u[3];
2815
2816 v[4] = _mm_mullo_epi32(u[4], cospi16);
2817 x = _mm_mullo_epi32(u[5], cospi48);
    v[4] = _mm_add_epi32(v[4], x);
    v[4] = _mm_add_epi32(v[4], rnding);
    v[4] = _mm_srai_epi32(v[4], bit);

    v[5] = _mm_mullo_epi32(u[4], cospi48);
    x = _mm_mullo_epi32(u[5], cospi16);
    v[5] = _mm_sub_epi32(v[5], x);
    v[5] = _mm_add_epi32(v[5], rnding);
    v[5] = _mm_srai_epi32(v[5], bit);

    v[6] = _mm_mullo_epi32(u[6], cospim48);
    x = _mm_mullo_epi32(u[7], cospi16);
    v[6] = _mm_add_epi32(v[6], x);
    v[6] = _mm_add_epi32(v[6], rnding);
    v[6] = _mm_srai_epi32(v[6], bit);

    v[7] = _mm_mullo_epi32(u[6], cospi16);
    x = _mm_mullo_epi32(u[7], cospim48);
    v[7] = _mm_sub_epi32(v[7], x);
    v[7] = _mm_add_epi32(v[7], rnding);
    v[7] = _mm_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];
    v[10] = u[10];
    v[11] = u[11];

    v[12] = _mm_mullo_epi32(u[12], cospi16);
    x = _mm_mullo_epi32(u[13], cospi48);
    v[12] = _mm_add_epi32(v[12], x);
    v[12] = _mm_add_epi32(v[12], rnding);
    v[12] = _mm_srai_epi32(v[12], bit);

    v[13] = _mm_mullo_epi32(u[12], cospi48);
    x = _mm_mullo_epi32(u[13], cospi16);
    v[13] = _mm_sub_epi32(v[13], x);
    v[13] = _mm_add_epi32(v[13], rnding);
    v[13] = _mm_srai_epi32(v[13], bit);

    v[14] = _mm_mullo_epi32(u[14], cospim48);
    x = _mm_mullo_epi32(u[15], cospi16);
    v[14] = _mm_add_epi32(v[14], x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_mullo_epi32(u[14], cospi16);
    x = _mm_mullo_epi32(u[15], cospim48);
    v[15] = _mm_sub_epi32(v[15], x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

    // stage 7
    addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
    addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

    // stage 8
    v[0] = u[0];
    v[1] = u[1];

    y = _mm_mullo_epi32(u[2], cospi32);
    x = _mm_mullo_epi32(u[3], cospi32);
    v[2] = _mm_add_epi32(y, x);
    v[2] = _mm_add_epi32(v[2], rnding);
    v[2] = _mm_srai_epi32(v[2], bit);

    v[3] = _mm_sub_epi32(y, x);
    v[3] = _mm_add_epi32(v[3], rnding);
    v[3] = _mm_srai_epi32(v[3], bit);

    v[4] = u[4];
    v[5] = u[5];

    y = _mm_mullo_epi32(u[6], cospi32);
    x = _mm_mullo_epi32(u[7], cospi32);
    v[6] = _mm_add_epi32(y, x);
    v[6] = _mm_add_epi32(v[6], rnding);
    v[6] = _mm_srai_epi32(v[6], bit);

    v[7] = _mm_sub_epi32(y, x);
    v[7] = _mm_add_epi32(v[7], rnding);
    v[7] = _mm_srai_epi32(v[7], bit);

    v[8] = u[8];
    v[9] = u[9];

    y = _mm_mullo_epi32(u[10], cospi32);
    x = _mm_mullo_epi32(u[11], cospi32);
    v[10] = _mm_add_epi32(y, x);
    v[10] = _mm_add_epi32(v[10], rnding);
    v[10] = _mm_srai_epi32(v[10], bit);

    v[11] = _mm_sub_epi32(y, x);
    v[11] = _mm_add_epi32(v[11], rnding);
    v[11] = _mm_srai_epi32(v[11], bit);

    v[12] = u[12];
    v[13] = u[13];

    y = _mm_mullo_epi32(u[14], cospi32);
    x = _mm_mullo_epi32(u[15], cospi32);
    v[14] = _mm_add_epi32(y, x);
    v[14] = _mm_add_epi32(v[14], rnding);
    v[14] = _mm_srai_epi32(v[14], bit);

    v[15] = _mm_sub_epi32(y, x);
    v[15] = _mm_add_epi32(v[15], rnding);
    v[15] = _mm_srai_epi32(v[15], bit);

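    /*
     * Editorial note (a sketch, not authoritative): stage 9 below applies
     * the final iadst16 output permutation with alternating sign flips
     * (even outputs copied, odd outputs negated).  For the row pass
     * (do_cols == 0), neg_shift_sse4_1 is assumed to fold the negation
     * together with the rounding right-shift by out_shift and the clamp to
     * [-2^(log_range_out - 1), 2^(log_range_out - 1) - 1].
     */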
    // stage 9
    if (do_cols) {
      out[0] = v[0];
      out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
      out[2] = v[12];
      out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
      out[4] = v[6];
      out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
      out[6] = v[10];
      out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
      out[8] = v[3];
      out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
      out[10] = v[15];
      out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
      out[12] = v[5];
      out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
      out[14] = v[9];
      out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
      const __m128i clamp_hi_out =
          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

      neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
      neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                       &clamp_hi_out, out_shift);
    }
  }
}

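/*
 * Editorial note on the butterfly helpers used by the idct64 stages below
 * (a sketch, not authoritative): per 32-bit lane,
 *   half_btf_sse4_1(&w0, &a, &w1, &b, &rnding, bit)
 * is assumed to compute (w0 * a + w1 * b + (1 << (bit - 1))) >> bit, a
 * fixed-point rotation with weights taken from cospi_arr(bit).  The XOR
 * index arithmetic below pairs mirrored butterflies compactly: for
 * i in [16, 20), i ^ 7 walks 16<->23, 17<->22, 18<->21, 19<->20, while
 * i ^ 15 and i ^ 8 pair the mirrored half (31<->24, 30<->25, 29<->26,
 * 28<->27).
 */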
static INLINE void idct64_stage8_sse4_1(
    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
  u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
  u[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
  u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
  u[11] = temp2;

  for (i = 16; i < 20; ++i) {
    addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
    addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
                  clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
  temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
  temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
  temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
  u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
  u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
  u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
  u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
  u[36] = temp1;
  u[37] = temp2;
  u[38] = temp3;
  u[39] = temp4;

  temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;
}

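/*
 * Editorial note: stage 9 butterflies u[0..7] against u[15..8], rotates the
 * u[20..27] block by +/-cos(pi/4) (cospi32), and relies on i ^ 15 to pair
 * u[32..39] with u[47..40] and u[48..55] with u[63..56] (e.g. 32 ^ 15 == 47
 * and 48 ^ 15 == 63).
 */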
static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rnding, int bit) {
  int i;
  __m128i temp1, temp2, temp3, temp4;
  for (i = 0; i < 8; ++i) {
    addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
  u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
  u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
  u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
  u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
  u[20] = temp1;
  u[21] = temp2;
  u[22] = temp3;
  u[23] = temp4;
  for (i = 32; i < 40; i++) {
    addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
  }

  for (i = 48; i < 56; i++) {
    addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
  }
}

static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
                                         const __m128i *cospi32,
                                         const __m128i *clamp_lo,
                                         const __m128i *clamp_hi,
                                         const __m128i *rnding, int bit) {
  __m128i temp1, temp2, temp3, temp4;
  for (int i = 0; i < 16; i++) {
    addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
  }

  temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
  u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
  u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
  u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
  u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
  u[40] = temp1;
  u[41] = temp2;
  u[42] = temp3;
  u[43] = temp4;

  temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
  temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
  temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
  temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
  u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
  u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
  u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
  u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
  u[44] = temp1;
  u[45] = temp2;
  u[46] = temp3;
  u[47] = temp4;
}

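/*
 * Editorial note: stage 11 is the final butterfly, writing the sum of
 * u[i] and u[63 - i] to out[i] and their difference to out[63 - i].  The
 * column pass (do_cols != 0) keeps full precision; the row pass rounds by
 * out_shift and clamps so the result also fits the 16-bit-oriented output
 * range log_range_out = AOMMAX(16, bd + 6).
 */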
static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
                                         int bd, int out_shift,
                                         const int log_range) {
  if (do_cols) {
    for (int i = 0; i < 32; i++) {
      addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
    }
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    for (int i = 0; i < 32; i++) {
      addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
                          &clamp_lo_out, &clamp_hi_out, out_shift);
    }
  }
}

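/*
 * Editorial note: fast path for an eob that leaves only the DC coefficient.
 * Every butterfly stage then degenerates to copies, so the kernel reduces
 * to a single scaling, x = (in[0] * cospi[32] + (1 << (bit - 1))) >> bit,
 * broadcast to all 64 outputs.  As a rough sanity check (assuming the
 * usual 12-bit tables), cospi_arr(12)[32] == 2896 ~= cos(pi/4) * 2^12.
 */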
static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);

  {
    __m128i x;

    // stage 1
    // stage 2
    // stage 3
    // stage 4
    // stage 5
    // stage 6
    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);

    // stage 7
    // stage 8
    // stage 9
    // stage 10
    // stage 11
    if (do_cols) {
      x = _mm_max_epi32(x, clamp_lo);
      x = _mm_min_epi32(x, clamp_hi);
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      x = _mm_add_epi32(x, offset);
      x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));

      x = _mm_max_epi32(x, clamp_lo_out);
      x = _mm_min_epi32(x, clamp_hi_out);
    }

    // Broadcast the single surviving value to all 64 outputs.
    for (int i = 0; i < 64; ++i) {
      out[i] = x;
    }
  }
}

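/*
 * Editorial note: fast path for an eob confined to the top-left 8x8 block
 * of coefficients, so only in[0..7] are scattered at stage 1.  The early
 * stages use half_btf_0_sse4_1, assumed to be the one-input butterfly
 * (w * a + (1 << (bit - 1))) >> bit, because the second operand of each
 * rotation is known to be zero on this path.
 */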
static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);

  {
    __m128i u[64];

    // stage 1
    u[0] = in[0];
    u[8] = in[4];
    u[16] = in[2];
    u[24] = in[6];
    u[32] = in[1];
    u[40] = in[5];
    u[48] = in[3];
    u[56] = in[7];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[38] = u[39];
    u[41] = u[40];
    u[46] = u[47];
    u[49] = u[48];
    u[54] = u[55];
    u[57] = u[56];
    u[62] = u[63];

    // stage 4
    __m128i temp1, temp2;
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[17] = u[16];
    u[22] = u[23];
    u[25] = u[24];
    u[30] = u[31];

    temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = temp1;

    temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = temp2;

    temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = temp1;

    temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[46] = temp2;

    // stage 5
    u[9] = u[8];
    u[14] = u[15];

    temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = temp1;

    temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[22] = temp2;

    u[35] = u[32];
    u[34] = u[33];
    u[36] = u[39];
    u[37] = u[38];
    u[43] = u[40];
    u[42] = u[41];
    u[44] = u[47];
    u[45] = u[46];
    u[51] = u[48];
    u[50] = u[49];
    u[52] = u[55];
    u[53] = u[54];
    u[59] = u[56];
    u[58] = u[57];
    u[60] = u[63];
    u[61] = u[62];

    // stage 6
    temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = temp1;

    temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = temp2;
    u[19] = u[16];
    u[18] = u[17];
    u[20] = u[23];
    u[21] = u[22];
    u[27] = u[24];
    u[26] = u[25];
    u[28] = u[31];
    u[29] = u[30];

    temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = temp1;
    temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[35] = temp2;
    temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[36] = temp1;
    temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[37] = temp2;
    temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = temp1;
    temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[43] = temp2;
    temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[44] = temp1;
    temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[45] = temp2;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    u[11] = u[8];
    u[10] = u[9];
    u[12] = u[15];
    u[13] = u[14];

    temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = temp1;
    temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[19] = temp2;
    temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[20] = temp1;
    temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[21] = temp2;
    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    u[7] = u[0];
    u[6] = u[1];
    u[5] = u[2];
    u[4] = u[3];

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
  }
}

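/*
 * Editorial note: same structure as the low8 path above, but with the
 * first 16 coefficients retained, so stages 2-5 keep twice as many
 * non-trivial rotations before handing over to the shared stage 8-11
 * helpers.
 */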
static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  int i, j;
  const int32_t *cospi = cospi_arr(bit);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

  const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi63 = _mm_set1_epi32(cospi[63]);

  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
  const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);

  {
    __m128i u[64];
    __m128i tmp1, tmp2, tmp3, tmp4;
    // stage 1
    u[0] = in[0];
    u[32] = in[1];
    u[36] = in[9];
    u[40] = in[5];
    u[44] = in[13];
    u[48] = in[3];
    u[52] = in[11];
    u[56] = in[7];
    u[60] = in[15];
    u[16] = in[2];
    u[20] = in[10];
    u[24] = in[6];
    u[28] = in[14];
    u[4] = in[8];
    u[8] = in[4];
    u[12] = in[12];

    // stage 2
    u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
    u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
    u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
    u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
    u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
    u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
    u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
    u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
    u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
    u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
    u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
    u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
    u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
    u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
    u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
    u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);

    // stage 3
    u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
    u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
    u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
    u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
    u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
    u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
    u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
    u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
    u[33] = u[32];
    u[34] = u[35];
    u[37] = u[36];
    u[38] = u[39];
    u[41] = u[40];
    u[42] = u[43];
    u[45] = u[44];
    u[46] = u[47];
    u[49] = u[48];
    u[50] = u[51];
    u[53] = u[52];
    u[54] = u[55];
    u[57] = u[56];
    u[58] = u[59];
    u[61] = u[60];
    u[62] = u[63];

    // stage 4
    u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
    u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
    u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
    u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

    u[17] = u[16];
    u[18] = u[19];
    u[21] = u[20];
    u[22] = u[23];
    u[25] = u[24];
    u[26] = u[27];
    u[29] = u[28];
    u[30] = u[31];

    tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
    u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
    u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
    u[33] = tmp1;
    u[34] = tmp2;
    u[37] = tmp3;
    u[38] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
    u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
    u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
    u[41] = tmp1;
    u[42] = tmp2;
    u[45] = tmp3;
    u[46] = tmp4;

    // stage 5
    u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
    u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);

    u[9] = u[8];
    u[10] = u[11];
    u[13] = u[12];
    u[14] = u[15];

    tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
    u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
    u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
    u[17] = tmp1;
    u[18] = tmp2;
    u[21] = tmp3;
    u[22] = tmp4;

    for (i = 32; i < 64; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    // stage 6
    tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
    u[0] = tmp1;
    u[5] = u[4];
    u[6] = u[7];

    tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
    u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
    u[9] = tmp1;
    tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
    u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
    u[10] = tmp2;

    for (i = 16; i < 32; i += 8) {
      addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
                    &clamp_hi);

      addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
                    &clamp_hi);
      addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
                    &clamp_hi);
    }

    tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
    u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
    u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
    u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
    u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
    u[34] = tmp1;
    u[35] = tmp2;
    u[36] = tmp3;
    u[37] = tmp4;

    tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
    u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
    u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
    u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
    u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
    u[42] = tmp1;
    u[43] = tmp2;
    u[44] = tmp3;
    u[45] = tmp4;

    // stage 7
    u[3] = u[0];
    u[2] = u[1];
    tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
    u[5] = tmp1;
    addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
    addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

    tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
    tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
    tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
    tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
    u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
    u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
    u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
    u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
    u[18] = tmp1;
    u[19] = tmp2;
    u[20] = tmp3;
    u[21] = tmp4;

    for (i = 32; i < 64; i += 16) {
      for (j = i; j < i + 4; j++) {
        addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
        addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
                      &clamp_hi);
      }
    }

    // stage 8
    for (i = 0; i < 4; ++i) {
      addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
    }

    idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                         &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);

    // stage 9
    idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                         bit);

    // stage 10
    idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
                          bit);

    // stage 11
    idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
  }
}

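/*
 * Editorial note: the general path keeps all 64 intermediate lanes live
 * and ping-pongs between the u[] and v[] arrays from stage to stage,
 * avoiding the temp-variable shuffles the low-eob paths above need when
 * updating in place.
 */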
idct64x64_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)3765 static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3766 int bd, int out_shift) {
3767 int i, j;
3768 const int32_t *cospi = cospi_arr(bit);
3769 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3770 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3771 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3772 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3773
3774 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3775 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3776 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3777 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3778 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3779 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3780 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3781 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3782 const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3783 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3784 const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3785 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3786 const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3787 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3788 const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3789 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3790 const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
3791 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
3792 const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
3793 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3794 const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
3795 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
3796 const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
3797 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3798 const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
3799 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
3800 const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
3801 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3802 const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
3803 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
3804 const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
3805 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3806 const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
3807 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3808 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
3809 const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
3810 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3811 const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
3812 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3813 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
3814 const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
3815 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3816 const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3817 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3818 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3819 const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3820 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3821 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3822 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3823 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3824 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3825
3826 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3827 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3828 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3829 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3830 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3831 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3832 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3833 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3834 const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
3835 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
3836 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3837 const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
3838 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3839 const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
3840 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
3841 const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
3842 const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
3843 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3844 const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
3845 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
3846 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3847 const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
3848 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3849 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3850 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3851 const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
3852 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3853
3854 {
3855 __m128i u[64], v[64];
3856
3857 // stage 1
3858 u[32] = in[1];
3859 u[34] = in[17];
3860 u[36] = in[9];
3861 u[38] = in[25];
3862 u[40] = in[5];
3863 u[42] = in[21];
3864 u[44] = in[13];
3865 u[46] = in[29];
3866 u[48] = in[3];
3867 u[50] = in[19];
3868 u[52] = in[11];
3869 u[54] = in[27];
3870 u[56] = in[7];
3871 u[58] = in[23];
3872 u[60] = in[15];
3873 u[62] = in[31];
3874
3875 v[16] = in[2];
3876 v[18] = in[18];
3877 v[20] = in[10];
3878 v[22] = in[26];
3879 v[24] = in[6];
3880 v[26] = in[22];
3881 v[28] = in[14];
3882 v[30] = in[30];
3883
3884 u[8] = in[4];
3885 u[10] = in[20];
3886 u[12] = in[12];
3887 u[14] = in[28];
3888
3889 v[4] = in[8];
3890 v[6] = in[24];
3891
3892 u[0] = in[0];
3893 u[2] = in[16];
3894
3895 // stage 2
3896 v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3897 v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
3898 v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
3899 v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
3900 v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
3901 v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
3902 v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
3903 v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3904 v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3905 v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
3906 v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
3907 v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
3908 v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
3909 v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
3910 v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
3911 v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3912 v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3913 v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
3914 v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
3915 v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
3916 v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
3917 v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
3918 v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
3919 v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3920 v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3921 v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
3922 v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
3923 v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
3924 v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
3925 v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
3926 v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
3927 v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3928
3929 // stage 3
3930 u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
3931 u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
3932 u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
3933 u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
3934 u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
3935 u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
3936 u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
3937 u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
3938 u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
3939 u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
3940 u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
3941 u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
3942 u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
3943 u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
3944 u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
3945 u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
3946
3947 for (i = 32; i < 64; i += 4) {
3948 addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3949 &clamp_hi);
3950 addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3951 &clamp_hi);
3952 }
3953
3954 // stage 4
3955 v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3956 v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
3957 v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
3958 v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
3959 v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
3960 v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
3961 v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
3962 v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3963
3964 for (i = 16; i < 32; i += 4) {
3965 addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3966 &clamp_hi);
3967 addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3968 &clamp_hi);
3969 }
3970
3971 for (i = 32; i < 64; i += 4) {
3972 v[i + 0] = u[i + 0];
3973 v[i + 3] = u[i + 3];
3974 }
3975
3976 v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3977 v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3978 v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3979 v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3980 v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3981 v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3982 v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3983 v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3984 v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3985 v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3986 v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3987 v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3988 v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3989 v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3990 v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3991 v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3992
3993 // stage 5
3994 u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
3995 u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
3996 u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
3997 u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
3998
3999 for (i = 8; i < 16; i += 4) {
4000 addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4001 &clamp_hi);
4002 addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4003 &clamp_hi);
4004 }
4005
4006 for (i = 16; i < 32; i += 4) {
4007 u[i + 0] = v[i + 0];
4008 u[i + 3] = v[i + 3];
4009 }
4010
4011 u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
4012 u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
4013 u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
4014 u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
4015 u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
4016 u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
4017 u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
4018 u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
4019
4020 for (i = 32; i < 64; i += 8) {
4021 addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
4022 &clamp_hi);
4023 addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
4024 &clamp_hi);
4025
4026 addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
4027 &clamp_hi);
4028 addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
4029 &clamp_hi);
4030 }
4031
4032 // stage 6
4033 v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4034 v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4035 v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
4036 v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
4037
4038 addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
4039 addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
4040
4041 for (i = 8; i < 16; i += 4) {
4042 v[i + 0] = u[i + 0];
4043 v[i + 3] = u[i + 3];
4044 }
4045
4046 v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
4047 v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
4048 v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
4049 v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
4050
4051 for (i = 16; i < 32; i += 8) {
4052 addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
4053 &clamp_hi);
4054 addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
4055 &clamp_hi);
4056
4057 addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
4058 &clamp_hi);
4059 addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
4060 &clamp_hi);
4061 }
4062
4063 for (i = 32; i < 64; i += 8) {
4064 v[i + 0] = u[i + 0];
4065 v[i + 1] = u[i + 1];
4066 v[i + 6] = u[i + 6];
4067 v[i + 7] = u[i + 7];
4068 }
4069
4070 v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
4071 v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
4072 v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
4073 v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
4074 v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
4075 v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
4076 v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
4077 v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
4078 v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
4079 v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
4080 v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
4081 v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
4082 v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
4083 v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
4084 v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
4085 v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
4086
4087 // stage 7
4088 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
4089 addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
4090
4091 u[4] = v[4];
4092 u[7] = v[7];
4093 u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
4094 u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
4095
4096 addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
4097 addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
4098 addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
4099 addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
4100
4101 for (i = 16; i < 32; i += 8) {
4102 u[i + 0] = v[i + 0];
4103 u[i + 1] = v[i + 1];
4104 u[i + 6] = v[i + 6];
4105 u[i + 7] = v[i + 7];
4106 }
4107
4108 u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
4109 u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
4110 u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
4111 u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
4112 u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
4113 u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
4114 u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
4115 u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
4116
4117 for (i = 32; i < 64; i += 16) {
4118 for (j = i; j < i + 4; j++) {
4119 addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
4120 addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
4121 &clamp_hi);
4122 }
4123 }
4124
4125 // stage 8
4126 for (i = 0; i < 4; ++i) {
4127 addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
4128 }
4129
4130 v[8] = u[8];
4131 v[9] = u[9];
4132 v[14] = u[14];
4133 v[15] = u[15];
4134
4135 v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
4136 v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
4137 v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
4138 v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
4139
4140 for (i = 16; i < 20; ++i) {
4141 addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4142 addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4143 &clamp_hi);
4144 }
4145
4146 for (i = 32; i < 36; ++i) {
4147 v[i] = u[i];
4148 v[i + 12] = u[i + 12];
4149 v[i + 16] = u[i + 16];
4150 v[i + 28] = u[i + 28];
4151 }
4152
4153 v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
4154 v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
4155 v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
4156 v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
4157 v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
4158 v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
4159 v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
4160 v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
4161 v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
4162 v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
4163 v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
4164 v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4165 v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4166 v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4167 v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4168 v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4169
4170 // stage 9
4171 for (i = 0; i < 8; ++i) {
4172 addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4173 }
4174
4175 for (i = 16; i < 20; ++i) {
4176 u[i] = v[i];
4177 u[i + 12] = v[i + 12];
4178 }
4179
4180 u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4181 u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4182 u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4183 u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4184 u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4185 u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4186 u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4187 u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4188
    for (i = 32; i < 40; i++) {
      addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
    }

    for (i = 48; i < 56; i++) {
      addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
    }

    // stage 10
    for (i = 0; i < 16; i++) {
      addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
    }

    for (i = 32; i < 40; i++) v[i] = u[i];

    v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

    // stage 11
    if (do_cols) {
      for (i = 0; i < 32; i++) {
        addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[i], &out[63 - i]);
      }
    } else {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
          -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
      const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
          (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

      for (i = 0; i < 32; i++) {
        addsub_shift_sse4_1(v[i], v[63 - i], &out[i], &out[63 - i],
                            &clamp_lo_out, &clamp_hi_out, out_shift);
      }
    }
  }
}

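// Reduced 32x32 inverse DCT for the case where only the DC coefficient
// (in[0]) can be nonzero. Every stage of the full butterfly network then
// carries the same value, so the kernel simply scales DC by cospi[32] with
// rounding, applies the same clamp/shift the full kernel would apply, and
// broadcasts the result to all 32 outputs.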
static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1;

  // stage 0
  // stage 1
  bf1 = in[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);

  // stage 6
  // stage 7
  // stage 8
  // stage 9
  if (do_cols) {
    bf1 = _mm_max_epi32(bf1, clamp_lo);
    bf1 = _mm_min_epi32(bf1, clamp_hi);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    bf1 = _mm_add_epi32(bf1, offset);
    bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
    bf1 = _mm_max_epi32(bf1, clamp_lo_out);
    bf1 = _mm_min_epi32(bf1, clamp_hi_out);
  }
  for (int i = 0; i < 32; ++i) out[i] = bf1;
}

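// 32x32 inverse DCT specialized for an eob that guarantees only the first
// eight input vectors are nonzero. Butterflies whose second operand is known
// to be zero collapse to single multiplies (half_btf_0_sse4_1), and the later
// stages fall through to the shared idct32_stage[4-9] helpers.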
static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[4] = in[4];
  bf1[8] = in[2];
  bf1[12] = in[6];
  bf1[16] = in[1];
  bf1[20] = in[5];
  bf1[24] = in[3];
  bf1[28] = in[7];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);

  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
  bf1[17] = bf1[16];
  bf1[18] = bf1[19];
  bf1[21] = bf1[20];
  bf1[22] = bf1[23];
  bf1[25] = bf1[24];
  bf1[26] = bf1[27];
  bf1[29] = bf1[28];
  bf1[30] = bf1[31];

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);

  bf1[9] = bf1[8];
  bf1[10] = bf1[11];
  bf1[13] = bf1[12];
  bf1[14] = bf1[15];

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[5] = bf1[4];
  bf1[6] = bf1[7];

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  bf1[3] = bf1[0];
  bf1[2] = bf1[1];

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
}

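// 32x32 inverse DCT specialized for at most the first sixteen nonzero input
// vectors. Stages 2-5 still use the single-input butterfly shortcut wherever
// one operand is guaranteed zero; afterwards the shared stage helpers take
// over, identical to the full-length kernel.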
static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32];

  // stage 0
  // stage 1

  bf1[0] = in[0];
  bf1[2] = in[8];
  bf1[4] = in[4];
  bf1[6] = in[12];
  bf1[8] = in[2];
  bf1[10] = in[10];
  bf1[12] = in[6];
  bf1[14] = in[14];
  bf1[16] = in[1];
  bf1[18] = in[9];
  bf1[20] = in[5];
  bf1[22] = in[13];
  bf1[24] = in[3];
  bf1[26] = in[11];
  bf1[28] = in[7];
  bf1[30] = in[15];

  // stage 2
  bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
  bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
  bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
  bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
  bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
  bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
  bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
  bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
  bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
  bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
  bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
  bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
  bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
  bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
  bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
  bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);

  // stage 3
  bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
  bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
  bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
  bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
  bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
  bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
  bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
  bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);

  addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
  bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
  bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
  bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);

  idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
                       &cospi24, &cospi40, &cospim24, &rounding, bit);

  // stage 5
  bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
  bf1[1] = bf1[0];
  bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
  bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);

  addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);

  idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
                       &clamp_hi, &rounding, bit);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);

  idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
                       &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);

  // stage 7
  idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 8
  idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
                       &rounding, bit);

  // stage 9
  idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
}

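// Full 32-point inverse DCT over all 32 input vectors. bf1 and bf0 act as
// ping-pong buffers between consecutive butterfly stages; intermediate
// results are clamped to the bit-depth-dependent [clamp_lo, clamp_hi] range,
// and the final stage either emits results unshifted (column pass) or
// applies the rounded out_shift (row pass).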
static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                             int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i bf1[32], bf0[32];

  // stage 0
  // stage 1
  bf1[0] = in[0];
  bf1[1] = in[16];
  bf1[2] = in[8];
  bf1[3] = in[24];
  bf1[4] = in[4];
  bf1[5] = in[20];
  bf1[6] = in[12];
  bf1[7] = in[28];
  bf1[8] = in[2];
  bf1[9] = in[18];
  bf1[10] = in[10];
  bf1[11] = in[26];
  bf1[12] = in[6];
  bf1[13] = in[22];
  bf1[14] = in[14];
  bf1[15] = in[30];
  bf1[16] = in[1];
  bf1[17] = in[17];
  bf1[18] = in[9];
  bf1[19] = in[25];
  bf1[20] = in[5];
  bf1[21] = in[21];
  bf1[22] = in[13];
  bf1[23] = in[29];
  bf1[24] = in[3];
  bf1[25] = in[19];
  bf1[26] = in[11];
  bf1[27] = in[27];
  bf1[28] = in[7];
  bf1[29] = in[23];
  bf1[30] = in[15];
  bf1[31] = in[31];

  // stage 2
  bf0[0] = bf1[0];
  bf0[1] = bf1[1];
  bf0[2] = bf1[2];
  bf0[3] = bf1[3];
  bf0[4] = bf1[4];
  bf0[5] = bf1[5];
  bf0[6] = bf1[6];
  bf0[7] = bf1[7];
  bf0[8] = bf1[8];
  bf0[9] = bf1[9];
  bf0[10] = bf1[10];
  bf0[11] = bf1[11];
  bf0[12] = bf1[12];
  bf0[13] = bf1[13];
  bf0[14] = bf1[14];
  bf0[15] = bf1[15];
  bf0[16] =
      half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
  bf0[17] =
      half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
  bf0[18] =
      half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
  bf0[30] =
      half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
  bf0[31] =
      half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);

  // stage 3
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] =
      half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
  bf1[9] =
      half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
  bf1[15] =
      half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);

  addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);

  // stage 4
  bf0[0] = bf1[0];
  bf0[1] = bf1[1];
  bf0[2] = bf1[2];
  bf0[3] = bf1[3];
  bf0[4] =
      half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
  bf0[5] =
      half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
  bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);

  addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);

  bf0[16] = bf1[16];
  bf0[17] =
      half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
  bf0[18] =
      half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
  bf0[19] = bf1[19];
  bf0[20] = bf1[20];
  bf0[21] =
      half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] =
      half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
  bf0[27] = bf1[27];
  bf0[28] = bf1[28];
  bf0[29] =
      half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
  bf0[30] =
      half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
  bf0[31] = bf1[31];

  // stage 5
  bf1[0] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
  bf1[1] =
      half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
  bf1[2] =
      half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
  bf1[3] =
      half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
  addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] =
      half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
  bf1[10] =
      half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] =
      half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
  bf1[14] =
      half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
  bf0[4] = bf1[4];
  bf0[5] =
      half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[6] =
      half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
  bf0[7] = bf1[7];
  addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] =
      half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
  bf0[19] =
      half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
  bf0[20] =
      half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
  bf0[22] = bf1[22];
  bf0[23] = bf1[23];
  bf0[24] = bf1[24];
  bf0[25] = bf1[25];
  bf0[26] =
      half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
  bf0[28] =
      half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
  bf0[29] =
      half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 7
  addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] =
      half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[11] =
      half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[12] =
      half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
  bf1[13] =
      half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);

  // stage 8
  addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
  bf0[16] = bf1[16];
  bf0[17] = bf1[17];
  bf0[18] = bf1[18];
  bf0[19] = bf1[19];
  bf0[20] =
      half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[21] =
      half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[22] =
      half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[23] =
      half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[24] =
      half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
  bf0[25] =
      half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
  bf0[26] =
      half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
  bf0[27] =
      half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
  bf0[28] = bf1[28];
  bf0[29] = bf1[29];
  bf0[30] = bf1[30];
  bf0[31] = bf1[31];

  // stage 9
  if (do_cols) {
    addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
    addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
    addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
    addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
    addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
    addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
    addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
    addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
    addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
    addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
    addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
    addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
    addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
    addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
    addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
    addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));

    addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
    addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
                        &clamp_hi_out, out_shift);
  }
}

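// The wrappers below choose between the SSE4.1 kernels and the C reference
// implementation per transform type: the SIMD path covers the 2-D DCT/ADST
// combinations, while the identity and single-direction types fall back to C.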
void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    // Assembly version doesn't support some transform types, so use C version
    // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                               bd);
      break;
    default:
      av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input,
                                         uint8_t *dest, int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    // Assembly version doesn't support some transform types, so use C version
    // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                tx_type, bd);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input,
                                         uint8_t *dest, int stride,
                                         const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    // Assembly version doesn't support some transform types, so use C version
    // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                tx_type, bd);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
                                          uint8_t *dest, int stride,
                                          const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    // Assembly version doesn't support some transform types, so use C version
    // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 tx_type, bd);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
  }
}

void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
                                          uint8_t *dest, int stride,
                                          const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const int32_t *src = cast_to_int32(input);
  switch (tx_type) {
    case DCT_DCT:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
                                                txfm_param->tx_size,
                                                txfm_param->eob, bd);
      break;
    // Assembly version doesn't support IDTX, so use C version for it.
    case IDTX:
      av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 tx_type, bd);
      break;
    default: assert(0);
  }
}

void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
                                        int stride,
                                        const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  int eob = txfm_param->eob;
  int bd = txfm_param->bd;
  int lossless = txfm_param->lossless;
  const int32_t *src = cast_to_int32(input);
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (lossless) {
    assert(tx_type == DCT_DCT);
    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
    return;
  }
  switch (tx_type) {
    // Assembly version doesn't support some transform types, so use C version
    // for those.
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST:
    case V_FLIPADST:
    case H_FLIPADST:
    case IDTX:
      av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                               bd);
      break;
    default:
      av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, bd);
      break;
  }
}

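// Kernel lookup table indexed as [tx size][1-D transform family][eob bucket].
// The last index selects an increasingly complete kernel (_low1, _low8,
// _low16, full) as the last nonzero coefficient moves deeper into the block;
// NULL entries are combinations never dispatched through this table.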
static const transform_1d_sse4_1
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
            NULL },
          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
            NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
          idct32x32_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
          idct64x64_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

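// 2-D inverse transform as two 1-D passes over 4-wide __m128i vectors: the
// row pass reads only the nonzero region indicated by eobx/eoby, transposing
// 4x4 tiles so the 1-D kernels always walk contiguous vectors, and the
// column pass then transforms buf1 in place before the result is added to
// the prediction in `output`.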
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
    __m128i buf0[64];
    const int32_t *input_row = input + i * input_stride * 4;
    for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);

      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
    }
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(
          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
    }
    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

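// Table-driven entry point for the transform types without an identity
// component; identity hybrids are routed to C by the per-size wrappers above.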
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    default: assert(0); break;
  }
}

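// Top-level highbitdepth inverse transform and reconstruction dispatcher.
// Sizes with a dedicated SSE4.1 wrapper in this file use it; the remaining
// sizes go through the generic av1_highbd_inv_txfm_add_* helpers or straight
// to the universe routine.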
void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_32X32:
      av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X16:
      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X8:
      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
      break;
    case TX_8X16:
      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X8:
      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X32:
      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
      break;
    case TX_32X16:
      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
      break;
    case TX_32X64:
      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
      break;
    case TX_64X32:
      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
      break;
    case TX_4X4:
      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X4:
      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
      break;
    case TX_4X16:
      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
      break;
    case TX_8X32:
      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
      break;
    case TX_32X8:
      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
      break;
    case TX_64X64:
    case TX_16X64:
    case TX_64X16:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(
          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
          txfm_param->eob, txfm_param->bd);
      break;
    default: assert(0 && "Invalid transform size"); break;
  }
}