/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

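// RECON_AND_STORE4X4 reconstructs one 4-pixel row: load 4 bytes from dest,
// zero-extend them to 16 bits, add the 16-bit residual row in_x, saturate
// back to unsigned 8 bits, and store the 4 bytes. It expects a __m128i
// named "zero" to be in scope at the call site.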
#define RECON_AND_STORE4X4(dest, in_x) \
  { \
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    *(int *)(dest) = _mm_cvtsi128_si32(d0); \
  }

void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16(
      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = load_input_data(input);
  input2 = load_input_data(input + 8);

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
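  // (0xd8 selects words {0, 2, 1, 3} within each 64-bit half, so each 32-bit
  // lane pairs the coefficients that the stage-1 pmaddwd multiplies together.)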
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap the two 64-bit halves of input1 (0x4e picks dwords {2, 3, 0, 1});
  // after the add/sub below, input2 holds columns 1 and 0, and input3 holds
  // columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // As above, swap the two 64-bit halves of input1 before the final add/sub.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

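  // DC-only shortcut: with a single nonzero coefficient, both 1-D passes
  // reduce to a multiply by cospi_16_64, so every output pixel receives the
  // same offset "a" after the final ROUND_POWER_OF_TWO by 4.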
  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}

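// In-place 4x4 transpose of two packed registers: on input res[0] holds rows
// 0-1 and res[1] rows 2-3; on output res[0] holds columns 0-1 and res[1]
// columns 2-3.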
static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);
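  // The low half of in7 now holds x0 - x2 + x3, the s7 term of the scalar
  // iadst4; the high half is don't-care and is dropped by the unpacklo below.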

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

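// TRANSPOSE_8X8 transposes eight rows of eight 16-bit values using three
// rounds of interleaves: 16-bit unpacks build 2x2 blocks, 32-bit unpacks
// build 4x4 blocks, and 64-bit unpacks assemble the transposed rows.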
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

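// TRANSPOSE_8X8_10 transposes only the top-left 4x4 block (sufficient when
// the remaining coefficients are zero): out0 receives columns 0-1 and out1
// columns 2-3 of the transposed block.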
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Macro for multiplying elements by constants and adding them together.
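// Each resN holds eight lanes of
//   (a * cN_even + b * cN_odd + DCT_CONST_ROUNDING) >> DCT_CONST_BITS,
// where (a, b) are the interleaved 16-bit pairs in lo_*/hi_* and cN is the
// matching pair_set_epi16 constant: pmaddwd forms the 32-bit dot products,
// which are then rounded, shifted, and packed back to 16 bits.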
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, \
                               res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }

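// IDCT8 is the 4-stage 8-point butterfly. Stages 1-2 apply the constant
// rotations to the odd and even inputs, stage 3 mixes the even half and
// rotates stp2_5/stp2_6 by cospi_16_64, and stage 4 produces the outputs
// with saturating add/sub pairs.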
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }

void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);
  in4 = load_input_data(input + 8 * 4);
  in5 = load_input_data(input + 8 * 5);
  in6 = load_input_data(input + 8 * 6);
  in7 = load_input_data(input + 8 * 7);

  // 2-D
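  // Two passes of (transpose + 1-D IDCT8): the first pass operates on rows,
  // the second on columns.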
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}

void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}

void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
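  // This path assumes at most the first 12 coefficients are nonzero, all of
  // which fall in the top-left 4x4 block, so the row pass works on half-width
  // data and the column pass feeds zeros for inputs 4-7.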
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

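    // tmp1 holds two different 4-wide results in its 64-bit halves; split
    // them into stp2_5 and stp2_6.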
    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

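// IDCT16 is the 6-stage 16-point butterfly over in[0..15]; the final add/sub
// of the even and odd halves (stage 7) is written out at each call site.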
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

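// IDCT16_10 is the reduced 16-point butterfly for the case where only
// coefficients in in[0..3] can be nonzero; stages that would multiply zeros
// collapse to register copies.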
#define IDCT16_10 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                           stg2_0, stg2_1, stg2_6, stg2_7, \
                           stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                             stg3_0, stg3_1, \
                             stp2_4, stp2_7) \
    \
    stp1_9 = stp1_8_0; \
    stp1_10 = stp1_11; \
    \
    stp1_13 = stp1_12_0; \
    stp1_14 = stp1_15; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                             stg4_0, stg4_1, \
                             stp1_0, stp1_1) \
    stp2_5 = stp2_4; \
    stp2_6 = stp2_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_2 = stp1_1; \
    stp1_3 = stp1_0; \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = load_input_data(input);
    in[8] = load_input_data(input + 8 * 1);
    in[1] = load_input_data(input + 8 * 2);
    in[9] = load_input_data(input + 8 * 3);
    in[2] = load_input_data(input + 8 * 4);
    in[10] = load_input_data(input + 8 * 5);
    in[3] = load_input_data(input + 8 * 6);
    in[11] = load_input_data(input + 8 * 7);
    in[4] = load_input_data(input + 8 * 8);
    in[12] = load_input_data(input + 8 * 9);
    in[5] = load_input_data(input + 8 * 10);
    in[13] = load_input_data(input + 8 * 11);
    in[6] = load_input_data(input + 8 * 12);
    in[14] = load_input_data(input + 8 * 13);
    in[7] = load_input_data(input + 8 * 14);
    in[15] = load_input_data(input + 8 * 15);
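    // Even source rows land in in[0..7] and odd rows in in[8..15] so that
    // the two 8x8 transposes below leave the registers in the order IDCT16
    // expects.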
1238
1239 array_transpose_8x8(in, in);
1240 array_transpose_8x8(in + 8, in + 8);
1241
1242 IDCT16
1243
1244 // Stage7
1245 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1246 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1247 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1248 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1249 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1250 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1251 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1252 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1253 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1254 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1255 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1256 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1257 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1258 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1259 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1260 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1261
1262 curr1 = r;
1263 input += 128;
1264 }
1265 for (i = 0; i < 2; i++) {
1266 int j;
1267 // 1-D idct
1268 array_transpose_8x8(l + i * 8, in);
1269 array_transpose_8x8(r + i * 8, in + 8);
1270
1271 IDCT16
1272
1273 // 2-D
1274 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1275 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1276 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1277 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1278 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1279 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1280 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1281 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1282 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1283 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1284 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1285 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1286 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1287 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1288 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1289 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1290
1291 for (j = 0; j < 16; ++j) {
1292 // Final rounding and shift
1293 in[j] = _mm_adds_epi16(in[j], final_rounding);
1294 in[j] = _mm_srai_epi16(in[j], 6);
1295 RECON_AND_STORE(dest + j * stride, in[j]);
1296 }
1297
1298 dest += 8;
1299 }
1300 }
1301
vpx_idct16x16_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1302 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1303 int stride) {
1304 __m128i dc_value;
1305 const __m128i zero = _mm_setzero_si128();
1306 int a, i;
1307
1308 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
1309 a = (int)dct_const_round_shift(a * cospi_16_64);
1310 a = ROUND_POWER_OF_TWO(a, 6);
1311
1312 dc_value = _mm_set1_epi16(a);
1313
1314 for (i = 0; i < 16; ++i) {
1315 RECON_AND_STORE(dest + 0, dc_value);
1316 RECON_AND_STORE(dest + 8, dc_value);
1317 dest += stride;
1318 }
1319 }
1320
iadst16_8col(__m128i * in)1321 static void iadst16_8col(__m128i *in) {
1322 // perform 16x16 1-D ADST for 8 columns
1323 __m128i s[16], x[16], u[32], v[32];
1324 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1325 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1326 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1327 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1328 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1329 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1330 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1331 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1332 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1333 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1334 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1335 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1336 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1337 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1338 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1339 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1340 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1341 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1342 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1343 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1344 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1345 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1346 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1347 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1348 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1349 const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1350 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1351 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1352 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1353 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1354 const __m128i kZero = _mm_set1_epi16(0);
1355
1356 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1357 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1358 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1359 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1360 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1361 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1362 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1363 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1364 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1365 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1366 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1367 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1368 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1369 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1370 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1371 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1372
1373 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1374 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1375 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1376 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1377 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1378 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1379 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1380 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1381 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1382 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1383 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1384 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1385 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1386 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1387 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1388 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1389 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1390 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1391 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1392 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1393 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1394 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1395 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1396 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1397 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1398 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1399 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1400 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1401 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1402 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1403 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1404 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1405
1406 u[0] = _mm_add_epi32(v[0], v[16]);
1407 u[1] = _mm_add_epi32(v[1], v[17]);
1408 u[2] = _mm_add_epi32(v[2], v[18]);
1409 u[3] = _mm_add_epi32(v[3], v[19]);
1410 u[4] = _mm_add_epi32(v[4], v[20]);
1411 u[5] = _mm_add_epi32(v[5], v[21]);
1412 u[6] = _mm_add_epi32(v[6], v[22]);
1413 u[7] = _mm_add_epi32(v[7], v[23]);
1414 u[8] = _mm_add_epi32(v[8], v[24]);
1415 u[9] = _mm_add_epi32(v[9], v[25]);
1416 u[10] = _mm_add_epi32(v[10], v[26]);
1417 u[11] = _mm_add_epi32(v[11], v[27]);
1418 u[12] = _mm_add_epi32(v[12], v[28]);
1419 u[13] = _mm_add_epi32(v[13], v[29]);
1420 u[14] = _mm_add_epi32(v[14], v[30]);
1421 u[15] = _mm_add_epi32(v[15], v[31]);
1422 u[16] = _mm_sub_epi32(v[0], v[16]);
1423 u[17] = _mm_sub_epi32(v[1], v[17]);
1424 u[18] = _mm_sub_epi32(v[2], v[18]);
1425 u[19] = _mm_sub_epi32(v[3], v[19]);
1426 u[20] = _mm_sub_epi32(v[4], v[20]);
1427 u[21] = _mm_sub_epi32(v[5], v[21]);
1428 u[22] = _mm_sub_epi32(v[6], v[22]);
1429 u[23] = _mm_sub_epi32(v[7], v[23]);
1430 u[24] = _mm_sub_epi32(v[8], v[24]);
1431 u[25] = _mm_sub_epi32(v[9], v[25]);
1432 u[26] = _mm_sub_epi32(v[10], v[26]);
1433 u[27] = _mm_sub_epi32(v[11], v[27]);
1434 u[28] = _mm_sub_epi32(v[12], v[28]);
1435 u[29] = _mm_sub_epi32(v[13], v[29]);
1436 u[30] = _mm_sub_epi32(v[14], v[30]);
1437 u[31] = _mm_sub_epi32(v[15], v[31]);
1438
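// Round and narrow: adding DCT_CONST_ROUNDING and then shifting right by
// DCT_CONST_BITS implements round-to-nearest on each 32-bit lane.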
1439 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1440 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1441 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1442 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1443 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1444 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1445 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1446 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1447 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1448 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1449 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1450 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1451 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1452 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1453 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1454 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1455 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1456 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1457 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1458 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1459 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1460 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1461 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1462 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1463 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1464 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1465 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1466 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1467 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1468 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1469 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1470 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1471
1472 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1473 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1474 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1475 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1476 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1477 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1478 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1479 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1480 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1481 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1482 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1483 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1484 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1485 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1486 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1487 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1488 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1489 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1490 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1491 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1492 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1493 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1494 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1495 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1496 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1497 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1498 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1499 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1500 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1501 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1502 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1503 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1504
1505 s[0] = _mm_packs_epi32(u[0], u[1]);
1506 s[1] = _mm_packs_epi32(u[2], u[3]);
1507 s[2] = _mm_packs_epi32(u[4], u[5]);
1508 s[3] = _mm_packs_epi32(u[6], u[7]);
1509 s[4] = _mm_packs_epi32(u[8], u[9]);
1510 s[5] = _mm_packs_epi32(u[10], u[11]);
1511 s[6] = _mm_packs_epi32(u[12], u[13]);
1512 s[7] = _mm_packs_epi32(u[14], u[15]);
1513 s[8] = _mm_packs_epi32(u[16], u[17]);
1514 s[9] = _mm_packs_epi32(u[18], u[19]);
1515 s[10] = _mm_packs_epi32(u[20], u[21]);
1516 s[11] = _mm_packs_epi32(u[22], u[23]);
1517 s[12] = _mm_packs_epi32(u[24], u[25]);
1518 s[13] = _mm_packs_epi32(u[26], u[27]);
1519 s[14] = _mm_packs_epi32(u[28], u[29]);
1520 s[15] = _mm_packs_epi32(u[30], u[31]);
1521
1522 // stage 2
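// Only s[8..15] are rotated here; s[0..7] pass through unchanged and are
// combined by the plain add/sub into x[0..7] further below.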
1523 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1524 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1525 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1526 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1527 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1528 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1529 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1530 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1531
1532 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1533 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1534 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1535 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1536 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1537 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1538 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1539 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1540 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1541 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1542 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1543 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1544 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1545 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1546 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1547 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1548
1549 u[0] = _mm_add_epi32(v[0], v[8]);
1550 u[1] = _mm_add_epi32(v[1], v[9]);
1551 u[2] = _mm_add_epi32(v[2], v[10]);
1552 u[3] = _mm_add_epi32(v[3], v[11]);
1553 u[4] = _mm_add_epi32(v[4], v[12]);
1554 u[5] = _mm_add_epi32(v[5], v[13]);
1555 u[6] = _mm_add_epi32(v[6], v[14]);
1556 u[7] = _mm_add_epi32(v[7], v[15]);
1557 u[8] = _mm_sub_epi32(v[0], v[8]);
1558 u[9] = _mm_sub_epi32(v[1], v[9]);
1559 u[10] = _mm_sub_epi32(v[2], v[10]);
1560 u[11] = _mm_sub_epi32(v[3], v[11]);
1561 u[12] = _mm_sub_epi32(v[4], v[12]);
1562 u[13] = _mm_sub_epi32(v[5], v[13]);
1563 u[14] = _mm_sub_epi32(v[6], v[14]);
1564 u[15] = _mm_sub_epi32(v[7], v[15]);
1565
1566 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1567 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1568 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1569 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1570 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1571 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1572 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1573 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1574 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1575 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1576 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1577 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1578 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1579 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1580 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1581 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1582
1583 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1584 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1585 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1586 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1587 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1588 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1589 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1590 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1591 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1592 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1593 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1594 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1595 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1596 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1597 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1598 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1599
1600 x[0] = _mm_add_epi16(s[0], s[4]);
1601 x[1] = _mm_add_epi16(s[1], s[5]);
1602 x[2] = _mm_add_epi16(s[2], s[6]);
1603 x[3] = _mm_add_epi16(s[3], s[7]);
1604 x[4] = _mm_sub_epi16(s[0], s[4]);
1605 x[5] = _mm_sub_epi16(s[1], s[5]);
1606 x[6] = _mm_sub_epi16(s[2], s[6]);
1607 x[7] = _mm_sub_epi16(s[3], s[7]);
1608 x[8] = _mm_packs_epi32(u[0], u[1]);
1609 x[9] = _mm_packs_epi32(u[2], u[3]);
1610 x[10] = _mm_packs_epi32(u[4], u[5]);
1611 x[11] = _mm_packs_epi32(u[6], u[7]);
1612 x[12] = _mm_packs_epi32(u[8], u[9]);
1613 x[13] = _mm_packs_epi32(u[10], u[11]);
1614 x[14] = _mm_packs_epi32(u[12], u[13]);
1615 x[15] = _mm_packs_epi32(u[14], u[15]);
1616
1617 // stage 3
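// Rotate x[4..7] and x[12..15] by the +/-(cospi_8, cospi_24) pairs;
// x[0..3] and x[8..11] are combined by add/sub only.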
1618 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1619 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1620 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1621 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1622 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1623 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1624 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1625 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1626
1627 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1628 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1629 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1630 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1631 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1632 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1633 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1634 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1635 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1636 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1637 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1638 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1639 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1640 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1641 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1642 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1643
1644 u[0] = _mm_add_epi32(v[0], v[4]);
1645 u[1] = _mm_add_epi32(v[1], v[5]);
1646 u[2] = _mm_add_epi32(v[2], v[6]);
1647 u[3] = _mm_add_epi32(v[3], v[7]);
1648 u[4] = _mm_sub_epi32(v[0], v[4]);
1649 u[5] = _mm_sub_epi32(v[1], v[5]);
1650 u[6] = _mm_sub_epi32(v[2], v[6]);
1651 u[7] = _mm_sub_epi32(v[3], v[7]);
1652 u[8] = _mm_add_epi32(v[8], v[12]);
1653 u[9] = _mm_add_epi32(v[9], v[13]);
1654 u[10] = _mm_add_epi32(v[10], v[14]);
1655 u[11] = _mm_add_epi32(v[11], v[15]);
1656 u[12] = _mm_sub_epi32(v[8], v[12]);
1657 u[13] = _mm_sub_epi32(v[9], v[13]);
1658 u[14] = _mm_sub_epi32(v[10], v[14]);
1659 u[15] = _mm_sub_epi32(v[11], v[15]);
1660
1661 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1662 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1663 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1664 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1665 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1666 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1667 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1668 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1669 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1670 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1671 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1672 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1673 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1674 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1675 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1676 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1677
1678 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1679 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1680 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1681 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1682 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1683 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1684 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1685 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1686 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1687 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1688 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1689 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1690 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1691 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1692 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1693 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1694
1695 s[0] = _mm_add_epi16(x[0], x[2]);
1696 s[1] = _mm_add_epi16(x[1], x[3]);
1697 s[2] = _mm_sub_epi16(x[0], x[2]);
1698 s[3] = _mm_sub_epi16(x[1], x[3]);
1699 s[4] = _mm_packs_epi32(v[0], v[1]);
1700 s[5] = _mm_packs_epi32(v[2], v[3]);
1701 s[6] = _mm_packs_epi32(v[4], v[5]);
1702 s[7] = _mm_packs_epi32(v[6], v[7]);
1703 s[8] = _mm_add_epi16(x[8], x[10]);
1704 s[9] = _mm_add_epi16(x[9], x[11]);
1705 s[10] = _mm_sub_epi16(x[8], x[10]);
1706 s[11] = _mm_sub_epi16(x[9], x[11]);
1707 s[12] = _mm_packs_epi32(v[8], v[9]);
1708 s[13] = _mm_packs_epi32(v[10], v[11]);
1709 s[14] = _mm_packs_epi32(v[12], v[13]);
1710 s[15] = _mm_packs_epi32(v[14], v[15]);
1711
1712 // stage 4
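// Final +/-cospi_16_64 rotations on the pairs (s2,s3), (s6,s7), (s10,s11)
// and (s14,s15); the remaining terms feed the output permutation directly.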
1713 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1714 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1715 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1716 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1717 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1718 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1719 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1720 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1721
1722 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1723 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1724 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1725 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1726 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1727 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1728 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1729 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1730 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1731 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1732 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1733 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1734 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1735 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1736 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1737 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1738
1739 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1740 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1741 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1742 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1743 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1744 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1745 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1746 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1747 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1748 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1749 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1750 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1751 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1752 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1753 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1754 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1755
1756 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1757 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1758 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1759 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1760 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1761 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1762 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1763 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1764 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1765 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1766 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1767 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1768 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1769 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1770 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1771 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1772
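// Output permutation; the kZero subtractions negate outputs 1, 3, 13 and 15.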
1773 in[0] = s[0];
1774 in[1] = _mm_sub_epi16(kZero, s[8]);
1775 in[2] = s[12];
1776 in[3] = _mm_sub_epi16(kZero, s[4]);
1777 in[4] = _mm_packs_epi32(v[4], v[5]);
1778 in[5] = _mm_packs_epi32(v[12], v[13]);
1779 in[6] = _mm_packs_epi32(v[8], v[9]);
1780 in[7] = _mm_packs_epi32(v[0], v[1]);
1781 in[8] = _mm_packs_epi32(v[2], v[3]);
1782 in[9] = _mm_packs_epi32(v[10], v[11]);
1783 in[10] = _mm_packs_epi32(v[14], v[15]);
1784 in[11] = _mm_packs_epi32(v[6], v[7]);
1785 in[12] = s[5];
1786 in[13] = _mm_sub_epi16(kZero, s[13]);
1787 in[14] = s[9];
1788 in[15] = _mm_sub_epi16(kZero, s[1]);
1789 }
1790
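// 16-point inverse DCT over 8 columns; each in[i] is one row of eight
// 16-bit coefficients.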
1791 static void idct16_8col(__m128i *in) {
1792 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1793 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1794 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1795 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1796 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1797 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1798 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1799 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1800 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1801 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1802 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1803 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1804 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1805 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1806 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1807 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1808 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1809 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1810 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1811 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1812 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1813 __m128i v[16], u[16], s[16], t[16];
1814
1815 // stage 1
1816 s[0] = in[0];
1817 s[1] = in[8];
1818 s[2] = in[4];
1819 s[3] = in[12];
1820 s[4] = in[2];
1821 s[5] = in[10];
1822 s[6] = in[6];
1823 s[7] = in[14];
1824 s[8] = in[1];
1825 s[9] = in[9];
1826 s[10] = in[5];
1827 s[11] = in[13];
1828 s[12] = in[3];
1829 s[13] = in[11];
1830 s[14] = in[7];
1831 s[15] = in[15];
1832
1833 // stage 2
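// Rotations producing the odd half s[8..15] from the reordered input pairs
// (1,15), (9,7), (5,11) and (13,3).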
1834 u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1835 u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1836 u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1837 u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1838 u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1839 u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1840 u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1841 u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1842
1843 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1844 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1845 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1846 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1847 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1848 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1849 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1850 v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1851 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1852 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1853 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1854 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1855 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1856 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1857 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1858 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1859
1860 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1861 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1862 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1863 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1864 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1865 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1866 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1867 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1868 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1869 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1870 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1871 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1872 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1873 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1874 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1875 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1876
1877 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1878 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1879 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1880 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1881 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1882 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1883 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1884 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1885 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1886 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1887 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1888 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1889 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1890 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1891 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1892 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1893
1894 s[8] = _mm_packs_epi32(u[0], u[1]);
1895 s[15] = _mm_packs_epi32(u[2], u[3]);
1896 s[9] = _mm_packs_epi32(u[4], u[5]);
1897 s[14] = _mm_packs_epi32(u[6], u[7]);
1898 s[10] = _mm_packs_epi32(u[8], u[9]);
1899 s[13] = _mm_packs_epi32(u[10], u[11]);
1900 s[11] = _mm_packs_epi32(u[12], u[13]);
1901 s[12] = _mm_packs_epi32(u[14], u[15]);
1902
1903 // stage 3
1904 t[0] = s[0];
1905 t[1] = s[1];
1906 t[2] = s[2];
1907 t[3] = s[3];
1908 u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1909 u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1910 u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1911 u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1912
1913 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1914 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1915 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1916 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1917 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1918 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1919 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1920 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1921
1922 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1923 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1924 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1925 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1926 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1927 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1928 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1929 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1930
1931 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1932 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1933 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1934 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1935 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1936 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1937 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1938 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1939
1940 t[4] = _mm_packs_epi32(u[0], u[1]);
1941 t[7] = _mm_packs_epi32(u[2], u[3]);
1942 t[5] = _mm_packs_epi32(u[4], u[5]);
1943 t[6] = _mm_packs_epi32(u[6], u[7]);
1944 t[8] = _mm_add_epi16(s[8], s[9]);
1945 t[9] = _mm_sub_epi16(s[8], s[9]);
1946 t[10] = _mm_sub_epi16(s[11], s[10]);
1947 t[11] = _mm_add_epi16(s[10], s[11]);
1948 t[12] = _mm_add_epi16(s[12], s[13]);
1949 t[13] = _mm_sub_epi16(s[12], s[13]);
1950 t[14] = _mm_sub_epi16(s[15], s[14]);
1951 t[15] = _mm_add_epi16(s[14], s[15]);
1952
1953 // stage 4
1954 u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1955 u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1956 u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1957 u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1958 u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1959 u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1960 u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1961 u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1962
1963 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1964 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1965 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1966 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1967 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1968 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1969 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1970 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1971 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1972 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1973 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1974 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1975 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1976 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1977 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1978 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1979
1980 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1981 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1982 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1983 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1984 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1985 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1986 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1987 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1988 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1989 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1990 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1991 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1992 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1993 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1994 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1995 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1996
1997 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1998 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1999 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2000 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2001 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2002 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2003 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2004 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2005 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2006 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2007 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2008 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2009 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2010 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2011 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2012 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2013
2014 s[0] = _mm_packs_epi32(u[0], u[1]);
2015 s[1] = _mm_packs_epi32(u[2], u[3]);
2016 s[2] = _mm_packs_epi32(u[4], u[5]);
2017 s[3] = _mm_packs_epi32(u[6], u[7]);
2018 s[4] = _mm_add_epi16(t[4], t[5]);
2019 s[5] = _mm_sub_epi16(t[4], t[5]);
2020 s[6] = _mm_sub_epi16(t[7], t[6]);
2021 s[7] = _mm_add_epi16(t[6], t[7]);
2022 s[8] = t[8];
2023 s[15] = t[15];
2024 s[9] = _mm_packs_epi32(u[8], u[9]);
2025 s[14] = _mm_packs_epi32(u[10], u[11]);
2026 s[10] = _mm_packs_epi32(u[12], u[13]);
2027 s[13] = _mm_packs_epi32(u[14], u[15]);
2028 s[11] = t[11];
2029 s[12] = t[12];
2030
2031 // stage 5
2032 t[0] = _mm_add_epi16(s[0], s[3]);
2033 t[1] = _mm_add_epi16(s[1], s[2]);
2034 t[2] = _mm_sub_epi16(s[1], s[2]);
2035 t[3] = _mm_sub_epi16(s[0], s[3]);
2036 t[4] = s[4];
2037 t[7] = s[7];
2038
2039 u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2040 u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2041 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2042 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2043 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2044 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2045 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2046 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2047 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2048 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2049 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2050 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2051 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2052 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2053 t[5] = _mm_packs_epi32(u[0], u[1]);
2054 t[6] = _mm_packs_epi32(u[2], u[3]);
2055
2056 t[8] = _mm_add_epi16(s[8], s[11]);
2057 t[9] = _mm_add_epi16(s[9], s[10]);
2058 t[10] = _mm_sub_epi16(s[9], s[10]);
2059 t[11] = _mm_sub_epi16(s[8], s[11]);
2060 t[12] = _mm_sub_epi16(s[15], s[12]);
2061 t[13] = _mm_sub_epi16(s[14], s[13]);
2062 t[14] = _mm_add_epi16(s[13], s[14]);
2063 t[15] = _mm_add_epi16(s[12], s[15]);
2064
2065 // stage 6
2066 s[0] = _mm_add_epi16(t[0], t[7]);
2067 s[1] = _mm_add_epi16(t[1], t[6]);
2068 s[2] = _mm_add_epi16(t[2], t[5]);
2069 s[3] = _mm_add_epi16(t[3], t[4]);
2070 s[4] = _mm_sub_epi16(t[3], t[4]);
2071 s[5] = _mm_sub_epi16(t[2], t[5]);
2072 s[6] = _mm_sub_epi16(t[1], t[6]);
2073 s[7] = _mm_sub_epi16(t[0], t[7]);
2074 s[8] = t[8];
2075 s[9] = t[9];
2076
2077 u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2078 u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2079 u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2080 u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2081
2082 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2083 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2084 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2085 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2086 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2087 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2088 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2089 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2090
2091 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2092 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2093 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2094 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2095 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2096 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2097 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2098 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2099
2100 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2101 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2102 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2103 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2104 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2105 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2106 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2107 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2108
2109 s[10] = _mm_packs_epi32(u[0], u[1]);
2110 s[13] = _mm_packs_epi32(u[2], u[3]);
2111 s[11] = _mm_packs_epi32(u[4], u[5]);
2112 s[12] = _mm_packs_epi32(u[6], u[7]);
2113 s[14] = t[14];
2114 s[15] = t[15];
2115
2116 // stage 7
2117 in[0] = _mm_add_epi16(s[0], s[15]);
2118 in[1] = _mm_add_epi16(s[1], s[14]);
2119 in[2] = _mm_add_epi16(s[2], s[13]);
2120 in[3] = _mm_add_epi16(s[3], s[12]);
2121 in[4] = _mm_add_epi16(s[4], s[11]);
2122 in[5] = _mm_add_epi16(s[5], s[10]);
2123 in[6] = _mm_add_epi16(s[6], s[9]);
2124 in[7] = _mm_add_epi16(s[7], s[8]);
2125 in[8] = _mm_sub_epi16(s[7], s[8]);
2126 in[9] = _mm_sub_epi16(s[6], s[9]);
2127 in[10] = _mm_sub_epi16(s[5], s[10]);
2128 in[11] = _mm_sub_epi16(s[4], s[11]);
2129 in[12] = _mm_sub_epi16(s[3], s[12]);
2130 in[13] = _mm_sub_epi16(s[2], s[13]);
2131 in[14] = _mm_sub_epi16(s[1], s[14]);
2132 in[15] = _mm_sub_epi16(s[0], s[15]);
2133 }
2134
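// Full 1-D 16-point IDCT: transpose the 16x16 block, then transform each
// 8-column half in place.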
2135 void idct16_sse2(__m128i *in0, __m128i *in1) {
2136 array_transpose_16x16(in0, in1);
2137 idct16_8col(in0);
2138 idct16_8col(in1);
2139 }
2140
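// Same pattern for the 16-point ADST.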
2141 void iadst16_sse2(__m128i *in0, __m128i *in1) {
2142 array_transpose_16x16(in0, in1);
2143 iadst16_8col(in0);
2144 iadst16_8col(in1);
2145 }
2146
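// 16x16 IDCT for the eob <= 10 case: the non-zero coefficients are confined
// to the top-left 4x4 corner, so only the first four rows are loaded.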
2147 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
2148 int stride) {
2149 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2150 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2151 const __m128i zero = _mm_setzero_si128();
2152
2153 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2154 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2155 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2156 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2157
2158 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2159 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2160
2161 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2162 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2163 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2164 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2165 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2166 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2167
2168 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2169 __m128i in[16], l[16];
2170 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2171 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2172 stp1_8_0, stp1_12_0;
2173 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2174 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2175 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2176 int i;
2177 // First 1-D inverse DCT
2178 // Load input data.
2179 in[0] = load_input_data(input);
2180 in[1] = load_input_data(input + 8 * 2);
2181 in[2] = load_input_data(input + 8 * 4);
2182 in[3] = load_input_data(input + 8 * 6);
2183
2184 TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2185
2186 // Stage2
2187 {
2188 const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2189 const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2190
2191 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2192 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2193 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2194 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2195
2196 tmp0 = _mm_add_epi32(tmp0, rounding);
2197 tmp2 = _mm_add_epi32(tmp2, rounding);
2198 tmp5 = _mm_add_epi32(tmp5, rounding);
2199 tmp7 = _mm_add_epi32(tmp7, rounding);
2200
2201 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2202 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2203 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2204 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2205
2206 stp2_8 = _mm_packs_epi32(tmp0, tmp2);
2207 stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2208 }
2209
2210 // Stage3
2211 {
2212 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2213
2214 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2215 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2216
2217 tmp0 = _mm_add_epi32(tmp0, rounding);
2218 tmp2 = _mm_add_epi32(tmp2, rounding);
2219 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2220 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2221
2222 stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2223 stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2224
2225 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2226 }
2227
2228 // Stage4
2229 {
2230 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2231 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2232 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2233
2234 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2235 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2236 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2237 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2238 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2239 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2240
2241 tmp0 = _mm_add_epi32(tmp0, rounding);
2242 tmp2 = _mm_add_epi32(tmp2, rounding);
2243 tmp1 = _mm_add_epi32(tmp1, rounding);
2244 tmp3 = _mm_add_epi32(tmp3, rounding);
2245 tmp5 = _mm_add_epi32(tmp5, rounding);
2246 tmp7 = _mm_add_epi32(tmp7, rounding);
2247
2248 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2249 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2250 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2251 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2252 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2253 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2254
2255 stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2256 stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2257 stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2258 stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2259
2260 stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2261 }
2262
2263 // Stage5 and Stage6
2264 {
2265 tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2266 tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2267 tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2268 tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2269
2270 stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
2271 stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2272 stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
2273 stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2274
2275 stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2276 stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2277 stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2278 stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2279 }
2280
2281 // Stage6
2282 {
2283 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2284 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2285 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2286
2287 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2288 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2289 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2290 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2291 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2292 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2293
2294 tmp1 = _mm_add_epi32(tmp1, rounding);
2295 tmp3 = _mm_add_epi32(tmp3, rounding);
2296 tmp0 = _mm_add_epi32(tmp0, rounding);
2297 tmp2 = _mm_add_epi32(tmp2, rounding);
2298 tmp4 = _mm_add_epi32(tmp4, rounding);
2299 tmp6 = _mm_add_epi32(tmp6, rounding);
2300
2301 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2302 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2303 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2304 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2305 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2306 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2307
2308 stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2309
2310 stp2_10 = _mm_packs_epi32(tmp0, zero);
2311 stp2_13 = _mm_packs_epi32(tmp2, zero);
2312 stp2_11 = _mm_packs_epi32(tmp4, zero);
2313 stp2_12 = _mm_packs_epi32(tmp6, zero);
2314
2315 tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2316 tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2317 tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2318 tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2319
2320 stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2321 stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2322 stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2323 stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2324 stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2325 stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2326 stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2327 stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2328 }
2329
2330 // Stage7. Left 8x16 only.
2331 l[0] = _mm_add_epi16(stp2_0, stp1_15);
2332 l[1] = _mm_add_epi16(stp2_1, stp1_14);
2333 l[2] = _mm_add_epi16(stp2_2, stp2_13);
2334 l[3] = _mm_add_epi16(stp2_3, stp2_12);
2335 l[4] = _mm_add_epi16(stp2_4, stp2_11);
2336 l[5] = _mm_add_epi16(stp2_5, stp2_10);
2337 l[6] = _mm_add_epi16(stp2_6, stp1_9);
2338 l[7] = _mm_add_epi16(stp2_7, stp1_8);
2339 l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2340 l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2341 l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2342 l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2343 l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2344 l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2345 l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2346 l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2347
2348 // Second 1-D inverse transform, performed per 8x16 block
2349 for (i = 0; i < 2; i++) {
2350 int j;
2351 array_transpose_4X8(l + 8 * i, in);
2352
2353 IDCT16_10
2354
2355 // Stage7
2356 in[0] = _mm_add_epi16(stp2_0, stp1_15);
2357 in[1] = _mm_add_epi16(stp2_1, stp1_14);
2358 in[2] = _mm_add_epi16(stp2_2, stp2_13);
2359 in[3] = _mm_add_epi16(stp2_3, stp2_12);
2360 in[4] = _mm_add_epi16(stp2_4, stp2_11);
2361 in[5] = _mm_add_epi16(stp2_5, stp2_10);
2362 in[6] = _mm_add_epi16(stp2_6, stp1_9);
2363 in[7] = _mm_add_epi16(stp2_7, stp1_8);
2364 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2365 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2366 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2367 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2368 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2369 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2370 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2371 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2372
2373 for (j = 0; j < 16; ++j) {
2374 // Final rounding and shift
2375 in[j] = _mm_adds_epi16(in[j], final_rounding);
2376 in[j] = _mm_srai_epi16(in[j], 6);
2377 RECON_AND_STORE(dest + j * stride, in[j]);
2378 }
2379
2380 dest += 8;
2381 }
2382 }
2383
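// Load eight dequantized coefficients into reg and advance the input pointer.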
2384 #define LOAD_DQCOEFF(reg, input) \
2385 { \
2386 reg = load_input_data(input); \
2387 input += 8; \
2388 }
2389
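// 32-point IDCT specialized for the eob <= 34 case: only in[0..7] can be
// non-zero, so every unpack pairs an input row with zero.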
2390 #define IDCT32_34 \
2391 /* Stage1 */ \
2392 { \
2393 const __m128i zero = _mm_setzero_si128(); \
2394 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2395 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2396 \
2397 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2398 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2399 \
2400 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2401 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2402 \
2403 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2404 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2405 \
2406 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2407 stg1_1, stp1_16, stp1_31); \
2408 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2409 stg1_7, stp1_19, stp1_28); \
2410 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2411 stg1_9, stp1_20, stp1_27); \
2412 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2413 stg1_15, stp1_23, stp1_24); \
2414 } \
2415 \
2416 /* Stage2 */ \
2417 { \
2418 const __m128i zero = _mm_setzero_si128(); \
2419 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2420 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2421 \
2422 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2423 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2424 \
2425 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2426 stg2_1, stp2_8, stp2_15); \
2427 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2428 stg2_7, stp2_11, stp2_12); \
2429 \
2430 stp2_16 = stp1_16; \
2431 stp2_19 = stp1_19; \
2432 \
2433 stp2_20 = stp1_20; \
2434 stp2_23 = stp1_23; \
2435 \
2436 stp2_24 = stp1_24; \
2437 stp2_27 = stp1_27; \
2438 \
2439 stp2_28 = stp1_28; \
2440 stp2_31 = stp1_31; \
2441 } \
2442 \
2443 /* Stage3 */ \
2444 { \
2445 const __m128i zero = _mm_setzero_si128(); \
2446 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2447 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2448 \
2449 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2450 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2451 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2452 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2453 \
2454 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2455 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2456 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2457 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
2458 \
2459 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2460 stg3_1, stp1_4, stp1_7); \
2461 \
2462 stp1_8 = stp2_8; \
2463 stp1_11 = stp2_11; \
2464 stp1_12 = stp2_12; \
2465 stp1_15 = stp2_15; \
2466 \
2467 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2468 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2469 stp1_18, stp1_29) \
2470 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2471 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2472 stp1_22, stp1_25) \
2473 \
2474 stp1_16 = stp2_16; \
2475 stp1_31 = stp2_31; \
2476 stp1_19 = stp2_19; \
2477 stp1_20 = stp2_20; \
2478 stp1_23 = stp2_23; \
2479 stp1_24 = stp2_24; \
2480 stp1_27 = stp2_27; \
2481 stp1_28 = stp2_28; \
2482 } \
2483 \
2484 /* Stage4 */ \
2485 { \
2486 const __m128i zero = _mm_setzero_si128(); \
2487 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2488 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2489 \
2490 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2491 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2492 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2493 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2494 \
2495 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2496 stg4_1, stp2_0, stp2_1); \
2497 \
2498 stp2_4 = stp1_4; \
2499 stp2_5 = stp1_4; \
2500 stp2_6 = stp1_7; \
2501 stp2_7 = stp1_7; \
2502 \
2503 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2504 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2505 stp2_10, stp2_13) \
2506 \
2507 stp2_8 = stp1_8; \
2508 stp2_15 = stp1_15; \
2509 stp2_11 = stp1_11; \
2510 stp2_12 = stp1_12; \
2511 \
2512 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2513 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2514 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2515 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2516 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2517 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2518 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2519 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2520 \
2521 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2522 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2523 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2524 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2525 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2526 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2527 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2528 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2529 } \
2530 \
2531 /* Stage5 */ \
2532 { \
2533 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2534 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2535 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2536 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2537 \
2538 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2539 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2540 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2541 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2542 \
2543 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2544 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2545 \
2546 stp1_0 = stp2_0; \
2547 stp1_1 = stp2_1; \
2548 stp1_2 = stp2_1; \
2549 stp1_3 = stp2_0; \
2550 \
2551 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2552 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2553 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2554 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2555 \
2556 tmp0 = _mm_add_epi32(tmp0, rounding); \
2557 tmp1 = _mm_add_epi32(tmp1, rounding); \
2558 tmp2 = _mm_add_epi32(tmp2, rounding); \
2559 tmp3 = _mm_add_epi32(tmp3, rounding); \
2560 \
2561 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2562 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2563 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2564 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2565 \
2566 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2567 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2568 \
2569 stp1_4 = stp2_4; \
2570 stp1_7 = stp2_7; \
2571 \
2572 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2573 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2574 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2575 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2576 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2577 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2578 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2579 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2580 \
2581 stp1_16 = stp2_16; \
2582 stp1_17 = stp2_17; \
2583 \
2584 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2585 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2586 stp1_19, stp1_28) \
2587 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2588 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2589 stp1_21, stp1_26) \
2590 \
2591 stp1_22 = stp2_22; \
2592 stp1_23 = stp2_23; \
2593 stp1_24 = stp2_24; \
2594 stp1_25 = stp2_25; \
2595 stp1_30 = stp2_30; \
2596 stp1_31 = stp2_31; \
2597 } \
2598 \
2599 /* Stage6 */ \
2600 { \
2601 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2602 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2603 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2604 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2605 \
2606 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2607 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2608 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2609 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2610 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2611 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2612 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2613 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2614 \
2615 stp2_8 = stp1_8; \
2616 stp2_9 = stp1_9; \
2617 stp2_14 = stp1_14; \
2618 stp2_15 = stp1_15; \
2619 \
2620 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2621 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2622 stp2_13, stp2_11, stp2_12) \
2623 \
2624 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2625 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2626 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2627 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2628 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2629 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2630 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2631 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2632 \
2633 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2634 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2635 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2636 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2637 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2638 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2639 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2640 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2641 } \
2642 \
2643 /* Stage7 */ \
2644 { \
2645 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2646 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2647 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2648 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2649 \
2650 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2651 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2652 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2653 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2654 \
2655 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2656 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2657 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2658 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2659 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2660 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2661 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2662 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2663 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2664 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2665 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2666 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2667 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2668 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2669 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2670 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2671 \
2672 stp1_16 = stp2_16; \
2673 stp1_17 = stp2_17; \
2674 stp1_18 = stp2_18; \
2675 stp1_19 = stp2_19; \
2676 \
2677 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2678 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2679 stp1_21, stp1_26) \
2680 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2681 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2682 stp1_23, stp1_24) \
2683 \
2684 stp1_28 = stp2_28; \
2685 stp1_29 = stp2_29; \
2686 stp1_30 = stp2_30; \
2687 stp1_31 = stp2_31; \
2688 }
2689
2690
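// Full 32-point IDCT using all 32 input rows.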
2691 #define IDCT32 \
2692 /* Stage1 */ \
2693 { \
2694 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2695 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2696 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2697 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2698 \
2699 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2700 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2701 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2702 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2703 \
2704 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2705 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2706 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2707 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2708 \
2709 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2710 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2711 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2712 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2713 \
2714 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2715 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2716 stp1_17, stp1_30) \
2717 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2718 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2719 stp1_19, stp1_28) \
2720 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2721 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2722 stp1_21, stp1_26) \
2723 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2724 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2725 stp1_23, stp1_24) \
2726 } \
2727 \
2728 /* Stage2 */ \
2729 { \
2730 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2731 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2732 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2733 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2734 \
2735 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2736 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2737 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2738 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2739 \
2740 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2741 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2742 stp2_14) \
2743 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2744 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2745 stp2_11, stp2_12) \
2746 \
2747 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2748 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2749 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2750 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2751 \
2752 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2753 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2754 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2755 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2756 \
2757 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2758 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2759 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2760 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2761 \
2762 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2763 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2764 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2765 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2766 } \
2767 \
2768 /* Stage3 */ \
2769 { \
2770 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2771 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2772 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2773 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2774 \
2775 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2776 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2777 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2778 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2779 \
2780 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2781 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2782 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2783 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2784 \
2785 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2786 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2787 stp1_6) \
2788 \
2789 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2790 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2791 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2792 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2793 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2794 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2795 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2796 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2797 \
2798 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2799 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2800 stp1_18, stp1_29) \
2801 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2802 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2803 stp1_22, stp1_25) \
2804 \
2805 stp1_16 = stp2_16; \
2806 stp1_31 = stp2_31; \
2807 stp1_19 = stp2_19; \
2808 stp1_20 = stp2_20; \
2809 stp1_23 = stp2_23; \
2810 stp1_24 = stp2_24; \
2811 stp1_27 = stp2_27; \
2812 stp1_28 = stp2_28; \
2813 } \
2814 \
2815 /* Stage4 */ \
2816 { \
2817 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2818 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2819 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2820 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2821 \
2822 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2823 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2824 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2825 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2826 \
2827 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2828 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2829 stp2_2, stp2_3) \
2830 \
2831 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2832 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2833 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2834 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2835 \
2836 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2837 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2838 stp2_10, stp2_13) \
2839 \
2840 stp2_8 = stp1_8; \
2841 stp2_15 = stp1_15; \
2842 stp2_11 = stp1_11; \
2843 stp2_12 = stp1_12; \
2844 \
2845 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2846 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2847 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2848 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2849 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2850 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2851 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2852 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2853 \
2854 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2855 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2856 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2857 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2858 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2859 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2860 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2861 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2862 } \
2863 \
2864 /* Stage5 */ \
2865 { \
2866 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2867 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2868 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2869 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2870 \
2871 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2872 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2873 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2874 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2875 \
2876 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2877 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2878 \
2879 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2880 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2881 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2882 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2883 \
2884 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2885 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2886 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2887 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2888 \
2889 tmp0 = _mm_add_epi32(tmp0, rounding); \
2890 tmp1 = _mm_add_epi32(tmp1, rounding); \
2891 tmp2 = _mm_add_epi32(tmp2, rounding); \
2892 tmp3 = _mm_add_epi32(tmp3, rounding); \
2893 \
2894 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2895 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2896 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2897 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2898 \
2899 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2900 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2901 \
2902 stp1_4 = stp2_4; \
2903 stp1_7 = stp2_7; \
2904 \
2905 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2906 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2907 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2908 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2909 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2910 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2911 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2912 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2913 \
2914 stp1_16 = stp2_16; \
2915 stp1_17 = stp2_17; \
2916 \
2917 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2918 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2919 stp1_19, stp1_28) \
2920 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2921 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2922 stp1_21, stp1_26) \
2923 \
2924 stp1_22 = stp2_22; \
2925 stp1_23 = stp2_23; \
2926 stp1_24 = stp2_24; \
2927 stp1_25 = stp2_25; \
2928 stp1_30 = stp2_30; \
2929 stp1_31 = stp2_31; \
2930 } \
2931 \
2932 /* Stage6 */ \
2933 { \
2934 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2935 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2936 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2937 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2938 \
2939 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2940 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2941 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2942 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2943 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2944 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2945 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2946 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2947 \
2948 stp2_8 = stp1_8; \
2949 stp2_9 = stp1_9; \
2950 stp2_14 = stp1_14; \
2951 stp2_15 = stp1_15; \
2952 \
2953 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2954 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2955 stp2_13, stp2_11, stp2_12) \
2956 \
2957 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2958 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2959 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2960 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2961 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2962 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2963 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2964 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2965 \
2966 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2967 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2968 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2969 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2970 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2971 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2972 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2973 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2974 } \
2975 \
2976 /* Stage7 */ \
2977 { \
2978 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2979 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2980 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2981 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2982 \
2983 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2984 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2985 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2986 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2987 \
2988 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2989 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2990 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2991 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2992 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2993 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2994 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2995 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2996 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2997 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2998 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2999 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3000 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3001 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3002 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3003 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3004 \
3005 stp1_16 = stp2_16; \
3006 stp1_17 = stp2_17; \
3007 stp1_18 = stp2_18; \
3008 stp1_19 = stp2_19; \
3009 \
3010 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3011 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3012 stp1_21, stp1_26) \
3013 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3014 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3015 stp1_23, stp1_24) \
3016 \
3017 stp1_28 = stp2_28; \
3018 stp1_29 = stp2_29; \
3019 stp1_30 = stp2_30; \
3020 stp1_31 = stp2_31; \
3021 }
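
// Note on structure: the IDCT32 macro above evaluates one 8-column slice of
// the 32-point inverse DCT as seven butterfly stages. The early stages
// rotate input pairs by the stg*_* cosine constants (MULTIPLICATION_AND_ADD
// presumably wraps the madd/round/shift/pack sequence written out explicitly
// in Stage5), while the remaining work is plain 16-bit adds and subtracts.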
3022
3023 // Only the upper-left 8x8 block has non-zero coefficients.
3024 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
3025 int stride) {
3026 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3027 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
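// Both 1-D passes scale the result, and the final shift below is by 6 bits,
// so adding 1 << 5 beforehand implements round-to-nearest.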
3028
3029 // idct constants for each stage
3030 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3031 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3032 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3033 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3034 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3035 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3036 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3037 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3038
3039 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3040 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3041 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3042 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3043
3044 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3045 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3046 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3047 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3048 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3049 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3050 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3051 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3052
3053 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3054 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3055 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3056 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3057 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3058
3059 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3060
3061 __m128i in[32], col[32];
3062 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3063 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3064 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3065 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3066 stp1_30, stp1_31;
3067 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3068 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3069 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3070 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3071 stp2_30, stp2_31;
3072 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3073 int i;
3074
3075 // Load input data. Only the top-left 8x8 block needs to be loaded.
3076 in[0] = load_input_data(input);
3077 in[1] = load_input_data(input + 32);
3078 in[2] = load_input_data(input + 64);
3079 in[3] = load_input_data(input + 96);
3080 in[4] = load_input_data(input + 128);
3081 in[5] = load_input_data(input + 160);
3082 in[6] = load_input_data(input + 192);
3083 in[7] = load_input_data(input + 224);
3084
3085 for (i = 8; i < 32; ++i) {
3086 in[i] = _mm_setzero_si128();
3087 }
3088
3089 array_transpose_8x8(in, in);
3090 // TODO(hkuang): The following transposes are unnecessary, but removing
3091 // them leads to a performance drop on some devices.
3092 array_transpose_8x8(in + 8, in + 8);
3093 array_transpose_8x8(in + 16, in + 16);
3094 array_transpose_8x8(in + 24, in + 24);
3095
3096 IDCT32_34
3097
3098 // 1-D: Store 32 intermediate results for each 8x32 block.
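// The last butterfly pairs symmetric outputs: col[k] = stp1_k + stp1_(31-k)
// and col[31 - k] = stp1_k - stp1_(31-k).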
3099 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3100 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3101 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3102 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3103 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3104 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3105 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3106 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3107 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3108 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3109 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3110 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3111 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3112 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3113 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3114 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3115 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3116 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3117 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3118 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3119 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3120 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3121 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3122 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3123 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3124 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3125 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3126 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3127 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3128 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3129 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3130 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3131 for (i = 0; i < 4; i++) {
3132 int j;
3133 const __m128i zero = _mm_setzero_si128();
3134 // Transpose 32x8 block to 8x32 block
3135 array_transpose_8x8(col + i * 8, in);
3136 IDCT32_34
3137
3138 // 2-D: Calculate the results and store them to the destination.
3139 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3140 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3141 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3142 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3143 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3144 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3145 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3146 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3147 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3148 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3149 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3150 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3151 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3152 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3153 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3154 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3155 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3156 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3157 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3158 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3159 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3160 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3161 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3162 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3163 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3164 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3165 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3166 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3167 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3168 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3169 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3170 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3171
3172 for (j = 0; j < 32; ++j) {
3173 // Final rounding and shift
3174 in[j] = _mm_adds_epi16(in[j], final_rounding);
3175 in[j] = _mm_srai_epi16(in[j], 6);
3176 RECON_AND_STORE(dest + j * stride, in[j]);
3177 }
3178
3179 dest += 8;
3180 }
3181 }
3182
3183 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
3184 int stride) {
3185 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3186 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3187 const __m128i zero = _mm_setzero_si128();
3188
3189 // idct constants for each stage
3190 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3191 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3192 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3193 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3194 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3195 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3196 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3197 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3198 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3199 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3200 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3201 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3202 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3203 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3204 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3205 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3206
3207 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3208 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3209 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3210 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3211 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3212 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3213 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3214 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3215
3216 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3217 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3218 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3219 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3220 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3221 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3222 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3223 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3224 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3225 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3226
3227 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3228 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3229 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3230 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3231 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3232 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3233 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3234
3235 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3236
3237 __m128i in[32], col[128], zero_idx[16];
3238 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3239 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3240 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3241 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3242 stp1_30, stp1_31;
3243 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3244 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3245 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3246 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3247 stp2_30, stp2_31;
3248 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3249 int i, j, i32;
3250
3251 for (i = 0; i < 4; i++) {
3252 i32 = (i << 5);
3253 // First 1-D idct
3254 // Load input data.
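// Each LOAD_DQCOEFF appears to read 8 coefficients and advance the input
// pointer, so row r of this 8x32 strip lands in in[r], in[r + 8],
// in[r + 16], in[r + 24]; each group of eight registers then holds one
// 8x8 sub-block, ready for the 8x8 transposes below.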
3255 LOAD_DQCOEFF(in[0], input);
3256 LOAD_DQCOEFF(in[8], input);
3257 LOAD_DQCOEFF(in[16], input);
3258 LOAD_DQCOEFF(in[24], input);
3259 LOAD_DQCOEFF(in[1], input);
3260 LOAD_DQCOEFF(in[9], input);
3261 LOAD_DQCOEFF(in[17], input);
3262 LOAD_DQCOEFF(in[25], input);
3263 LOAD_DQCOEFF(in[2], input);
3264 LOAD_DQCOEFF(in[10], input);
3265 LOAD_DQCOEFF(in[18], input);
3266 LOAD_DQCOEFF(in[26], input);
3267 LOAD_DQCOEFF(in[3], input);
3268 LOAD_DQCOEFF(in[11], input);
3269 LOAD_DQCOEFF(in[19], input);
3270 LOAD_DQCOEFF(in[27], input);
3271
3272 LOAD_DQCOEFF(in[4], input);
3273 LOAD_DQCOEFF(in[12], input);
3274 LOAD_DQCOEFF(in[20], input);
3275 LOAD_DQCOEFF(in[28], input);
3276 LOAD_DQCOEFF(in[5], input);
3277 LOAD_DQCOEFF(in[13], input);
3278 LOAD_DQCOEFF(in[21], input);
3279 LOAD_DQCOEFF(in[29], input);
3280 LOAD_DQCOEFF(in[6], input);
3281 LOAD_DQCOEFF(in[14], input);
3282 LOAD_DQCOEFF(in[22], input);
3283 LOAD_DQCOEFF(in[30], input);
3284 LOAD_DQCOEFF(in[7], input);
3285 LOAD_DQCOEFF(in[15], input);
3286 LOAD_DQCOEFF(in[23], input);
3287 LOAD_DQCOEFF(in[31], input);
3288
3289 // Check whether all entries are zero.
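// All 32 vectors are OR-ed together in a reduction tree; if the combined
// vector is zero in every lane (movemask == 0xFFFF below), this 8x32
// strip is entirely zero and the 1-D transform can be skipped.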
3290 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3291 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3292 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3293 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3294 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3295 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3296 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3297 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3298 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3299 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3300 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3301 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3302 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3303 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3304 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3305 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3306
3307 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3308 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3309 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3310 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3311 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3312 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3313 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3314 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3315
3316 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3317 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3318 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3319 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3320 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3321 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3322 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3323
3324 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3325 col[i32 + 0] = _mm_setzero_si128();
3326 col[i32 + 1] = _mm_setzero_si128();
3327 col[i32 + 2] = _mm_setzero_si128();
3328 col[i32 + 3] = _mm_setzero_si128();
3329 col[i32 + 4] = _mm_setzero_si128();
3330 col[i32 + 5] = _mm_setzero_si128();
3331 col[i32 + 6] = _mm_setzero_si128();
3332 col[i32 + 7] = _mm_setzero_si128();
3333 col[i32 + 8] = _mm_setzero_si128();
3334 col[i32 + 9] = _mm_setzero_si128();
3335 col[i32 + 10] = _mm_setzero_si128();
3336 col[i32 + 11] = _mm_setzero_si128();
3337 col[i32 + 12] = _mm_setzero_si128();
3338 col[i32 + 13] = _mm_setzero_si128();
3339 col[i32 + 14] = _mm_setzero_si128();
3340 col[i32 + 15] = _mm_setzero_si128();
3341 col[i32 + 16] = _mm_setzero_si128();
3342 col[i32 + 17] = _mm_setzero_si128();
3343 col[i32 + 18] = _mm_setzero_si128();
3344 col[i32 + 19] = _mm_setzero_si128();
3345 col[i32 + 20] = _mm_setzero_si128();
3346 col[i32 + 21] = _mm_setzero_si128();
3347 col[i32 + 22] = _mm_setzero_si128();
3348 col[i32 + 23] = _mm_setzero_si128();
3349 col[i32 + 24] = _mm_setzero_si128();
3350 col[i32 + 25] = _mm_setzero_si128();
3351 col[i32 + 26] = _mm_setzero_si128();
3352 col[i32 + 27] = _mm_setzero_si128();
3353 col[i32 + 28] = _mm_setzero_si128();
3354 col[i32 + 29] = _mm_setzero_si128();
3355 col[i32 + 30] = _mm_setzero_si128();
3356 col[i32 + 31] = _mm_setzero_si128();
3357 continue;
3358 }
3359
3360 // Transpose 32x8 block to 8x32 block
3361 array_transpose_8x8(in, in);
3362 array_transpose_8x8(in + 8, in + 8);
3363 array_transpose_8x8(in + 16, in + 16);
3364 array_transpose_8x8(in + 24, in + 24);
3365
3366 IDCT32
3367
3368 // 1-D: Store 32 intermediate results for each 8x32 block.
3369 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3370 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3371 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3372 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3373 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3374 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3375 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3376 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3377 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3378 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3379 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3380 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3381 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3382 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3383 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3384 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3385 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3386 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3387 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3388 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3389 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3390 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3391 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3392 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3393 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3394 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3395 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3396 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3397 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3398 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3399 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3400 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3401 }
3402 for (i = 0; i < 4; i++) {
3403 // Second 1-D idct
3404 j = i << 3;
3405
3406 // Transpose 32x8 block to 8x32 block
3407 array_transpose_8x8(col + j, in);
3408 array_transpose_8x8(col + j + 32, in + 8);
3409 array_transpose_8x8(col + j + 64, in + 16);
3410 array_transpose_8x8(col + j + 96, in + 24);
3411
3412 IDCT32
3413
3414 // 2-D: Calculate the results and store them to the destination.
3415 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3416 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3417 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3418 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3419 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3420 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3421 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3422 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3423 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3424 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3425 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3426 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3427 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3428 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3429 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3430 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3431 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3432 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3433 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3434 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3435 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3436 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3437 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3438 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3439 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3440 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3441 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3442 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3443 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3444 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3445 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3446 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3447
3448 for (j = 0; j < 32; ++j) {
3449 // Final rounding and shift
3450 in[j] = _mm_adds_epi16(in[j], final_rounding);
3451 in[j] = _mm_srai_epi16(in[j], 6);
3452 RECON_AND_STORE(dest + j * stride, in[j]);
3453 }
3454
3455 dest += 8;
3456 }
3457 }
3458
3459 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
3460 int stride) {
3461 __m128i dc_value;
3462 const __m128i zero = _mm_setzero_si128();
3463 int a, j;
3464
3465 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
3466 a = (int)dct_const_round_shift(a * cospi_16_64);
3467 a = ROUND_POWER_OF_TWO(a, 6);
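// With only the DC coefficient non-zero, each 1-D pass reduces to a single
// multiply by cospi_16_64, so the entire 32x32 output block is the constant
// computed above.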
3468
3469 dc_value = _mm_set1_epi16(a);
3470
3471 for (j = 0; j < 32; ++j) {
3472 RECON_AND_STORE(dest + 0 + j * stride, dc_value);
3473 RECON_AND_STORE(dest + 8 + j * stride, dc_value);
3474 RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3475 RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3476 }
3477 }
3478
3479 #if CONFIG_VP9_HIGHBITDEPTH
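// Clamp eight signed 16-bit pixels to the valid range [0, (1 << bd) - 1].
// SSE2 has no unsigned 16-bit min/max, so the upper bound is applied with a
// compare/andnot/and mask and the lower bound by masking out negative lanes.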
3480 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3481 __m128i ubounded, retval;
3482 const __m128i zero = _mm_set1_epi16(0);
3483 const __m128i one = _mm_set1_epi16(1);
3484 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3485 ubounded = _mm_cmpgt_epi16(value, max);
3486 retval = _mm_andnot_si128(ubounded, value);
3487 ubounded = _mm_and_si128(ubounded, max);
3488 retval = _mm_or_si128(retval, ubounded);
3489 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
3490 return retval;
3491 }
3492
3493 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3494 int stride, int bd) {
3495 tran_low_t out[4 * 4];
3496 tran_low_t *outptr = out;
3497 int i, j;
3498 __m128i inptr[4];
3499 __m128i sign_bits[2];
3500 __m128i temp_mm, min_input, max_input;
3501 int test;
3502 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3503 int optimised_cols = 0;
3504 const __m128i zero = _mm_set1_epi16(0);
3505 const __m128i eight = _mm_set1_epi16(8);
3506 const __m128i max = _mm_set1_epi16(12043);
3507 const __m128i min = _mm_set1_epi16(-12043);
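// If any packed coefficient falls outside [-12043, 12043], the 16-bit SSE2
// path below could overflow, so the code falls back to the C transform.
// The bound appears to be chosen from the worst-case value growth through
// the 4x4 idct stages.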
3508 // Load input into __m128i
3509 inptr[0] = _mm_loadu_si128((const __m128i *)input);
3510 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3511 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3512 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
3513
3514 // Pack to 16 bits
3515 inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3516 inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
3517
3518 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3519 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3520 max_input = _mm_cmpgt_epi16(max_input, max);
3521 min_input = _mm_cmplt_epi16(min_input, min);
3522 temp_mm = _mm_or_si128(max_input, min_input);
3523 test = _mm_movemask_epi8(temp_mm);
3524
3525 if (!test) {
3526 // Do the row transform
3527 idct4_sse2(inptr);
3528
3529 // Check the min & max values
3530 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3531 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3532 max_input = _mm_cmpgt_epi16(max_input, max);
3533 min_input = _mm_cmplt_epi16(min_input, min);
3534 temp_mm = _mm_or_si128(max_input, min_input);
3535 test = _mm_movemask_epi8(temp_mm);
3536
3537 if (test) {
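// SSE2 lacks a 16-to-32-bit sign-extension instruction, so each vector is
// widened by unpacking it against its own sign mask before being stored to
// the 32-bit tran_low_t buffer.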
3538 transpose_4x4(inptr);
3539 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3540 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3541 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3542 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3543 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3544 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3545 _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3546 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3547 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3548 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3549 } else {
3550 // Set to use the optimised transform for the column
3551 optimised_cols = 1;
3552 }
3553 } else {
3554 // Run the un-optimised row transform
3555 for (i = 0; i < 4; ++i) {
3556 vpx_highbd_idct4_c(input, outptr, bd);
3557 input += 4;
3558 outptr += 4;
3559 }
3560 }
3561
3562 if (optimised_cols) {
3563 idct4_sse2(inptr);
3564
3565 // Final round and shift
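// Adding 8 before the arithmetic shift by 4 matches ROUND_POWER_OF_TWO(x, 4)
// on the C path.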
3566 inptr[0] = _mm_add_epi16(inptr[0], eight);
3567 inptr[1] = _mm_add_epi16(inptr[1], eight);
3568
3569 inptr[0] = _mm_srai_epi16(inptr[0], 4);
3570 inptr[1] = _mm_srai_epi16(inptr[1], 4);
3571
3572 // Reconstruction and Store
3573 {
3574 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3575 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3576 d0 = _mm_unpacklo_epi64(
3577 d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3578 d2 = _mm_unpacklo_epi64(
3579 d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3580 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3581 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
3582 // store input0
3583 _mm_storel_epi64((__m128i *)dest, d0);
3584 // store input1
3585 d0 = _mm_srli_si128(d0, 8);
3586 _mm_storel_epi64((__m128i *)(dest + stride), d0);
3587 // store input2
3588 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3589 // store input3
3590 d2 = _mm_srli_si128(d2, 8);
3591 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3592 }
3593 } else {
3594 // Run the un-optimised column transform
3595 tran_low_t temp_in[4], temp_out[4];
3596 // Columns
3597 for (i = 0; i < 4; ++i) {
3598 for (j = 0; j < 4; ++j)
3599 temp_in[j] = out[j * 4 + i];
3600 vpx_highbd_idct4_c(temp_in, temp_out, bd);
3601 for (j = 0; j < 4; ++j) {
3602 dest[j * stride + i] = highbd_clip_pixel_add(
3603 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3604 }
3605 }
3606 }
3607 }
3608
3609 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3610 int stride, int bd) {
3611 tran_low_t out[8 * 8];
3612 tran_low_t *outptr = out;
3613 int i, j, test;
3614 __m128i inptr[8];
3615 __m128i min_input, max_input, temp1, temp2, sign_bits;
3616 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3617 const __m128i zero = _mm_set1_epi16(0);
3618 const __m128i sixteen = _mm_set1_epi16(16);
3619 const __m128i max = _mm_set1_epi16(6201);
3620 const __m128i min = _mm_set1_epi16(-6201);
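// As in the 4x4 case, coefficients outside [-6201, 6201] could overflow the
// 16-bit intermediate math, so such blocks take the C fallback below.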
3621 int optimised_cols = 0;
3622
3623 // Load input into __m128i & pack to 16 bits
3624 for (i = 0; i < 8; i++) {
3625 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3626 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3627 inptr[i] = _mm_packs_epi32(temp1, temp2);
3628 }
3629
3630 // Find the min & max for the row transform
3631 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3632 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3633 for (i = 2; i < 8; i++) {
3634 max_input = _mm_max_epi16(max_input, inptr[i]);
3635 min_input = _mm_min_epi16(min_input, inptr[i]);
3636 }
3637 max_input = _mm_cmpgt_epi16(max_input, max);
3638 min_input = _mm_cmplt_epi16(min_input, min);
3639 temp1 = _mm_or_si128(max_input, min_input);
3640 test = _mm_movemask_epi8(temp1);
3641
3642 if (!test) {
3643 // Do the row transform
3644 idct8_sse2(inptr);
3645
3646 // Find the min & max for the column transform
3647 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3648 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3649 for (i = 2; i < 8; i++) {
3650 max_input = _mm_max_epi16(max_input, inptr[i]);
3651 min_input = _mm_min_epi16(min_input, inptr[i]);
3652 }
3653 max_input = _mm_cmpgt_epi16(max_input, max);
3654 min_input = _mm_cmplt_epi16(min_input, min);
3655 temp1 = _mm_or_si128(max_input, min_input);
3656 test = _mm_movemask_epi8(temp1);
3657
3658 if (test) {
3659 array_transpose_8x8(inptr, inptr);
3660 for (i = 0; i < 8; i++) {
3661 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3662 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3663 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3664 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3665 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3666 }
3667 } else {
3668 // Set to use the optimised transform for the column
3669 optimised_cols = 1;
3670 }
3671 } else {
3672 // Run the un-optimised row transform
3673 for (i = 0; i < 8; ++i) {
3674 vpx_highbd_idct8_c(input, outptr, bd);
3675 input += 8;
3676 outptr += 8;
3677 }
3678 }
3679
3680 if (optimised_cols) {
3681 idct8_sse2(inptr);
3682
3683 // Final round & shift, then reconstruction and store
3684 {
3685 __m128i d[8];
3686 for (i = 0; i < 8; i++) {
3687 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3688 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3689 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3690 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3691 // Store
3692 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
3693 }
3694 }
3695 } else {
3696 // Run the un-optimised column transform
3697 tran_low_t temp_in[8], temp_out[8];
3698 for (i = 0; i < 8; ++i) {
3699 for (j = 0; j < 8; ++j)
3700 temp_in[j] = out[j * 8 + i];
3701 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3702 for (j = 0; j < 8; ++j) {
3703 dest[j * stride + i] = highbd_clip_pixel_add(
3704 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3705 }
3706 }
3707 }
3708 }
3709
3710 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3711 int stride, int bd) {
3712 tran_low_t out[8 * 8] = { 0 };
3713 tran_low_t *outptr = out;
3714 int i, j, test;
3715 __m128i inptr[8];
3716 __m128i min_input, max_input, temp1, temp2, sign_bits;
3717 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3718 const __m128i zero = _mm_set1_epi16(0);
3719 const __m128i sixteen = _mm_set1_epi16(16);
3720 const __m128i max = _mm_set1_epi16(6201);
3721 const __m128i min = _mm_set1_epi16(-6201);
3722 int optimised_cols = 0;
3723
3724 // Load input into __m128i & pack to 16 bits
3725 for (i = 0; i < 8; i++) {
3726 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3727 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3728 inptr[i] = _mm_packs_epi32(temp1, temp2);
3729 }
3730
3731 // Find the min & max for the row transform
3732 // Only the first 4 rows have non-zero coefficients.
3733 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3734 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3735 for (i = 2; i < 4; i++) {
3736 max_input = _mm_max_epi16(max_input, inptr[i]);
3737 min_input = _mm_min_epi16(min_input, inptr[i]);
3738 }
3739 max_input = _mm_cmpgt_epi16(max_input, max);
3740 min_input = _mm_cmplt_epi16(min_input, min);
3741 temp1 = _mm_or_si128(max_input, min_input);
3742 test = _mm_movemask_epi8(temp1);
3743
3744 if (!test) {
3745 // Do the row transform
3746 idct8_sse2(inptr);
3747
3748 // Find the min & max for the column transform
3749 // N.B. Only the first 4 columns contain non-zero coefficients.
3750 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3751 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3752 for (i = 2; i < 8; i++) {
3753 max_input = _mm_max_epi16(max_input, inptr[i]);
3754 min_input = _mm_min_epi16(min_input, inptr[i]);
3755 }
3756 max_input = _mm_cmpgt_epi16(max_input, max);
3757 min_input = _mm_cmplt_epi16(min_input, min);
3758 temp1 = _mm_or_si128(max_input, min_input);
3759 test = _mm_movemask_epi8(temp1);
3760
3761 if (test) {
3762 // Use the fact that only the first 4 rows contain non-zero coefficients.
3763 array_transpose_4X8(inptr, inptr);
3764 for (i = 0; i < 4; i++) {
3765 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3766 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3767 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3768 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3769 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3770 }
3771 } else {
3772 // Set to use the optimised transform for the column
3773 optimised_cols = 1;
3774 }
3775 } else {
3776 // Run the un-optimised row transform
3777 for (i = 0; i < 4; ++i) {
3778 vpx_highbd_idct8_c(input, outptr, bd);
3779 input += 8;
3780 outptr += 8;
3781 }
3782 }
3783
3784 if (optimised_cols) {
3785 idct8_sse2(inptr);
3786
3787 // Final round & shift, then reconstruction and store
3788 {
3789 __m128i d[8];
3790 for (i = 0; i < 8; i++) {
3791 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3792 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3793 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3794 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3795 // Store
3796 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
3797 }
3798 }
3799 } else {
3800 // Run the un-optimised column transform
3801 tran_low_t temp_in[8], temp_out[8];
3802 for (i = 0; i < 8; ++i) {
3803 for (j = 0; j < 8; ++j)
3804 temp_in[j] = out[j * 8 + i];
3805 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3806 for (j = 0; j < 8; ++j) {
3807 dest[j * stride + i] = highbd_clip_pixel_add(
3808 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3809 }
3810 }
3811 }
3812 }
3813
3814 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3815 int stride, int bd) {
3816 tran_low_t out[16 * 16];
3817 tran_low_t *outptr = out;
3818 int i, j, test;
3819 __m128i inptr[32];
3820 __m128i min_input, max_input, temp1, temp2, sign_bits;
3821 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3822 const __m128i zero = _mm_set1_epi16(0);
3823 const __m128i rounding = _mm_set1_epi16(32);
3824 const __m128i max = _mm_set1_epi16(3155);
3825 const __m128i min = _mm_set1_epi16(-3155);
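// Overflow guard for the 16-bit path of the 16x16 transform; the safe range
// shrinks as the transform size (and hence intermediate growth) increases.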
3826 int optimised_cols = 0;
3827
3828 // Load input into __m128i & pack to 16 bits
3829 for (i = 0; i < 16; i++) {
3830 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3831 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3832 inptr[i] = _mm_packs_epi32(temp1, temp2);
3833 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3834 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3835 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3836 }
3837
3838 // Find the min & max for the row transform
3839 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3840 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3841 for (i = 2; i < 32; i++) {
3842 max_input = _mm_max_epi16(max_input, inptr[i]);
3843 min_input = _mm_min_epi16(min_input, inptr[i]);
3844 }
3845 max_input = _mm_cmpgt_epi16(max_input, max);
3846 min_input = _mm_cmplt_epi16(min_input, min);
3847 temp1 = _mm_or_si128(max_input, min_input);
3848 test = _mm_movemask_epi8(temp1);
3849
3850 if (!test) {
3851 // Do the row transform
3852 idct16_sse2(inptr, inptr + 16);
3853
3854 // Find the min & max for the column transform
3855 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3856 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3857 for (i = 2; i < 32; i++) {
3858 max_input = _mm_max_epi16(max_input, inptr[i]);
3859 min_input = _mm_min_epi16(min_input, inptr[i]);
3860 }
3861 max_input = _mm_cmpgt_epi16(max_input, max);
3862 min_input = _mm_cmplt_epi16(min_input, min);
3863 temp1 = _mm_or_si128(max_input, min_input);
3864 test = _mm_movemask_epi8(temp1);
3865
3866 if (test) {
3867 array_transpose_16x16(inptr, inptr + 16);
3868 for (i = 0; i < 16; i++) {
3869 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3870 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3871 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3872 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3873 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3874 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3875 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3876 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3877 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3878 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3879 }
3880 } else {
3881 // Set to use the optimised transform for the column
3882 optimised_cols = 1;
3883 }
3884 } else {
3885 // Run the un-optimised row transform
3886 for (i = 0; i < 16; ++i) {
3887 vpx_highbd_idct16_c(input, outptr, bd);
3888 input += 16;
3889 outptr += 16;
3890 }
3891 }
3892
3893 if (optimised_cols) {
3894 idct16_sse2(inptr, inptr + 16);
3895
3896 // Final round & shift, then reconstruction and store
3897 {
3898 __m128i d[2];
3899 for (i = 0; i < 16; i++) {
3900 inptr[i] = _mm_add_epi16(inptr[i], rounding);
3901 inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
3902 d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3903 d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
3904 inptr[i] = _mm_srai_epi16(inptr[i], 6);
3905 inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
3906 d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
3907 d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
3908 // Store
3909 _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
3910 _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
3911 }
3912 }
3913 } else {
3914 // Run the un-optimised column transform
3915 tran_low_t temp_in[16], temp_out[16];
3916 for (i = 0; i < 16; ++i) {
3917 for (j = 0; j < 16; ++j)
3918 temp_in[j] = out[j * 16 + i];
3919 vpx_highbd_idct16_c(temp_in, temp_out, bd);
3920 for (j = 0; j < 16; ++j) {
3921 dest[j * stride + i] = highbd_clip_pixel_add(
3922 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
3923 }
3924 }
3925 }
3926 }
3927
3928 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3929 int stride, int bd) {
3930 tran_low_t out[16 * 16] = { 0 };
3931 tran_low_t *outptr = out;
3932 int i, j, test;
3933 __m128i inptr[32];
3934 __m128i min_input, max_input, temp1, temp2, sign_bits;
3935 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3936 const __m128i zero = _mm_set1_epi16(0);
3937 const __m128i rounding = _mm_set1_epi16(32);
3938 const __m128i max = _mm_set1_epi16(3155);
3939 const __m128i min = _mm_set1_epi16(-3155);
3940 int optimised_cols = 0;
3941
3942 // Load input into __m128i & pack to 16 bits
3943 for (i = 0; i < 16; i++) {
3944 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3945 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3946 inptr[i] = _mm_packs_epi32(temp1, temp2);
3947 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3948 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3949 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3950 }
3951
3952 // Find the min & max for the row transform
3953 // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
3954 // we only need to consider the first 4 rows here.
3955 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3956 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3957 for (i = 2; i < 4; i++) {
3958 max_input = _mm_max_epi16(max_input, inptr[i]);
3959 min_input = _mm_min_epi16(min_input, inptr[i]);
3960 }
3961 max_input = _mm_cmpgt_epi16(max_input, max);
3962 min_input = _mm_cmplt_epi16(min_input, min);
3963 temp1 = _mm_or_si128(max_input, min_input);
3964 test = _mm_movemask_epi8(temp1);
3965
3966 if (!test) {
3967 // Do the row transform (N.B. This transposes inptr)
3968 idct16_sse2(inptr, inptr + 16);
3969
3970 // Find the min & max for the column transform
3971 // N.B. Only the first 4 columns contain non-zero coefficients.
3972 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3973 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3974 for (i = 2; i < 16; i++) {
3975 max_input = _mm_max_epi16(max_input, inptr[i]);
3976 min_input = _mm_min_epi16(min_input, inptr[i]);
3977 }
3978 max_input = _mm_cmpgt_epi16(max_input, max);
3979 min_input = _mm_cmplt_epi16(min_input, min);
3980 temp1 = _mm_or_si128(max_input, min_input);
3981 test = _mm_movemask_epi8(temp1);
3982
3983 if (test) {
3984 // Use the fact that only the first 4 rows contain non-zero coefficients.
3985 array_transpose_8x8(inptr, inptr);
3986 array_transpose_8x8(inptr + 8, inptr + 16);
3987 for (i = 0; i < 4; i++) {
3988 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3989 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3990 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3991 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3992 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3993 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3994 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3995 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3996 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3997 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3998 }
3999 } else {
4000 // Set to use the optimised transform for the column
4001 optimised_cols = 1;
4002 }
4003 } else {
4004 // Run the un-optimised row transform
4005 for (i = 0; i < 4; ++i) {
4006 vpx_highbd_idct16_c(input, outptr, bd);
4007 input += 16;
4008 outptr += 16;
4009 }
4010 }
4011
4012 if (optimised_cols) {
4013 idct16_sse2(inptr, inptr + 16);
4014
4015 // Final round & shift, then reconstruction and store
4016 {
4017 __m128i d[2];
4018 for (i = 0; i < 16; i++) {
4019 inptr[i] = _mm_add_epi16(inptr[i], rounding);
4020 inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
4021 d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
4022 d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
4023 inptr[i] = _mm_srai_epi16(inptr[i], 6);
4024 inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
4025 d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
4026 d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
4027 // Store
4028 _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
4029 _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
4030 }
4031 }
4032 } else {
4033 // Run the un-optimised column transform
4034 tran_low_t temp_in[16], temp_out[16];
4035 for (i = 0; i < 16; ++i) {
4036 for (j = 0; j < 16; ++j)
4037 temp_in[j] = out[j * 16 + i];
4038 vpx_highbd_idct16_c(temp_in, temp_out, bd);
4039 for (j = 0; j < 16; ++j) {
4040 dest[j * stride + i] = highbd_clip_pixel_add(
4041 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4042 }
4043 }
4044 }
4045 }
4046 #endif // CONFIG_VP9_HIGHBITDEPTH
4047