1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/txfm_common_sse2.h"
14 
15 #define RECON_AND_STORE4X4(dest, in_x) \
16 {                                                     \
17   __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
18   d0 = _mm_unpacklo_epi8(d0, zero); \
19   d0 = _mm_add_epi16(in_x, d0); \
20   d0 = _mm_packus_epi16(d0, d0); \
21   *(int *)(dest) = _mm_cvtsi128_si32(d0); \
22 }
23 
vpx_idct4x4_16_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)24 void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
25                              int stride) {
26   const __m128i zero = _mm_setzero_si128();
27   const __m128i eight = _mm_set1_epi16(8);
28   const __m128i cst = _mm_setr_epi16(
29       (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
30       (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
31       (int16_t)cospi_8_64, (int16_t)cospi_24_64);
32   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
33   __m128i input0, input1, input2, input3;
34 
35   // Rows
36   input0 = load_input_data(input);
37   input2 = load_input_data(input + 8);
38 
39   // Construct i3, i1, i3, i1, i2, i0, i2, i0
40   input0 = _mm_shufflelo_epi16(input0, 0xd8);
41   input0 = _mm_shufflehi_epi16(input0, 0xd8);
42   input2 = _mm_shufflelo_epi16(input2, 0xd8);
43   input2 = _mm_shufflehi_epi16(input2, 0xd8);
44 
45   input1 = _mm_unpackhi_epi32(input0, input0);
46   input0 = _mm_unpacklo_epi32(input0, input0);
47   input3 = _mm_unpackhi_epi32(input2, input2);
48   input2 = _mm_unpacklo_epi32(input2, input2);
49 
50   // Stage 1
51   input0 = _mm_madd_epi16(input0, cst);
52   input1 = _mm_madd_epi16(input1, cst);
53   input2 = _mm_madd_epi16(input2, cst);
54   input3 = _mm_madd_epi16(input3, cst);
55 
56   input0 = _mm_add_epi32(input0, rounding);
57   input1 = _mm_add_epi32(input1, rounding);
58   input2 = _mm_add_epi32(input2, rounding);
59   input3 = _mm_add_epi32(input3, rounding);
60 
61   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
62   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
63   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
64   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
65 
66   // Stage 2
67   input0 = _mm_packs_epi32(input0, input1);
68   input1 = _mm_packs_epi32(input2, input3);
69 
70   // Transpose
71   input2 = _mm_unpacklo_epi16(input0, input1);
72   input3 = _mm_unpackhi_epi16(input0, input1);
73   input0 = _mm_unpacklo_epi32(input2, input3);
74   input1 = _mm_unpackhi_epi32(input2, input3);
75 
76   // Switch column2, column 3, and then, we got:
77   // input2: column1, column 0;  input3: column2, column 3.
78   input1 = _mm_shuffle_epi32(input1, 0x4e);
79   input2 = _mm_add_epi16(input0, input1);
80   input3 = _mm_sub_epi16(input0, input1);
81 
82   // Columns
83   // Construct i3, i1, i3, i1, i2, i0, i2, i0
84   input0 = _mm_unpacklo_epi32(input2, input2);
85   input1 = _mm_unpackhi_epi32(input2, input2);
86   input2 = _mm_unpackhi_epi32(input3, input3);
87   input3 = _mm_unpacklo_epi32(input3, input3);
88 
89   // Stage 1
90   input0 = _mm_madd_epi16(input0, cst);
91   input1 = _mm_madd_epi16(input1, cst);
92   input2 = _mm_madd_epi16(input2, cst);
93   input3 = _mm_madd_epi16(input3, cst);
94 
95   input0 = _mm_add_epi32(input0, rounding);
96   input1 = _mm_add_epi32(input1, rounding);
97   input2 = _mm_add_epi32(input2, rounding);
98   input3 = _mm_add_epi32(input3, rounding);
99 
100   input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
101   input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
102   input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
103   input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
104 
105   // Stage 2
106   input0 = _mm_packs_epi32(input0, input2);
107   input1 = _mm_packs_epi32(input1, input3);
108 
109   // Transpose
110   input2 = _mm_unpacklo_epi16(input0, input1);
111   input3 = _mm_unpackhi_epi16(input0, input1);
112   input0 = _mm_unpacklo_epi32(input2, input3);
113   input1 = _mm_unpackhi_epi32(input2, input3);
114 
115   // Switch column2, column 3, and then, we got:
116   // input2: column1, column 0;  input3: column2, column 3.
117   input1 = _mm_shuffle_epi32(input1, 0x4e);
118   input2 = _mm_add_epi16(input0, input1);
119   input3 = _mm_sub_epi16(input0, input1);
120 
121   // Final round and shift
122   input2 = _mm_add_epi16(input2, eight);
123   input3 = _mm_add_epi16(input3, eight);
124 
125   input2 = _mm_srai_epi16(input2, 4);
126   input3 = _mm_srai_epi16(input3, 4);
127 
128   // Reconstruction and Store
129   {
130     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
131     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
132     d0 = _mm_unpacklo_epi32(d0,
133                             _mm_cvtsi32_si128(*(const int *)(dest + stride)));
134     d2 = _mm_unpacklo_epi32(
135         _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
136     d0 = _mm_unpacklo_epi8(d0, zero);
137     d2 = _mm_unpacklo_epi8(d2, zero);
138     d0 = _mm_add_epi16(d0, input2);
139     d2 = _mm_add_epi16(d2, input3);
140     d0 = _mm_packus_epi16(d0, d2);
141     // store input0
142     *(int *)dest = _mm_cvtsi128_si32(d0);
143     // store input1
144     d0 = _mm_srli_si128(d0, 4);
145     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
146     // store input2
147     d0 = _mm_srli_si128(d0, 4);
148     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
149     // store input3
150     d0 = _mm_srli_si128(d0, 4);
151     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
152   }
153 }
154 
vpx_idct4x4_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)155 void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
156                             int stride) {
157   __m128i dc_value;
158   const __m128i zero = _mm_setzero_si128();
159   int a;
160 
161   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
162   a = (int)dct_const_round_shift(a * cospi_16_64);
163   a = ROUND_POWER_OF_TWO(a, 4);
164 
165   dc_value = _mm_set1_epi16(a);
166 
167   RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
168   RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
169   RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
170   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
171 }
172 
transpose_4x4(__m128i * res)173 static INLINE void transpose_4x4(__m128i *res) {
174   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
175   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
176 
177   res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
178   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
179 }
180 
idct4_sse2(__m128i * in)181 void idct4_sse2(__m128i *in) {
182   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
183   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
184   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
185   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
186   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
187   __m128i u[8], v[8];
188 
189   transpose_4x4(in);
190   // stage 1
191   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
192   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
193   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
194   v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
195   v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
196   v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
197 
198   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
199   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
200   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
202 
203   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
204   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
206   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
207 
208   u[0] = _mm_packs_epi32(v[0], v[1]);
209   u[1] = _mm_packs_epi32(v[3], v[2]);
210 
211   // stage 2
212   in[0] = _mm_add_epi16(u[0], u[1]);
213   in[1] = _mm_sub_epi16(u[0], u[1]);
214   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
215 }
216 
iadst4_sse2(__m128i * in)217 void iadst4_sse2(__m128i *in) {
218   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
219   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
220   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
221   const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
222   const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
223   const __m128i kZero = _mm_set1_epi16(0);
224   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
225   __m128i u[8], v[8], in7;
226 
227   transpose_4x4(in);
228   in7 = _mm_srli_si128(in[1], 8);
229   in7 = _mm_add_epi16(in7, in[0]);
230   in7 = _mm_sub_epi16(in7, in[1]);
231 
232   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
233   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
234   u[2] = _mm_unpacklo_epi16(in7, kZero);
235   u[3] = _mm_unpackhi_epi16(in[0], kZero);
236 
237   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
238   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
239   v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
240   v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
241   v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
242   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
243 
244   u[0] = _mm_add_epi32(v[0], v[1]);
245   u[1] = _mm_add_epi32(v[3], v[4]);
246   u[2] = v[2];
247   u[3] = _mm_add_epi32(u[0], u[1]);
248   u[4] = _mm_slli_epi32(v[5], 2);
249   u[5] = _mm_add_epi32(u[3], v[5]);
250   u[6] = _mm_sub_epi32(u[5], u[4]);
251 
252   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
253   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
254   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
255   v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
256 
257   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
258   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
259   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
260   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
261 
262   in[0] = _mm_packs_epi32(u[0], u[1]);
263   in[1] = _mm_packs_epi32(u[2], u[3]);
264 }
265 
266 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
267                       out0, out1, out2, out3, out4, out5, out6, out7) \
268   {                                                     \
269     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
270     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
271     const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
272     const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
273     const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
274     const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
275     const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
276     const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
277                                                         \
278     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
279     const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
280     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
281     const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
282     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
283     const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
284     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
285     const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
286                                                             \
287     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
288     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
289     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
290     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
291     out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
292     out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
293     out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
294     out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
295   }
296 
297 #define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
298                          out0, out1, out2, out3) \
299   {                                              \
300     const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
301     const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
302     const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
303     const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
304     \
305     const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
306     const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
307     const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
308     const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
309     \
310     out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
311     out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
312     out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
313     out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
314   }
315 
316 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
317   {                                            \
318     const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
319     const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
320     out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
321     out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
322   }
323 
324 // Define Macro for multiplying elements by constants and adding them together.
325 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
326                                cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
327   {   \
328       tmp0 = _mm_madd_epi16(lo_0, cst0); \
329       tmp1 = _mm_madd_epi16(hi_0, cst0); \
330       tmp2 = _mm_madd_epi16(lo_0, cst1); \
331       tmp3 = _mm_madd_epi16(hi_0, cst1); \
332       tmp4 = _mm_madd_epi16(lo_1, cst2); \
333       tmp5 = _mm_madd_epi16(hi_1, cst2); \
334       tmp6 = _mm_madd_epi16(lo_1, cst3); \
335       tmp7 = _mm_madd_epi16(hi_1, cst3); \
336       \
337       tmp0 = _mm_add_epi32(tmp0, rounding); \
338       tmp1 = _mm_add_epi32(tmp1, rounding); \
339       tmp2 = _mm_add_epi32(tmp2, rounding); \
340       tmp3 = _mm_add_epi32(tmp3, rounding); \
341       tmp4 = _mm_add_epi32(tmp4, rounding); \
342       tmp5 = _mm_add_epi32(tmp5, rounding); \
343       tmp6 = _mm_add_epi32(tmp6, rounding); \
344       tmp7 = _mm_add_epi32(tmp7, rounding); \
345       \
346       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
347       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
348       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
349       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
350       tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
351       tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
352       tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
353       tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
354       \
355       res0 = _mm_packs_epi32(tmp0, tmp1); \
356       res1 = _mm_packs_epi32(tmp2, tmp3); \
357       res2 = _mm_packs_epi32(tmp4, tmp5); \
358       res3 = _mm_packs_epi32(tmp6, tmp7); \
359   }
360 
361 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
362   {   \
363       tmp0 = _mm_madd_epi16(lo_0, cst0); \
364       tmp1 = _mm_madd_epi16(hi_0, cst0); \
365       tmp2 = _mm_madd_epi16(lo_0, cst1); \
366       tmp3 = _mm_madd_epi16(hi_0, cst1); \
367       \
368       tmp0 = _mm_add_epi32(tmp0, rounding); \
369       tmp1 = _mm_add_epi32(tmp1, rounding); \
370       tmp2 = _mm_add_epi32(tmp2, rounding); \
371       tmp3 = _mm_add_epi32(tmp3, rounding); \
372       \
373       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
374       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
375       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
376       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
377       \
378       res0 = _mm_packs_epi32(tmp0, tmp1); \
379       res1 = _mm_packs_epi32(tmp2, tmp3); \
380   }
381 
382 #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
383               out0, out1, out2, out3, out4, out5, out6, out7)  \
384   { \
385   /* Stage1 */      \
386   { \
387     const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
388     const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
389     const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
390     const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
391     \
392     MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
393                           stg1_1, stg1_2, stg1_3, stp1_4,      \
394                           stp1_7, stp1_5, stp1_6)              \
395   } \
396     \
397   /* Stage2 */ \
398   { \
399     const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
400     const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
401     const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
402     const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
403     \
404     MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
405                            stg2_1, stg2_2, stg2_3, stp2_0,     \
406                            stp2_1, stp2_2, stp2_3)             \
407     \
408     stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
409     stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
410     stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
411     stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
412   } \
413     \
414   /* Stage3 */ \
415   { \
416     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
417     const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
418     \
419     stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
420     stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
421     stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
422     stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
423     \
424     tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
425     tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
426     tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
427     tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
428     \
429     tmp0 = _mm_add_epi32(tmp0, rounding); \
430     tmp1 = _mm_add_epi32(tmp1, rounding); \
431     tmp2 = _mm_add_epi32(tmp2, rounding); \
432     tmp3 = _mm_add_epi32(tmp3, rounding); \
433     \
434     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
435     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
436     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
437     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
438     \
439     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
440     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
441   } \
442   \
443   /* Stage4  */ \
444   out0 = _mm_adds_epi16(stp1_0, stp2_7); \
445   out1 = _mm_adds_epi16(stp1_1, stp1_6); \
446   out2 = _mm_adds_epi16(stp1_2, stp1_5); \
447   out3 = _mm_adds_epi16(stp1_3, stp2_4); \
448   out4 = _mm_subs_epi16(stp1_3, stp2_4); \
449   out5 = _mm_subs_epi16(stp1_2, stp1_5); \
450   out6 = _mm_subs_epi16(stp1_1, stp1_6); \
451   out7 = _mm_subs_epi16(stp1_0, stp2_7); \
452   }
453 
vpx_idct8x8_64_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)454 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
455                              int stride) {
456   const __m128i zero = _mm_setzero_si128();
457   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
458   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
459   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
460   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
461   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
462   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
463   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
464   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
465   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
466   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
467 
468   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
469   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
470   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
471   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
472   int i;
473 
474   // Load input data.
475   in0 = load_input_data(input);
476   in1 = load_input_data(input + 8 * 1);
477   in2 = load_input_data(input + 8 * 2);
478   in3 = load_input_data(input + 8 * 3);
479   in4 = load_input_data(input + 8 * 4);
480   in5 = load_input_data(input + 8 * 5);
481   in6 = load_input_data(input + 8 * 6);
482   in7 = load_input_data(input + 8 * 7);
483 
484   // 2-D
485   for (i = 0; i < 2; i++) {
486     // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
487     TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
488                   in0, in1, in2, in3, in4, in5, in6, in7);
489 
490     // 4-stage 1D idct8x8
491     IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
492           in0, in1, in2, in3, in4, in5, in6, in7);
493   }
494 
495   // Final rounding and shift
496   in0 = _mm_adds_epi16(in0, final_rounding);
497   in1 = _mm_adds_epi16(in1, final_rounding);
498   in2 = _mm_adds_epi16(in2, final_rounding);
499   in3 = _mm_adds_epi16(in3, final_rounding);
500   in4 = _mm_adds_epi16(in4, final_rounding);
501   in5 = _mm_adds_epi16(in5, final_rounding);
502   in6 = _mm_adds_epi16(in6, final_rounding);
503   in7 = _mm_adds_epi16(in7, final_rounding);
504 
505   in0 = _mm_srai_epi16(in0, 5);
506   in1 = _mm_srai_epi16(in1, 5);
507   in2 = _mm_srai_epi16(in2, 5);
508   in3 = _mm_srai_epi16(in3, 5);
509   in4 = _mm_srai_epi16(in4, 5);
510   in5 = _mm_srai_epi16(in5, 5);
511   in6 = _mm_srai_epi16(in6, 5);
512   in7 = _mm_srai_epi16(in7, 5);
513 
514   RECON_AND_STORE(dest + 0 * stride, in0);
515   RECON_AND_STORE(dest + 1 * stride, in1);
516   RECON_AND_STORE(dest + 2 * stride, in2);
517   RECON_AND_STORE(dest + 3 * stride, in3);
518   RECON_AND_STORE(dest + 4 * stride, in4);
519   RECON_AND_STORE(dest + 5 * stride, in5);
520   RECON_AND_STORE(dest + 6 * stride, in6);
521   RECON_AND_STORE(dest + 7 * stride, in7);
522 }
523 
vpx_idct8x8_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)524 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
525                             int stride) {
526   __m128i dc_value;
527   const __m128i zero = _mm_setzero_si128();
528   int a;
529 
530   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
531   a = (int)dct_const_round_shift(a * cospi_16_64);
532   a = ROUND_POWER_OF_TWO(a, 5);
533 
534   dc_value = _mm_set1_epi16(a);
535 
536   RECON_AND_STORE(dest + 0 * stride, dc_value);
537   RECON_AND_STORE(dest + 1 * stride, dc_value);
538   RECON_AND_STORE(dest + 2 * stride, dc_value);
539   RECON_AND_STORE(dest + 3 * stride, dc_value);
540   RECON_AND_STORE(dest + 4 * stride, dc_value);
541   RECON_AND_STORE(dest + 5 * stride, dc_value);
542   RECON_AND_STORE(dest + 6 * stride, dc_value);
543   RECON_AND_STORE(dest + 7 * stride, dc_value);
544 }
545 
idct8_sse2(__m128i * in)546 void idct8_sse2(__m128i *in) {
547   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
548   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
549   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
550   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
551   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
552   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
553   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
554   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
555   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
556 
557   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
558   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
559   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
560   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
561 
562   // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
563   TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
564                 in0, in1, in2, in3, in4, in5, in6, in7);
565 
566   // 4-stage 1D idct8x8
567   IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
568         in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
569 }
570 
iadst8_sse2(__m128i * in)571 void iadst8_sse2(__m128i *in) {
572   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
573   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
574   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
575   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
576   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
577   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
578   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
579   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
580   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
581   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
582   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
583   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
584   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
585   const __m128i k__const_0 = _mm_set1_epi16(0);
586   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
587 
588   __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
589   __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
590   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
591   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
592   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
593 
594   // transpose
595   array_transpose_8x8(in, in);
596 
597   // properly aligned for butterfly input
598   in0 = in[7];
599   in1 = in[0];
600   in2 = in[5];
601   in3 = in[2];
602   in4 = in[3];
603   in5 = in[4];
604   in6 = in[1];
605   in7 = in[6];
606 
607   // column transformation
608   // stage 1
609   // interleave and multiply/add into 32-bit integer
610   s0 = _mm_unpacklo_epi16(in0, in1);
611   s1 = _mm_unpackhi_epi16(in0, in1);
612   s2 = _mm_unpacklo_epi16(in2, in3);
613   s3 = _mm_unpackhi_epi16(in2, in3);
614   s4 = _mm_unpacklo_epi16(in4, in5);
615   s5 = _mm_unpackhi_epi16(in4, in5);
616   s6 = _mm_unpacklo_epi16(in6, in7);
617   s7 = _mm_unpackhi_epi16(in6, in7);
618 
619   u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
620   u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
621   u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
622   u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
623   u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
624   u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
625   u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
626   u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
627   u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
628   u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
629   u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
630   u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
631   u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
632   u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
633   u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
634   u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
635 
636   // addition
637   w0 = _mm_add_epi32(u0, u8);
638   w1 = _mm_add_epi32(u1, u9);
639   w2 = _mm_add_epi32(u2, u10);
640   w3 = _mm_add_epi32(u3, u11);
641   w4 = _mm_add_epi32(u4, u12);
642   w5 = _mm_add_epi32(u5, u13);
643   w6 = _mm_add_epi32(u6, u14);
644   w7 = _mm_add_epi32(u7, u15);
645   w8 = _mm_sub_epi32(u0, u8);
646   w9 = _mm_sub_epi32(u1, u9);
647   w10 = _mm_sub_epi32(u2, u10);
648   w11 = _mm_sub_epi32(u3, u11);
649   w12 = _mm_sub_epi32(u4, u12);
650   w13 = _mm_sub_epi32(u5, u13);
651   w14 = _mm_sub_epi32(u6, u14);
652   w15 = _mm_sub_epi32(u7, u15);
653 
654   // shift and rounding
655   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
656   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
657   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
658   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
659   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
660   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
661   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
662   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
663   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
664   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
665   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
666   v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
667   v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
668   v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
669   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
670   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
671 
672   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
673   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
674   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
675   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
676   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
677   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
678   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
679   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
680   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
681   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
682   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
683   u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
684   u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
685   u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
686   u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
687   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
688 
689   // back to 16-bit and pack 8 integers into __m128i
690   in[0] = _mm_packs_epi32(u0, u1);
691   in[1] = _mm_packs_epi32(u2, u3);
692   in[2] = _mm_packs_epi32(u4, u5);
693   in[3] = _mm_packs_epi32(u6, u7);
694   in[4] = _mm_packs_epi32(u8, u9);
695   in[5] = _mm_packs_epi32(u10, u11);
696   in[6] = _mm_packs_epi32(u12, u13);
697   in[7] = _mm_packs_epi32(u14, u15);
698 
699   // stage 2
700   s0 = _mm_add_epi16(in[0], in[2]);
701   s1 = _mm_add_epi16(in[1], in[3]);
702   s2 = _mm_sub_epi16(in[0], in[2]);
703   s3 = _mm_sub_epi16(in[1], in[3]);
704   u0 = _mm_unpacklo_epi16(in[4], in[5]);
705   u1 = _mm_unpackhi_epi16(in[4], in[5]);
706   u2 = _mm_unpacklo_epi16(in[6], in[7]);
707   u3 = _mm_unpackhi_epi16(in[6], in[7]);
708 
709   v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
710   v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
711   v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
712   v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
713   v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
714   v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
715   v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
716   v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
717 
718   w0 = _mm_add_epi32(v0, v4);
719   w1 = _mm_add_epi32(v1, v5);
720   w2 = _mm_add_epi32(v2, v6);
721   w3 = _mm_add_epi32(v3, v7);
722   w4 = _mm_sub_epi32(v0, v4);
723   w5 = _mm_sub_epi32(v1, v5);
724   w6 = _mm_sub_epi32(v2, v6);
725   w7 = _mm_sub_epi32(v3, v7);
726 
727   v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
728   v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
729   v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
730   v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
731   v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
732   v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
733   v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
734   v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
735 
736   u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
737   u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
738   u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
739   u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
740   u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
741   u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
742   u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
743   u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
744 
745   // back to 16-bit intergers
746   s4 = _mm_packs_epi32(u0, u1);
747   s5 = _mm_packs_epi32(u2, u3);
748   s6 = _mm_packs_epi32(u4, u5);
749   s7 = _mm_packs_epi32(u6, u7);
750 
751   // stage 3
752   u0 = _mm_unpacklo_epi16(s2, s3);
753   u1 = _mm_unpackhi_epi16(s2, s3);
754   u2 = _mm_unpacklo_epi16(s6, s7);
755   u3 = _mm_unpackhi_epi16(s6, s7);
756 
757   v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
758   v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
759   v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
760   v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
761   v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
762   v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
763   v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
764   v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
765 
766   u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
767   u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
768   u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
769   u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
770   u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
771   u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
772   u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
773   u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
774 
775   v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
776   v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
777   v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
778   v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
779   v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
780   v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
781   v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
782   v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
783 
784   s2 = _mm_packs_epi32(v0, v1);
785   s3 = _mm_packs_epi32(v2, v3);
786   s6 = _mm_packs_epi32(v4, v5);
787   s7 = _mm_packs_epi32(v6, v7);
788 
789   in[0] = s0;
790   in[1] = _mm_sub_epi16(k__const_0, s4);
791   in[2] = s6;
792   in[3] = _mm_sub_epi16(k__const_0, s2);
793   in[4] = s3;
794   in[5] = _mm_sub_epi16(k__const_0, s7);
795   in[6] = s5;
796   in[7] = _mm_sub_epi16(k__const_0, s1);
797 }
798 
vpx_idct8x8_12_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)799 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
800                              int stride) {
801   const __m128i zero = _mm_setzero_si128();
802   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
803   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
804   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
805   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
806   const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
807   const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
808   const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
809   const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
810   const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
811   const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
812   const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
813 
814   __m128i in0, in1, in2, in3, in4, in5, in6, in7;
815   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
816   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
817   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
818 
819   // Rows. Load 4-row input data.
820   in0 = load_input_data(input);
821   in1 = load_input_data(input + 8 * 1);
822   in2 = load_input_data(input + 8 * 2);
823   in3 = load_input_data(input + 8 * 3);
824 
825   // 8x4 Transpose
826   TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
827   // Stage1
828   {
829     const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
830     const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
831 
832     tmp0 = _mm_madd_epi16(lo_17, stg1_0);
833     tmp2 = _mm_madd_epi16(lo_17, stg1_1);
834     tmp4 = _mm_madd_epi16(lo_35, stg1_2);
835     tmp6 = _mm_madd_epi16(lo_35, stg1_3);
836 
837     tmp0 = _mm_add_epi32(tmp0, rounding);
838     tmp2 = _mm_add_epi32(tmp2, rounding);
839     tmp4 = _mm_add_epi32(tmp4, rounding);
840     tmp6 = _mm_add_epi32(tmp6, rounding);
841     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
842     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
843     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
844     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
845 
846     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
847     stp1_5 = _mm_packs_epi32(tmp4, tmp6);
848   }
849 
850   // Stage2
851   {
852     const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
853     const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
854 
855     tmp0 = _mm_madd_epi16(lo_04, stg2_0);
856     tmp2 = _mm_madd_epi16(lo_04, stg2_1);
857     tmp4 = _mm_madd_epi16(lo_26, stg2_2);
858     tmp6 = _mm_madd_epi16(lo_26, stg2_3);
859 
860     tmp0 = _mm_add_epi32(tmp0, rounding);
861     tmp2 = _mm_add_epi32(tmp2, rounding);
862     tmp4 = _mm_add_epi32(tmp4, rounding);
863     tmp6 = _mm_add_epi32(tmp6, rounding);
864     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
865     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
866     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
867     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
868 
869     stp2_0 = _mm_packs_epi32(tmp0, tmp2);
870     stp2_2 = _mm_packs_epi32(tmp6, tmp4);
871 
872     tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
873     tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
874 
875     stp2_4 = tmp0;
876     stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
877     stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
878   }
879 
880   // Stage3
881   {
882     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
883 
884     tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
885     tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
886 
887     stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
888     stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
889 
890     tmp0 = _mm_madd_epi16(lo_56, stg3_0);
891     tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
892 
893     tmp0 = _mm_add_epi32(tmp0, rounding);
894     tmp2 = _mm_add_epi32(tmp2, rounding);
895     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
896     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
897 
898     stp1_5 = _mm_packs_epi32(tmp0, tmp2);
899   }
900 
901   // Stage4
902   tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
903   tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
904   tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
905   tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
906 
907   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
908 
909   IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
910         in0, in1, in2, in3, in4, in5, in6, in7);
911   // Final rounding and shift
912   in0 = _mm_adds_epi16(in0, final_rounding);
913   in1 = _mm_adds_epi16(in1, final_rounding);
914   in2 = _mm_adds_epi16(in2, final_rounding);
915   in3 = _mm_adds_epi16(in3, final_rounding);
916   in4 = _mm_adds_epi16(in4, final_rounding);
917   in5 = _mm_adds_epi16(in5, final_rounding);
918   in6 = _mm_adds_epi16(in6, final_rounding);
919   in7 = _mm_adds_epi16(in7, final_rounding);
920 
921   in0 = _mm_srai_epi16(in0, 5);
922   in1 = _mm_srai_epi16(in1, 5);
923   in2 = _mm_srai_epi16(in2, 5);
924   in3 = _mm_srai_epi16(in3, 5);
925   in4 = _mm_srai_epi16(in4, 5);
926   in5 = _mm_srai_epi16(in5, 5);
927   in6 = _mm_srai_epi16(in6, 5);
928   in7 = _mm_srai_epi16(in7, 5);
929 
930   RECON_AND_STORE(dest + 0 * stride, in0);
931   RECON_AND_STORE(dest + 1 * stride, in1);
932   RECON_AND_STORE(dest + 2 * stride, in2);
933   RECON_AND_STORE(dest + 3 * stride, in3);
934   RECON_AND_STORE(dest + 4 * stride, in4);
935   RECON_AND_STORE(dest + 5 * stride, in5);
936   RECON_AND_STORE(dest + 6 * stride, in6);
937   RECON_AND_STORE(dest + 7 * stride, in7);
938 }
939 
940 #define IDCT16 \
941   /* Stage2 */ \
942   { \
943     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
944     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
945     const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);   \
946     const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);   \
947     const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
948     const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
949     const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
950     const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
951     \
952     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
953                            stg2_0, stg2_1, stg2_2, stg2_3, \
954                            stp2_8, stp2_15, stp2_9, stp2_14) \
955     \
956     MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
957                            stg2_4, stg2_5, stg2_6, stg2_7, \
958                            stp2_10, stp2_13, stp2_11, stp2_12) \
959   } \
960     \
961   /* Stage3 */ \
962   { \
963     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
964     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
965     const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
966     const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
967     \
968     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
969                            stg3_0, stg3_1, stg3_2, stg3_3, \
970                            stp1_4, stp1_7, stp1_5, stp1_6) \
971     \
972     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \
973     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \
974     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
975     stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
976     \
977     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
978     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
979     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
980     stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
981   } \
982   \
983   /* Stage4 */ \
984   { \
985     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
986     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
987     const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
988     const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
989     \
990     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
991     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
992     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
993     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
994     \
995     MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
996                            stg4_0, stg4_1, stg4_2, stg4_3, \
997                            stp2_0, stp2_1, stp2_2, stp2_3) \
998     \
999     stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
1000     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
1001     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
1002     stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
1003     \
1004     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1005                            stg4_4, stg4_5, stg4_6, stg4_7, \
1006                            stp2_9, stp2_14, stp2_10, stp2_13) \
1007   } \
1008     \
1009   /* Stage5 */ \
1010   { \
1011     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1012     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1013     \
1014     stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1015     stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1016     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1017     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1018     \
1019     tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1020     tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1021     tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1022     tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1023     \
1024     tmp0 = _mm_add_epi32(tmp0, rounding); \
1025     tmp1 = _mm_add_epi32(tmp1, rounding); \
1026     tmp2 = _mm_add_epi32(tmp2, rounding); \
1027     tmp3 = _mm_add_epi32(tmp3, rounding); \
1028     \
1029     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1030     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1031     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1032     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1033     \
1034     stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1035     stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1036     \
1037     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1038     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1039     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1040     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1041     \
1042     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1043     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1044     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1045     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1046   } \
1047     \
1048   /* Stage6 */ \
1049   { \
1050     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1051     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1052     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1053     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1054     \
1055     stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1056     stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1057     stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1058     stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1059     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1060     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1061     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1062     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1063     \
1064     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1065                            stg6_0, stg4_0, stg6_0, stg4_0, \
1066                            stp2_10, stp2_13, stp2_11, stp2_12) \
1067   }
1068 
1069 #define IDCT16_10 \
1070     /* Stage2 */ \
1071     { \
1072       const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1073       const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1074       const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1075       const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1076       \
1077       MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
1078                              stg2_0, stg2_1, stg2_6, stg2_7, \
1079                              stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
1080     } \
1081       \
1082     /* Stage3 */ \
1083     { \
1084       const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1085       const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1086       \
1087       MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
1088                                stg3_0, stg3_1,  \
1089                                stp2_4, stp2_7) \
1090       \
1091       stp1_9  =  stp1_8_0; \
1092       stp1_10 =  stp1_11;  \
1093       \
1094       stp1_13 = stp1_12_0; \
1095       stp1_14 = stp1_15;   \
1096     } \
1097     \
1098     /* Stage4 */ \
1099     { \
1100       const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1101       const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1102       \
1103       const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1104       const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1105       const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1106       const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1107       \
1108       MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
1109                                stg4_0, stg4_1, \
1110                                stp1_0, stp1_1) \
1111       stp2_5 = stp2_4; \
1112       stp2_6 = stp2_7; \
1113       \
1114       MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
1115                              stg4_4, stg4_5, stg4_6, stg4_7, \
1116                              stp2_9, stp2_14, stp2_10, stp2_13) \
1117     } \
1118       \
1119     /* Stage5 */ \
1120     { \
1121       const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1122       const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1123       \
1124       stp1_2 = stp1_1; \
1125       stp1_3 = stp1_0; \
1126       \
1127       tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1128       tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1129       tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1130       tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1131       \
1132       tmp0 = _mm_add_epi32(tmp0, rounding); \
1133       tmp1 = _mm_add_epi32(tmp1, rounding); \
1134       tmp2 = _mm_add_epi32(tmp2, rounding); \
1135       tmp3 = _mm_add_epi32(tmp3, rounding); \
1136       \
1137       tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1138       tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1139       tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1140       tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1141       \
1142       stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1143       stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1144       \
1145       stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \
1146       stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \
1147       stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \
1148       stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1149       \
1150       stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1151       stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \
1152       stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \
1153       stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1154     } \
1155       \
1156     /* Stage6 */ \
1157     { \
1158       const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1159       const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1160       const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1161       const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1162       \
1163       stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1164       stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1165       stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1166       stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1167       stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1168       stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1169       stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1170       stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1171       \
1172       MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
1173                              stg6_0, stg4_0, stg6_0, stg4_0, \
1174                              stp2_10, stp2_13, stp2_11, stp2_12) \
1175     }
1176 
vpx_idct16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1177 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
1178                                 int stride) {
1179   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1180   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1181   const __m128i zero = _mm_setzero_si128();
1182 
1183   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1184   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1185   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1186   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1187   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1188   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1189   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1190   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1191 
1192   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1193   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1194   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1195   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1196 
1197   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1198   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1199   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1200   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1201   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1202   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1203   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1204   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1205 
1206   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1207 
1208   __m128i in[16], l[16], r[16], *curr1;
1209   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1210           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1211           stp1_8_0, stp1_12_0;
1212   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1213           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1214   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1215   int i;
1216 
1217   curr1 = l;
1218   for (i = 0; i < 2; i++) {
1219     // 1-D idct
1220 
1221     // Load input data.
1222     in[0] = load_input_data(input);
1223     in[8] = load_input_data(input + 8 * 1);
1224     in[1] = load_input_data(input + 8 * 2);
1225     in[9] = load_input_data(input + 8 * 3);
1226     in[2] = load_input_data(input + 8 * 4);
1227     in[10] = load_input_data(input + 8 * 5);
1228     in[3] = load_input_data(input + 8 * 6);
1229     in[11] = load_input_data(input + 8 * 7);
1230     in[4] = load_input_data(input + 8 * 8);
1231     in[12] = load_input_data(input + 8 * 9);
1232     in[5] = load_input_data(input + 8 * 10);
1233     in[13] = load_input_data(input + 8 * 11);
1234     in[6] = load_input_data(input + 8 * 12);
1235     in[14] = load_input_data(input + 8 * 13);
1236     in[7] = load_input_data(input + 8 * 14);
1237     in[15] = load_input_data(input + 8 * 15);
1238 
1239     array_transpose_8x8(in, in);
1240     array_transpose_8x8(in + 8, in + 8);
1241 
1242     IDCT16
1243 
1244     // Stage7
1245     curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1246     curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1247     curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1248     curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1249     curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1250     curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1251     curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1252     curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1253     curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1254     curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1255     curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1256     curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1257     curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1258     curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1259     curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1260     curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1261 
1262     curr1 = r;
1263     input += 128;
1264   }
1265   for (i = 0; i < 2; i++) {
1266     int j;
1267     // 1-D idct
1268     array_transpose_8x8(l + i * 8, in);
1269     array_transpose_8x8(r + i * 8, in + 8);
1270 
1271     IDCT16
1272 
1273     // 2-D
1274     in[0] = _mm_add_epi16(stp2_0, stp1_15);
1275     in[1] = _mm_add_epi16(stp2_1, stp1_14);
1276     in[2] = _mm_add_epi16(stp2_2, stp2_13);
1277     in[3] = _mm_add_epi16(stp2_3, stp2_12);
1278     in[4] = _mm_add_epi16(stp2_4, stp2_11);
1279     in[5] = _mm_add_epi16(stp2_5, stp2_10);
1280     in[6] = _mm_add_epi16(stp2_6, stp1_9);
1281     in[7] = _mm_add_epi16(stp2_7, stp1_8);
1282     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1283     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1284     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1285     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1286     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1287     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1288     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1289     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1290 
1291     for (j = 0; j < 16; ++j) {
1292       // Final rounding and shift
1293       in[j] = _mm_adds_epi16(in[j], final_rounding);
1294       in[j] = _mm_srai_epi16(in[j], 6);
1295       RECON_AND_STORE(dest + j * stride, in[j]);
1296     }
1297 
1298     dest += 8;
1299   }
1300 }
1301 
vpx_idct16x16_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)1302 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1303                               int stride) {
1304   __m128i dc_value;
1305   const __m128i zero = _mm_setzero_si128();
1306   int a, i;
1307 
1308   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
1309   a = (int)dct_const_round_shift(a * cospi_16_64);
1310   a = ROUND_POWER_OF_TWO(a, 6);
1311 
1312   dc_value = _mm_set1_epi16(a);
1313 
1314   for (i = 0; i < 16; ++i) {
1315     RECON_AND_STORE(dest +  0, dc_value);
1316     RECON_AND_STORE(dest +  8, dc_value);
1317     dest += stride;
1318   }
1319 }
1320 
iadst16_8col(__m128i * in)1321 static void iadst16_8col(__m128i *in) {
1322   // perform 16x16 1-D ADST for 8 columns
1323   __m128i s[16], x[16], u[32], v[32];
1324   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1325   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1326   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1327   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1328   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1329   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1330   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1331   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1332   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1333   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1334   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1335   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1336   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1337   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1338   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1339   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1340   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1341   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1342   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1343   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1344   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1345   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1346   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1347   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1348   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1349   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1350   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1351   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1352   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1353   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1354   const __m128i kZero = _mm_set1_epi16(0);
1355 
1356   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1357   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1358   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1359   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1360   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1361   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1362   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1363   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1364   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1365   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1366   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1367   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1368   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1369   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1370   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1371   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1372 
1373   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1374   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1375   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1376   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1377   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1378   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1379   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1380   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1381   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1382   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1383   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1384   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1385   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1386   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1387   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1388   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1389   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1390   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1391   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1392   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1393   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1394   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1395   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1396   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1397   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1398   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1399   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1400   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1401   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1402   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1403   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1404   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1405 
1406   u[0] = _mm_add_epi32(v[0], v[16]);
1407   u[1] = _mm_add_epi32(v[1], v[17]);
1408   u[2] = _mm_add_epi32(v[2], v[18]);
1409   u[3] = _mm_add_epi32(v[3], v[19]);
1410   u[4] = _mm_add_epi32(v[4], v[20]);
1411   u[5] = _mm_add_epi32(v[5], v[21]);
1412   u[6] = _mm_add_epi32(v[6], v[22]);
1413   u[7] = _mm_add_epi32(v[7], v[23]);
1414   u[8] = _mm_add_epi32(v[8], v[24]);
1415   u[9] = _mm_add_epi32(v[9], v[25]);
1416   u[10] = _mm_add_epi32(v[10], v[26]);
1417   u[11] = _mm_add_epi32(v[11], v[27]);
1418   u[12] = _mm_add_epi32(v[12], v[28]);
1419   u[13] = _mm_add_epi32(v[13], v[29]);
1420   u[14] = _mm_add_epi32(v[14], v[30]);
1421   u[15] = _mm_add_epi32(v[15], v[31]);
1422   u[16] = _mm_sub_epi32(v[0], v[16]);
1423   u[17] = _mm_sub_epi32(v[1], v[17]);
1424   u[18] = _mm_sub_epi32(v[2], v[18]);
1425   u[19] = _mm_sub_epi32(v[3], v[19]);
1426   u[20] = _mm_sub_epi32(v[4], v[20]);
1427   u[21] = _mm_sub_epi32(v[5], v[21]);
1428   u[22] = _mm_sub_epi32(v[6], v[22]);
1429   u[23] = _mm_sub_epi32(v[7], v[23]);
1430   u[24] = _mm_sub_epi32(v[8], v[24]);
1431   u[25] = _mm_sub_epi32(v[9], v[25]);
1432   u[26] = _mm_sub_epi32(v[10], v[26]);
1433   u[27] = _mm_sub_epi32(v[11], v[27]);
1434   u[28] = _mm_sub_epi32(v[12], v[28]);
1435   u[29] = _mm_sub_epi32(v[13], v[29]);
1436   u[30] = _mm_sub_epi32(v[14], v[30]);
1437   u[31] = _mm_sub_epi32(v[15], v[31]);
1438 
1439   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1440   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1441   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1442   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1443   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1444   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1445   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1446   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1447   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1448   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1449   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1450   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1451   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1452   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1453   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1454   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1455   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1456   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1457   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1458   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1459   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1460   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1461   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1462   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1463   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1464   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1465   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1466   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1467   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1468   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1469   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1470   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1471 
1472   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1473   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1474   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1475   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1476   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1477   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1478   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1479   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1480   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1481   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1482   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1483   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1484   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1485   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1486   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1487   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1488   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1489   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1490   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1491   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1492   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1493   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1494   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1495   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1496   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1497   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1498   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1499   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1500   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1501   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1502   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1503   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1504 
1505   s[0] = _mm_packs_epi32(u[0], u[1]);
1506   s[1] = _mm_packs_epi32(u[2], u[3]);
1507   s[2] = _mm_packs_epi32(u[4], u[5]);
1508   s[3] = _mm_packs_epi32(u[6], u[7]);
1509   s[4] = _mm_packs_epi32(u[8], u[9]);
1510   s[5] = _mm_packs_epi32(u[10], u[11]);
1511   s[6] = _mm_packs_epi32(u[12], u[13]);
1512   s[7] = _mm_packs_epi32(u[14], u[15]);
1513   s[8] = _mm_packs_epi32(u[16], u[17]);
1514   s[9] = _mm_packs_epi32(u[18], u[19]);
1515   s[10] = _mm_packs_epi32(u[20], u[21]);
1516   s[11] = _mm_packs_epi32(u[22], u[23]);
1517   s[12] = _mm_packs_epi32(u[24], u[25]);
1518   s[13] = _mm_packs_epi32(u[26], u[27]);
1519   s[14] = _mm_packs_epi32(u[28], u[29]);
1520   s[15] = _mm_packs_epi32(u[30], u[31]);
1521 
1522   // stage 2
1523   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1524   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1525   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1526   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1527   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1528   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1529   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1530   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1531 
1532   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1533   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1534   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1535   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1536   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1537   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1538   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1539   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1540   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1541   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1542   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1543   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1544   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1545   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1546   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1547   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1548 
1549   u[0] = _mm_add_epi32(v[0], v[8]);
1550   u[1] = _mm_add_epi32(v[1], v[9]);
1551   u[2] = _mm_add_epi32(v[2], v[10]);
1552   u[3] = _mm_add_epi32(v[3], v[11]);
1553   u[4] = _mm_add_epi32(v[4], v[12]);
1554   u[5] = _mm_add_epi32(v[5], v[13]);
1555   u[6] = _mm_add_epi32(v[6], v[14]);
1556   u[7] = _mm_add_epi32(v[7], v[15]);
1557   u[8] = _mm_sub_epi32(v[0], v[8]);
1558   u[9] = _mm_sub_epi32(v[1], v[9]);
1559   u[10] = _mm_sub_epi32(v[2], v[10]);
1560   u[11] = _mm_sub_epi32(v[3], v[11]);
1561   u[12] = _mm_sub_epi32(v[4], v[12]);
1562   u[13] = _mm_sub_epi32(v[5], v[13]);
1563   u[14] = _mm_sub_epi32(v[6], v[14]);
1564   u[15] = _mm_sub_epi32(v[7], v[15]);
1565 
1566   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1567   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1568   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1569   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1570   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1571   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1572   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1573   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1574   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1575   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1576   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1577   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1578   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1579   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1580   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1581   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1582 
1583   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1584   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1585   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1586   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1587   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1588   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1589   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1590   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1591   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1592   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1593   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1594   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1595   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1596   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1597   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1598   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1599 
1600   x[0] = _mm_add_epi16(s[0], s[4]);
1601   x[1] = _mm_add_epi16(s[1], s[5]);
1602   x[2] = _mm_add_epi16(s[2], s[6]);
1603   x[3] = _mm_add_epi16(s[3], s[7]);
1604   x[4] = _mm_sub_epi16(s[0], s[4]);
1605   x[5] = _mm_sub_epi16(s[1], s[5]);
1606   x[6] = _mm_sub_epi16(s[2], s[6]);
1607   x[7] = _mm_sub_epi16(s[3], s[7]);
1608   x[8] = _mm_packs_epi32(u[0], u[1]);
1609   x[9] = _mm_packs_epi32(u[2], u[3]);
1610   x[10] = _mm_packs_epi32(u[4], u[5]);
1611   x[11] = _mm_packs_epi32(u[6], u[7]);
1612   x[12] = _mm_packs_epi32(u[8], u[9]);
1613   x[13] = _mm_packs_epi32(u[10], u[11]);
1614   x[14] = _mm_packs_epi32(u[12], u[13]);
1615   x[15] = _mm_packs_epi32(u[14], u[15]);
1616 
1617   // stage 3
1618   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1619   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1620   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1621   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1622   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1623   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1624   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1625   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1626 
1627   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1628   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1629   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1630   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1631   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1632   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1633   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1634   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1635   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1636   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1637   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1638   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1639   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1640   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1641   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1642   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1643 
1644   u[0] = _mm_add_epi32(v[0], v[4]);
1645   u[1] = _mm_add_epi32(v[1], v[5]);
1646   u[2] = _mm_add_epi32(v[2], v[6]);
1647   u[3] = _mm_add_epi32(v[3], v[7]);
1648   u[4] = _mm_sub_epi32(v[0], v[4]);
1649   u[5] = _mm_sub_epi32(v[1], v[5]);
1650   u[6] = _mm_sub_epi32(v[2], v[6]);
1651   u[7] = _mm_sub_epi32(v[3], v[7]);
1652   u[8] = _mm_add_epi32(v[8], v[12]);
1653   u[9] = _mm_add_epi32(v[9], v[13]);
1654   u[10] = _mm_add_epi32(v[10], v[14]);
1655   u[11] = _mm_add_epi32(v[11], v[15]);
1656   u[12] = _mm_sub_epi32(v[8], v[12]);
1657   u[13] = _mm_sub_epi32(v[9], v[13]);
1658   u[14] = _mm_sub_epi32(v[10], v[14]);
1659   u[15] = _mm_sub_epi32(v[11], v[15]);
1660 
1661   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1662   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1663   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1664   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1665   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1666   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1667   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1668   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1669   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1670   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1671   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1672   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1673   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1674   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1675   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1676   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1677 
1678   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1679   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1680   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1681   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1682   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1683   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1684   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1685   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1686   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1687   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1688   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1689   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1690   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1691   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1692   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1693   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1694 
1695   s[0] = _mm_add_epi16(x[0], x[2]);
1696   s[1] = _mm_add_epi16(x[1], x[3]);
1697   s[2] = _mm_sub_epi16(x[0], x[2]);
1698   s[3] = _mm_sub_epi16(x[1], x[3]);
1699   s[4] = _mm_packs_epi32(v[0], v[1]);
1700   s[5] = _mm_packs_epi32(v[2], v[3]);
1701   s[6] = _mm_packs_epi32(v[4], v[5]);
1702   s[7] = _mm_packs_epi32(v[6], v[7]);
1703   s[8] = _mm_add_epi16(x[8], x[10]);
1704   s[9] = _mm_add_epi16(x[9], x[11]);
1705   s[10] = _mm_sub_epi16(x[8], x[10]);
1706   s[11] = _mm_sub_epi16(x[9], x[11]);
1707   s[12] = _mm_packs_epi32(v[8], v[9]);
1708   s[13] = _mm_packs_epi32(v[10], v[11]);
1709   s[14] = _mm_packs_epi32(v[12], v[13]);
1710   s[15] = _mm_packs_epi32(v[14], v[15]);
1711 
1712   // stage 4
1713   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1714   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1715   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1716   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1717   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1718   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1719   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1720   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1721 
1722   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1723   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1724   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1725   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1726   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1727   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1728   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1729   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1730   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1731   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1732   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1733   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1734   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1735   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1736   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1737   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1738 
1739   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1740   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1741   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1742   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1743   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1744   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1745   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1746   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1747   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1748   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1749   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1750   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1751   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1752   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1753   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1754   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1755 
1756   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1757   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1758   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1759   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1760   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1761   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1762   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1763   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1764   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1765   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1766   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1767   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1768   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1769   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1770   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1771   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1772 
1773   in[0] = s[0];
1774   in[1] = _mm_sub_epi16(kZero, s[8]);
1775   in[2] = s[12];
1776   in[3] = _mm_sub_epi16(kZero, s[4]);
1777   in[4] = _mm_packs_epi32(v[4], v[5]);
1778   in[5] = _mm_packs_epi32(v[12], v[13]);
1779   in[6] = _mm_packs_epi32(v[8], v[9]);
1780   in[7] = _mm_packs_epi32(v[0], v[1]);
1781   in[8] = _mm_packs_epi32(v[2], v[3]);
1782   in[9] = _mm_packs_epi32(v[10], v[11]);
1783   in[10] = _mm_packs_epi32(v[14], v[15]);
1784   in[11] = _mm_packs_epi32(v[6], v[7]);
1785   in[12] = s[5];
1786   in[13] = _mm_sub_epi16(kZero, s[13]);
1787   in[14] = s[9];
1788   in[15] = _mm_sub_epi16(kZero, s[1]);
1789 }
1790 
idct16_8col(__m128i * in)1791 static void idct16_8col(__m128i *in) {
1792   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1793   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
1794   const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1795   const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
1796   const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1797   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
1798   const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1799   const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
1800   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1801   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1802   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1803   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1804   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1805   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1806   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1807   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1808   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1809   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
1810   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1811   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1812   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1813   __m128i v[16], u[16], s[16], t[16];
1814 
1815   // stage 1
1816   s[0] = in[0];
1817   s[1] = in[8];
1818   s[2] = in[4];
1819   s[3] = in[12];
1820   s[4] = in[2];
1821   s[5] = in[10];
1822   s[6] = in[6];
1823   s[7] = in[14];
1824   s[8] = in[1];
1825   s[9] = in[9];
1826   s[10] = in[5];
1827   s[11] = in[13];
1828   s[12] = in[3];
1829   s[13] = in[11];
1830   s[14] = in[7];
1831   s[15] = in[15];
1832 
1833   // stage 2
1834   u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1835   u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1836   u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1837   u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1838   u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1839   u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1840   u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1841   u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1842 
1843   v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
1844   v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
1845   v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
1846   v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
1847   v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
1848   v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
1849   v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
1850   v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
1851   v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
1852   v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
1853   v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
1854   v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
1855   v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
1856   v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
1857   v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
1858   v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
1859 
1860   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1861   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1862   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1863   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1864   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1865   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1866   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1867   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1868   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1869   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1870   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1871   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1872   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1873   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1874   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1875   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1876 
1877   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1878   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1879   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1880   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1881   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1882   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1883   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1884   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1885   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1886   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1887   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1888   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1889   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1890   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1891   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1892   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1893 
1894   s[8]  = _mm_packs_epi32(u[0], u[1]);
1895   s[15] = _mm_packs_epi32(u[2], u[3]);
1896   s[9]  = _mm_packs_epi32(u[4], u[5]);
1897   s[14] = _mm_packs_epi32(u[6], u[7]);
1898   s[10] = _mm_packs_epi32(u[8], u[9]);
1899   s[13] = _mm_packs_epi32(u[10], u[11]);
1900   s[11] = _mm_packs_epi32(u[12], u[13]);
1901   s[12] = _mm_packs_epi32(u[14], u[15]);
1902 
1903   // stage 3
1904   t[0] = s[0];
1905   t[1] = s[1];
1906   t[2] = s[2];
1907   t[3] = s[3];
1908   u[0] = _mm_unpacklo_epi16(s[4], s[7]);
1909   u[1] = _mm_unpackhi_epi16(s[4], s[7]);
1910   u[2] = _mm_unpacklo_epi16(s[5], s[6]);
1911   u[3] = _mm_unpackhi_epi16(s[5], s[6]);
1912 
1913   v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1914   v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1915   v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1916   v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1917   v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1918   v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1919   v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1920   v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1921 
1922   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1923   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1924   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1925   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1926   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1927   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1928   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1929   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1930 
1931   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1932   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1933   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1934   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1935   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1936   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1937   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1938   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1939 
1940   t[4] = _mm_packs_epi32(u[0], u[1]);
1941   t[7] = _mm_packs_epi32(u[2], u[3]);
1942   t[5] = _mm_packs_epi32(u[4], u[5]);
1943   t[6] = _mm_packs_epi32(u[6], u[7]);
1944   t[8] = _mm_add_epi16(s[8], s[9]);
1945   t[9] = _mm_sub_epi16(s[8], s[9]);
1946   t[10] = _mm_sub_epi16(s[11], s[10]);
1947   t[11] = _mm_add_epi16(s[10], s[11]);
1948   t[12] = _mm_add_epi16(s[12], s[13]);
1949   t[13] = _mm_sub_epi16(s[12], s[13]);
1950   t[14] = _mm_sub_epi16(s[15], s[14]);
1951   t[15] = _mm_add_epi16(s[14], s[15]);
1952 
1953   // stage 4
1954   u[0] = _mm_unpacklo_epi16(t[0], t[1]);
1955   u[1] = _mm_unpackhi_epi16(t[0], t[1]);
1956   u[2] = _mm_unpacklo_epi16(t[2], t[3]);
1957   u[3] = _mm_unpackhi_epi16(t[2], t[3]);
1958   u[4] = _mm_unpacklo_epi16(t[9], t[14]);
1959   u[5] = _mm_unpackhi_epi16(t[9], t[14]);
1960   u[6] = _mm_unpacklo_epi16(t[10], t[13]);
1961   u[7] = _mm_unpackhi_epi16(t[10], t[13]);
1962 
1963   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
1964   v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
1965   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1966   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1967   v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
1968   v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
1969   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1970   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1971   v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
1972   v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
1973   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
1974   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
1975   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
1976   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
1977   v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
1978   v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
1979 
1980   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1981   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1982   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1983   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1984   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1985   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1986   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1987   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1988   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1989   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1990   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1991   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1992   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1993   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1994   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1995   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1996 
1997   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1998   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1999   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2000   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2001   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2002   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2003   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2004   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2005   u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
2006   u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
2007   u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
2008   u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
2009   u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
2010   u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
2011   u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
2012   u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
2013 
2014   s[0] = _mm_packs_epi32(u[0], u[1]);
2015   s[1] = _mm_packs_epi32(u[2], u[3]);
2016   s[2] = _mm_packs_epi32(u[4], u[5]);
2017   s[3] = _mm_packs_epi32(u[6], u[7]);
2018   s[4] = _mm_add_epi16(t[4], t[5]);
2019   s[5] = _mm_sub_epi16(t[4], t[5]);
2020   s[6] = _mm_sub_epi16(t[7], t[6]);
2021   s[7] = _mm_add_epi16(t[6], t[7]);
2022   s[8] = t[8];
2023   s[15] = t[15];
2024   s[9]  = _mm_packs_epi32(u[8], u[9]);
2025   s[14] = _mm_packs_epi32(u[10], u[11]);
2026   s[10] = _mm_packs_epi32(u[12], u[13]);
2027   s[13] = _mm_packs_epi32(u[14], u[15]);
2028   s[11] = t[11];
2029   s[12] = t[12];
2030 
2031   // stage 5
2032   t[0] = _mm_add_epi16(s[0], s[3]);
2033   t[1] = _mm_add_epi16(s[1], s[2]);
2034   t[2] = _mm_sub_epi16(s[1], s[2]);
2035   t[3] = _mm_sub_epi16(s[0], s[3]);
2036   t[4] = s[4];
2037   t[7] = s[7];
2038 
2039   u[0] = _mm_unpacklo_epi16(s[5], s[6]);
2040   u[1] = _mm_unpackhi_epi16(s[5], s[6]);
2041   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2042   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2043   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2044   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2045   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2046   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2047   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2048   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2049   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2050   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2051   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2052   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2053   t[5] = _mm_packs_epi32(u[0], u[1]);
2054   t[6] = _mm_packs_epi32(u[2], u[3]);
2055 
2056   t[8] = _mm_add_epi16(s[8], s[11]);
2057   t[9] = _mm_add_epi16(s[9], s[10]);
2058   t[10] = _mm_sub_epi16(s[9], s[10]);
2059   t[11] = _mm_sub_epi16(s[8], s[11]);
2060   t[12] = _mm_sub_epi16(s[15], s[12]);
2061   t[13] = _mm_sub_epi16(s[14], s[13]);
2062   t[14] = _mm_add_epi16(s[13], s[14]);
2063   t[15] = _mm_add_epi16(s[12], s[15]);
2064 
2065   // stage 6
2066   s[0] = _mm_add_epi16(t[0], t[7]);
2067   s[1] = _mm_add_epi16(t[1], t[6]);
2068   s[2] = _mm_add_epi16(t[2], t[5]);
2069   s[3] = _mm_add_epi16(t[3], t[4]);
2070   s[4] = _mm_sub_epi16(t[3], t[4]);
2071   s[5] = _mm_sub_epi16(t[2], t[5]);
2072   s[6] = _mm_sub_epi16(t[1], t[6]);
2073   s[7] = _mm_sub_epi16(t[0], t[7]);
2074   s[8] = t[8];
2075   s[9] = t[9];
2076 
2077   u[0] = _mm_unpacklo_epi16(t[10], t[13]);
2078   u[1] = _mm_unpackhi_epi16(t[10], t[13]);
2079   u[2] = _mm_unpacklo_epi16(t[11], t[12]);
2080   u[3] = _mm_unpackhi_epi16(t[11], t[12]);
2081 
2082   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
2083   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
2084   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
2085   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
2086   v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
2087   v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
2088   v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
2089   v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
2090 
2091   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
2092   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
2093   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
2094   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
2095   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
2096   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
2097   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
2098   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
2099 
2100   u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2101   u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2102   u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2103   u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2104   u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2105   u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2106   u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2107   u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2108 
2109   s[10] = _mm_packs_epi32(u[0], u[1]);
2110   s[13] = _mm_packs_epi32(u[2], u[3]);
2111   s[11] = _mm_packs_epi32(u[4], u[5]);
2112   s[12] = _mm_packs_epi32(u[6], u[7]);
2113   s[14] = t[14];
2114   s[15] = t[15];
2115 
2116   // stage 7
2117   in[0] = _mm_add_epi16(s[0], s[15]);
2118   in[1] = _mm_add_epi16(s[1], s[14]);
2119   in[2] = _mm_add_epi16(s[2], s[13]);
2120   in[3] = _mm_add_epi16(s[3], s[12]);
2121   in[4] = _mm_add_epi16(s[4], s[11]);
2122   in[5] = _mm_add_epi16(s[5], s[10]);
2123   in[6] = _mm_add_epi16(s[6], s[9]);
2124   in[7] = _mm_add_epi16(s[7], s[8]);
2125   in[8] = _mm_sub_epi16(s[7], s[8]);
2126   in[9] = _mm_sub_epi16(s[6], s[9]);
2127   in[10] = _mm_sub_epi16(s[5], s[10]);
2128   in[11] = _mm_sub_epi16(s[4], s[11]);
2129   in[12] = _mm_sub_epi16(s[3], s[12]);
2130   in[13] = _mm_sub_epi16(s[2], s[13]);
2131   in[14] = _mm_sub_epi16(s[1], s[14]);
2132   in[15] = _mm_sub_epi16(s[0], s[15]);
2133 }
2134 
idct16_sse2(__m128i * in0,__m128i * in1)2135 void idct16_sse2(__m128i *in0, __m128i *in1) {
2136   array_transpose_16x16(in0, in1);
2137   idct16_8col(in0);
2138   idct16_8col(in1);
2139 }
2140 
iadst16_sse2(__m128i * in0,__m128i * in1)2141 void iadst16_sse2(__m128i *in0, __m128i *in1) {
2142   array_transpose_16x16(in0, in1);
2143   iadst16_8col(in0);
2144   iadst16_8col(in1);
2145 }
2146 
vpx_idct16x16_10_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)2147 void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
2148                                int stride) {
2149   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
2150   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
2151   const __m128i zero = _mm_setzero_si128();
2152 
2153   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
2154   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
2155   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
2156   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
2157 
2158   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
2159   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
2160 
2161   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
2162   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
2163   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2164   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
2165   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
2166   const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
2167 
2168   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
2169   __m128i in[16], l[16];
2170   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
2171           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
2172           stp1_8_0, stp1_12_0;
2173   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
2174           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
2175   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2176   int i;
2177   // First 1-D inverse DCT
2178   // Load input data.
2179   in[0] = load_input_data(input);
2180   in[1] = load_input_data(input + 8 * 2);
2181   in[2] = load_input_data(input + 8 * 4);
2182   in[3] = load_input_data(input + 8 * 6);
2183 
2184   TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
2185 
2186   // Stage2
2187   {
2188     const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
2189     const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
2190 
2191     tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
2192     tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
2193     tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
2194     tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
2195 
2196     tmp0 = _mm_add_epi32(tmp0, rounding);
2197     tmp2 = _mm_add_epi32(tmp2, rounding);
2198     tmp5 = _mm_add_epi32(tmp5, rounding);
2199     tmp7 = _mm_add_epi32(tmp7, rounding);
2200 
2201     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2202     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2203     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2204     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2205 
2206     stp2_8  = _mm_packs_epi32(tmp0, tmp2);
2207     stp2_11 = _mm_packs_epi32(tmp5, tmp7);
2208   }
2209 
2210   // Stage3
2211   {
2212     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
2213 
2214     tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
2215     tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
2216 
2217     tmp0 = _mm_add_epi32(tmp0, rounding);
2218     tmp2 = _mm_add_epi32(tmp2, rounding);
2219     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2220     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2221 
2222     stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
2223     stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
2224 
2225     stp1_4 = _mm_packs_epi32(tmp0, tmp2);
2226   }
2227 
2228   // Stage4
2229   {
2230     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
2231     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
2232     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
2233 
2234     tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
2235     tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
2236     tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
2237     tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
2238     tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
2239     tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
2240 
2241     tmp0 = _mm_add_epi32(tmp0, rounding);
2242     tmp2 = _mm_add_epi32(tmp2, rounding);
2243     tmp1 = _mm_add_epi32(tmp1, rounding);
2244     tmp3 = _mm_add_epi32(tmp3, rounding);
2245     tmp5 = _mm_add_epi32(tmp5, rounding);
2246     tmp7 = _mm_add_epi32(tmp7, rounding);
2247 
2248     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2249     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2250     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2251     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2252     tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
2253     tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
2254 
2255     stp1_0 = _mm_packs_epi32(tmp0, tmp0);
2256     stp1_1 = _mm_packs_epi32(tmp2, tmp2);
2257     stp2_9 = _mm_packs_epi32(tmp1, tmp3);
2258     stp2_10 = _mm_packs_epi32(tmp5, tmp7);
2259 
2260     stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
2261   }
2262 
2263   // Stage5 and Stage6
2264   {
2265     tmp0 = _mm_add_epi16(stp2_8, stp2_11);
2266     tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
2267     tmp2 = _mm_add_epi16(stp2_9, stp2_10);
2268     tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
2269 
2270     stp1_9  = _mm_unpacklo_epi64(tmp2, zero);
2271     stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
2272     stp1_8  = _mm_unpacklo_epi64(tmp0, zero);
2273     stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
2274 
2275     stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
2276     stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
2277     stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
2278     stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
2279   }
2280 
2281   // Stage6
2282   {
2283     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
2284     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
2285     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
2286 
2287     tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
2288     tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
2289     tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
2290     tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
2291     tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
2292     tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
2293 
2294     tmp1 = _mm_add_epi32(tmp1, rounding);
2295     tmp3 = _mm_add_epi32(tmp3, rounding);
2296     tmp0 = _mm_add_epi32(tmp0, rounding);
2297     tmp2 = _mm_add_epi32(tmp2, rounding);
2298     tmp4 = _mm_add_epi32(tmp4, rounding);
2299     tmp6 = _mm_add_epi32(tmp6, rounding);
2300 
2301     tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
2302     tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
2303     tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2304     tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
2305     tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
2306     tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
2307 
2308     stp1_6 = _mm_packs_epi32(tmp3, tmp1);
2309 
2310     stp2_10 = _mm_packs_epi32(tmp0, zero);
2311     stp2_13 = _mm_packs_epi32(tmp2, zero);
2312     stp2_11 = _mm_packs_epi32(tmp4, zero);
2313     stp2_12 = _mm_packs_epi32(tmp6, zero);
2314 
2315     tmp0 = _mm_add_epi16(stp1_0, stp1_4);
2316     tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
2317     tmp2 = _mm_add_epi16(stp1_1, stp1_6);
2318     tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
2319 
2320     stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
2321     stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
2322     stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
2323     stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
2324     stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
2325     stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
2326     stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
2327     stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
2328   }
2329 
2330   // Stage7. Left 8x16 only.
2331   l[0] = _mm_add_epi16(stp2_0, stp1_15);
2332   l[1] = _mm_add_epi16(stp2_1, stp1_14);
2333   l[2] = _mm_add_epi16(stp2_2, stp2_13);
2334   l[3] = _mm_add_epi16(stp2_3, stp2_12);
2335   l[4] = _mm_add_epi16(stp2_4, stp2_11);
2336   l[5] = _mm_add_epi16(stp2_5, stp2_10);
2337   l[6] = _mm_add_epi16(stp2_6, stp1_9);
2338   l[7] = _mm_add_epi16(stp2_7, stp1_8);
2339   l[8] = _mm_sub_epi16(stp2_7, stp1_8);
2340   l[9] = _mm_sub_epi16(stp2_6, stp1_9);
2341   l[10] = _mm_sub_epi16(stp2_5, stp2_10);
2342   l[11] = _mm_sub_epi16(stp2_4, stp2_11);
2343   l[12] = _mm_sub_epi16(stp2_3, stp2_12);
2344   l[13] = _mm_sub_epi16(stp2_2, stp2_13);
2345   l[14] = _mm_sub_epi16(stp2_1, stp1_14);
2346   l[15] = _mm_sub_epi16(stp2_0, stp1_15);
2347 
2348   // Second 1-D inverse transform, performed per 8x16 block
2349   for (i = 0; i < 2; i++) {
2350     int j;
2351     array_transpose_4X8(l + 8 * i, in);
2352 
2353     IDCT16_10
2354 
2355     // Stage7
2356     in[0] = _mm_add_epi16(stp2_0, stp1_15);
2357     in[1] = _mm_add_epi16(stp2_1, stp1_14);
2358     in[2] = _mm_add_epi16(stp2_2, stp2_13);
2359     in[3] = _mm_add_epi16(stp2_3, stp2_12);
2360     in[4] = _mm_add_epi16(stp2_4, stp2_11);
2361     in[5] = _mm_add_epi16(stp2_5, stp2_10);
2362     in[6] = _mm_add_epi16(stp2_6, stp1_9);
2363     in[7] = _mm_add_epi16(stp2_7, stp1_8);
2364     in[8] = _mm_sub_epi16(stp2_7, stp1_8);
2365     in[9] = _mm_sub_epi16(stp2_6, stp1_9);
2366     in[10] = _mm_sub_epi16(stp2_5, stp2_10);
2367     in[11] = _mm_sub_epi16(stp2_4, stp2_11);
2368     in[12] = _mm_sub_epi16(stp2_3, stp2_12);
2369     in[13] = _mm_sub_epi16(stp2_2, stp2_13);
2370     in[14] = _mm_sub_epi16(stp2_1, stp1_14);
2371     in[15] = _mm_sub_epi16(stp2_0, stp1_15);
2372 
2373     for (j = 0; j < 16; ++j) {
2374       // Final rounding and shift
2375       in[j] = _mm_adds_epi16(in[j], final_rounding);
2376       in[j] = _mm_srai_epi16(in[j], 6);
2377       RECON_AND_STORE(dest + j * stride, in[j]);
2378     }
2379 
2380     dest += 8;
2381   }
2382 }
2383 
2384 #define LOAD_DQCOEFF(reg, input) \
2385   {  \
2386     reg = load_input_data(input); \
2387     input += 8; \
2388   }  \
2389 
2390 #define IDCT32_34 \
2391 /* Stage1 */ \
2392 { \
2393   const __m128i zero = _mm_setzero_si128();\
2394   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2395   const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2396   \
2397   const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
2398   const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2399   \
2400   const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2401   const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2402   \
2403   const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2404   const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2405   \
2406   MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
2407                          stg1_1, stp1_16, stp1_31); \
2408   MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
2409                          stg1_7, stp1_19, stp1_28); \
2410   MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
2411                          stg1_9, stp1_20, stp1_27); \
2412   MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
2413                          stg1_15, stp1_23, stp1_24); \
2414 } \
2415 \
2416 /* Stage2 */ \
2417 { \
2418   const __m128i zero = _mm_setzero_si128();\
2419   const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2420   const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2421   \
2422   const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2423   const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2424   \
2425   MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
2426                          stg2_1, stp2_8, stp2_15); \
2427   MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
2428                          stg2_7, stp2_11, stp2_12); \
2429   \
2430   stp2_16 = stp1_16; \
2431   stp2_19 = stp1_19; \
2432   \
2433   stp2_20 = stp1_20; \
2434   stp2_23 = stp1_23; \
2435   \
2436   stp2_24 = stp1_24; \
2437   stp2_27 = stp1_27; \
2438   \
2439   stp2_28 = stp1_28; \
2440   stp2_31 = stp1_31; \
2441 } \
2442 \
2443 /* Stage3 */ \
2444 { \
2445   const __m128i zero = _mm_setzero_si128();\
2446   const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2447   const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2448   \
2449   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2450   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2451   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2452   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2453   \
2454   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2455   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2456   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2457   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2458   \
2459   MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
2460                          stg3_1, stp1_4, stp1_7); \
2461   \
2462   stp1_8 = stp2_8; \
2463   stp1_11 = stp2_11; \
2464   stp1_12 = stp2_12; \
2465   stp1_15 = stp2_15; \
2466   \
2467   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2468                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2469                          stp1_18, stp1_29) \
2470   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2471                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2472                          stp1_22, stp1_25) \
2473   \
2474   stp1_16 = stp2_16; \
2475   stp1_31 = stp2_31; \
2476   stp1_19 = stp2_19; \
2477   stp1_20 = stp2_20; \
2478   stp1_23 = stp2_23; \
2479   stp1_24 = stp2_24; \
2480   stp1_27 = stp2_27; \
2481   stp1_28 = stp2_28; \
2482 } \
2483 \
2484 /* Stage4 */ \
2485 { \
2486   const __m128i zero = _mm_setzero_si128();\
2487   const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2488   const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2489   \
2490   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2491   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2492   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2493   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2494   \
2495   MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
2496                          stg4_1, stp2_0, stp2_1); \
2497   \
2498   stp2_4 = stp1_4; \
2499   stp2_5 = stp1_4; \
2500   stp2_6 = stp1_7; \
2501   stp2_7 = stp1_7; \
2502   \
2503   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2504                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2505                          stp2_10, stp2_13) \
2506   \
2507   stp2_8 = stp1_8; \
2508   stp2_15 = stp1_15; \
2509   stp2_11 = stp1_11; \
2510   stp2_12 = stp1_12; \
2511   \
2512   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2513   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2514   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2515   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2516   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2517   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2518   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2519   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2520   \
2521   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2522   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2523   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2524   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2525   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2526   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2527   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2528   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2529 } \
2530 \
2531 /* Stage5 */ \
2532 { \
2533   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2534   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2535   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2536   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2537   \
2538   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2539   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2540   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2541   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2542   \
2543   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2544   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2545   \
2546   stp1_0 = stp2_0; \
2547   stp1_1 = stp2_1; \
2548   stp1_2 = stp2_1; \
2549   stp1_3 = stp2_0; \
2550   \
2551   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2552   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2553   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2554   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2555   \
2556   tmp0 = _mm_add_epi32(tmp0, rounding); \
2557   tmp1 = _mm_add_epi32(tmp1, rounding); \
2558   tmp2 = _mm_add_epi32(tmp2, rounding); \
2559   tmp3 = _mm_add_epi32(tmp3, rounding); \
2560   \
2561   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2562   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2563   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2564   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2565   \
2566   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2567   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2568   \
2569   stp1_4 = stp2_4; \
2570   stp1_7 = stp2_7; \
2571   \
2572   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2573   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2574   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2575   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2576   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2577   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2578   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2579   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2580   \
2581   stp1_16 = stp2_16; \
2582   stp1_17 = stp2_17; \
2583   \
2584   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2585                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2586                          stp1_19, stp1_28) \
2587   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2588                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2589                          stp1_21, stp1_26) \
2590   \
2591   stp1_22 = stp2_22; \
2592   stp1_23 = stp2_23; \
2593   stp1_24 = stp2_24; \
2594   stp1_25 = stp2_25; \
2595   stp1_30 = stp2_30; \
2596   stp1_31 = stp2_31; \
2597 } \
2598 \
2599 /* Stage6 */ \
2600 { \
2601   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2602   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2603   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2604   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2605   \
2606   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2607   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2608   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2609   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2610   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2611   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2612   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2613   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2614   \
2615   stp2_8 = stp1_8; \
2616   stp2_9 = stp1_9; \
2617   stp2_14 = stp1_14; \
2618   stp2_15 = stp1_15; \
2619   \
2620   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2621                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2622                          stp2_13, stp2_11, stp2_12) \
2623   \
2624   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2625   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2626   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2627   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2628   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2629   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2630   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2631   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2632   \
2633   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2634   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2635   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2636   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2637   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2638   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2639   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2640   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2641 } \
2642 \
2643 /* Stage7 */ \
2644 { \
2645   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2646   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2647   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2648   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2649   \
2650   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2651   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2652   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2653   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2654   \
2655   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2656   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2657   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2658   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2659   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2660   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2661   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2662   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2663   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2664   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2665   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2666   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2667   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2668   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2669   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2670   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2671   \
2672   stp1_16 = stp2_16; \
2673   stp1_17 = stp2_17; \
2674   stp1_18 = stp2_18; \
2675   stp1_19 = stp2_19; \
2676   \
2677   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2678                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
2679                          stp1_21, stp1_26) \
2680   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2681                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
2682                          stp1_23, stp1_24) \
2683   \
2684   stp1_28 = stp2_28; \
2685   stp1_29 = stp2_29; \
2686   stp1_30 = stp2_30; \
2687   stp1_31 = stp2_31; \
2688 }
2689 
2690 
2691 #define IDCT32 \
2692 /* Stage1 */ \
2693 { \
2694   const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2695   const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2696   const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2697   const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2698   \
2699   const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2700   const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2701   const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
2702   const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2703   \
2704   const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2705   const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2706   const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2707   const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2708   \
2709   const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2710   const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2711   const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2712   const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2713   \
2714   MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2715                          stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
2716                          stp1_17, stp1_30) \
2717   MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
2718                          stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
2719                          stp1_19, stp1_28) \
2720   MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2721                          stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2722                          stp1_21, stp1_26) \
2723   MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2724                          stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2725                          stp1_23, stp1_24) \
2726 } \
2727 \
2728 /* Stage2 */ \
2729 { \
2730   const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2731   const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2732   const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2733   const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2734   \
2735   const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2736   const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2737   const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2738   const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2739   \
2740   MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2741                          stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2742                          stp2_14) \
2743   MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2744                          stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
2745                          stp2_11, stp2_12) \
2746   \
2747   stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2748   stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2749   stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2750   stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2751   \
2752   stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2753   stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2754   stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2755   stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2756   \
2757   stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2758   stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2759   stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2760   stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2761   \
2762   stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2763   stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2764   stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2765   stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2766 } \
2767 \
2768 /* Stage3 */ \
2769 { \
2770   const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2771   const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2772   const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2773   const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2774   \
2775   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2776   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2777   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2778   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2779   \
2780   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2781   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2782   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2783   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2784   \
2785   MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2786                          stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2787                          stp1_6) \
2788   \
2789   stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2790   stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2791   stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2792   stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2793   stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2794   stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2795   stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2796   stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2797   \
2798   MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2799                          stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
2800                          stp1_18, stp1_29) \
2801   MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2802                          stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
2803                          stp1_22, stp1_25) \
2804   \
2805   stp1_16 = stp2_16; \
2806   stp1_31 = stp2_31; \
2807   stp1_19 = stp2_19; \
2808   stp1_20 = stp2_20; \
2809   stp1_23 = stp2_23; \
2810   stp1_24 = stp2_24; \
2811   stp1_27 = stp2_27; \
2812   stp1_28 = stp2_28; \
2813 } \
2814 \
2815 /* Stage4 */ \
2816 { \
2817   const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2818   const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2819   const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2820   const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2821   \
2822   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2823   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2824   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2825   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2826   \
2827   MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
2828                          stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
2829                          stp2_2, stp2_3) \
2830   \
2831   stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2832   stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2833   stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2834   stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2835   \
2836   MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2837                          stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
2838                          stp2_10, stp2_13) \
2839   \
2840   stp2_8 = stp1_8; \
2841   stp2_15 = stp1_15; \
2842   stp2_11 = stp1_11; \
2843   stp2_12 = stp1_12; \
2844   \
2845   stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2846   stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2847   stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2848   stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2849   stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2850   stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2851   stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2852   stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2853   \
2854   stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2855   stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2856   stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2857   stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2858   stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2859   stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2860   stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2861   stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2862 } \
2863 \
2864 /* Stage5 */ \
2865 { \
2866   const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2867   const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2868   const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2869   const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2870   \
2871   const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2872   const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2873   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2874   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2875   \
2876   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2877   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2878   \
2879   stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2880   stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2881   stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2882   stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2883   \
2884   tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2885   tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2886   tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2887   tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2888   \
2889   tmp0 = _mm_add_epi32(tmp0, rounding); \
2890   tmp1 = _mm_add_epi32(tmp1, rounding); \
2891   tmp2 = _mm_add_epi32(tmp2, rounding); \
2892   tmp3 = _mm_add_epi32(tmp3, rounding); \
2893   \
2894   tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2895   tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2896   tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2897   tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2898   \
2899   stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2900   stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2901   \
2902   stp1_4 = stp2_4; \
2903   stp1_7 = stp2_7; \
2904   \
2905   stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2906   stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2907   stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2908   stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2909   stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2910   stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2911   stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2912   stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2913   \
2914   stp1_16 = stp2_16; \
2915   stp1_17 = stp2_17; \
2916   \
2917   MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2918                          stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
2919                          stp1_19, stp1_28) \
2920   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2921                          stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
2922                          stp1_21, stp1_26) \
2923   \
2924   stp1_22 = stp2_22; \
2925   stp1_23 = stp2_23; \
2926   stp1_24 = stp2_24; \
2927   stp1_25 = stp2_25; \
2928   stp1_30 = stp2_30; \
2929   stp1_31 = stp2_31; \
2930 } \
2931 \
2932 /* Stage6 */ \
2933 { \
2934   const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2935   const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2936   const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2937   const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2938   \
2939   stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2940   stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2941   stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2942   stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2943   stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2944   stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2945   stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2946   stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2947   \
2948   stp2_8 = stp1_8; \
2949   stp2_9 = stp1_9; \
2950   stp2_14 = stp1_14; \
2951   stp2_15 = stp1_15; \
2952   \
2953   MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
2954                          stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
2955                          stp2_13, stp2_11, stp2_12) \
2956   \
2957   stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2958   stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2959   stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2960   stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2961   stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2962   stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2963   stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2964   stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2965   \
2966   stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2967   stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2968   stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2969   stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2970   stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2971   stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2972   stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2973   stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2974 } \
2975 \
2976 /* Stage7 */ \
2977 { \
2978   const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2979   const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2980   const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2981   const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2982   \
2983   const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2984   const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2985   const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2986   const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2987   \
2988   stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2989   stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2990   stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2991   stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2992   stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2993   stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2994   stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2995   stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2996   stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2997   stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2998   stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2999   stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
3000   stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
3001   stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
3002   stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
3003   stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
3004   \
3005   stp1_16 = stp2_16; \
3006   stp1_17 = stp2_17; \
3007   stp1_18 = stp2_18; \
3008   stp1_19 = stp2_19; \
3009   \
3010   MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
3011                          stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
3012                          stp1_21, stp1_26) \
3013   MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3014                          stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
3015                          stp1_23, stp1_24) \
3016   \
3017   stp1_28 = stp2_28; \
3018   stp1_29 = stp2_29; \
3019   stp1_30 = stp2_30; \
3020   stp1_31 = stp2_31; \
3021 }
3022 
3023 // Only upper-left 8x8 has non-zero coeff
vpx_idct32x32_34_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)3024 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
3025                                int stride) {
3026   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3027   const __m128i final_rounding = _mm_set1_epi16(1<<5);
3028 
3029   // idct constants for each stage
3030   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3031   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3032   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3033   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3034   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3035   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3036   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3037   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3038 
3039   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3040   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3041   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3042   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3043 
3044   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3045   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3046   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3047   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3048   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3049   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3050   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3051   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3052 
3053   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3054   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3055   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3056   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3057   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3058 
3059   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3060 
3061   __m128i in[32], col[32];
3062   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3063           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3064           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3065           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3066           stp1_30, stp1_31;
3067   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3068           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3069           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3070           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3071           stp2_30, stp2_31;
3072   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3073   int i;
3074 
3075   // Load input data. Only need to load the top left 8x8 block.
3076   in[0] = load_input_data(input);
3077   in[1] = load_input_data(input + 32);
3078   in[2] = load_input_data(input + 64);
3079   in[3] = load_input_data(input + 96);
3080   in[4] = load_input_data(input + 128);
3081   in[5] = load_input_data(input + 160);
3082   in[6] = load_input_data(input + 192);
3083   in[7] = load_input_data(input + 224);
3084 
3085   for (i = 8; i < 32; ++i) {
3086     in[i] = _mm_setzero_si128();
3087   }
3088 
3089   array_transpose_8x8(in, in);
3090   // TODO(hkuang): Following transposes are unnecessary. But remove them will
3091   // lead to performance drop on some devices.
3092   array_transpose_8x8(in + 8, in + 8);
3093   array_transpose_8x8(in + 16, in + 16);
3094   array_transpose_8x8(in + 24, in + 24);
3095 
3096   IDCT32_34
3097 
3098   // 1_D: Store 32 intermediate results for each 8x32 block.
3099   col[0] = _mm_add_epi16(stp1_0, stp1_31);
3100   col[1] = _mm_add_epi16(stp1_1, stp1_30);
3101   col[2] = _mm_add_epi16(stp1_2, stp1_29);
3102   col[3] = _mm_add_epi16(stp1_3, stp1_28);
3103   col[4] = _mm_add_epi16(stp1_4, stp1_27);
3104   col[5] = _mm_add_epi16(stp1_5, stp1_26);
3105   col[6] = _mm_add_epi16(stp1_6, stp1_25);
3106   col[7] = _mm_add_epi16(stp1_7, stp1_24);
3107   col[8] = _mm_add_epi16(stp1_8, stp1_23);
3108   col[9] = _mm_add_epi16(stp1_9, stp1_22);
3109   col[10] = _mm_add_epi16(stp1_10, stp1_21);
3110   col[11] = _mm_add_epi16(stp1_11, stp1_20);
3111   col[12] = _mm_add_epi16(stp1_12, stp1_19);
3112   col[13] = _mm_add_epi16(stp1_13, stp1_18);
3113   col[14] = _mm_add_epi16(stp1_14, stp1_17);
3114   col[15] = _mm_add_epi16(stp1_15, stp1_16);
3115   col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3116   col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3117   col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3118   col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3119   col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3120   col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3121   col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3122   col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3123   col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3124   col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3125   col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3126   col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3127   col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3128   col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3129   col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3130   col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3131   for (i = 0; i < 4; i++) {
3132     int j;
3133     const __m128i zero = _mm_setzero_si128();
3134     // Transpose 32x8 block to 8x32 block
3135     array_transpose_8x8(col + i * 8, in);
3136     IDCT32_34
3137 
3138     // 2_D: Calculate the results and store them to destination.
3139     in[0] = _mm_add_epi16(stp1_0, stp1_31);
3140     in[1] = _mm_add_epi16(stp1_1, stp1_30);
3141     in[2] = _mm_add_epi16(stp1_2, stp1_29);
3142     in[3] = _mm_add_epi16(stp1_3, stp1_28);
3143     in[4] = _mm_add_epi16(stp1_4, stp1_27);
3144     in[5] = _mm_add_epi16(stp1_5, stp1_26);
3145     in[6] = _mm_add_epi16(stp1_6, stp1_25);
3146     in[7] = _mm_add_epi16(stp1_7, stp1_24);
3147     in[8] = _mm_add_epi16(stp1_8, stp1_23);
3148     in[9] = _mm_add_epi16(stp1_9, stp1_22);
3149     in[10] = _mm_add_epi16(stp1_10, stp1_21);
3150     in[11] = _mm_add_epi16(stp1_11, stp1_20);
3151     in[12] = _mm_add_epi16(stp1_12, stp1_19);
3152     in[13] = _mm_add_epi16(stp1_13, stp1_18);
3153     in[14] = _mm_add_epi16(stp1_14, stp1_17);
3154     in[15] = _mm_add_epi16(stp1_15, stp1_16);
3155     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3156     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3157     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3158     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3159     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3160     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3161     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3162     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3163     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3164     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3165     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3166     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3167     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3168     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3169     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3170     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3171 
3172     for (j = 0; j < 32; ++j) {
3173       // Final rounding and shift
3174       in[j] = _mm_adds_epi16(in[j], final_rounding);
3175       in[j] = _mm_srai_epi16(in[j], 6);
3176       RECON_AND_STORE(dest + j * stride, in[j]);
3177     }
3178 
3179     dest += 8;
3180   }
3181 }
3182 
vpx_idct32x32_1024_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)3183 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
3184                                  int stride) {
3185   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3186   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3187   const __m128i zero = _mm_setzero_si128();
3188 
3189   // idct constants for each stage
3190   const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3191   const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3192   const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3193   const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3194   const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3195   const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3196   const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3197   const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3198   const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3199   const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3200   const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3201   const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3202   const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3203   const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3204   const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3205   const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3206 
3207   const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3208   const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3209   const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3210   const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3211   const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3212   const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3213   const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3214   const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3215 
3216   const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3217   const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3218   const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3219   const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3220   const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3221   const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3222   const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3223   const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3224   const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3225   const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3226 
3227   const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3228   const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3229   const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3230   const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3231   const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3232   const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3233   const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3234 
3235   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3236 
3237   __m128i in[32], col[128], zero_idx[16];
3238   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3239           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3240           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3241           stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3242           stp1_30, stp1_31;
3243   __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3244           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3245           stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3246           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3247           stp2_30, stp2_31;
3248   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3249   int i, j, i32;
3250 
3251   for (i = 0; i < 4; i++) {
3252     i32 = (i << 5);
3253     // First 1-D idct
3254     // Load input data.
3255     LOAD_DQCOEFF(in[0], input);
3256     LOAD_DQCOEFF(in[8], input);
3257     LOAD_DQCOEFF(in[16], input);
3258     LOAD_DQCOEFF(in[24], input);
3259     LOAD_DQCOEFF(in[1], input);
3260     LOAD_DQCOEFF(in[9], input);
3261     LOAD_DQCOEFF(in[17], input);
3262     LOAD_DQCOEFF(in[25], input);
3263     LOAD_DQCOEFF(in[2], input);
3264     LOAD_DQCOEFF(in[10], input);
3265     LOAD_DQCOEFF(in[18], input);
3266     LOAD_DQCOEFF(in[26], input);
3267     LOAD_DQCOEFF(in[3], input);
3268     LOAD_DQCOEFF(in[11], input);
3269     LOAD_DQCOEFF(in[19], input);
3270     LOAD_DQCOEFF(in[27], input);
3271 
3272     LOAD_DQCOEFF(in[4], input);
3273     LOAD_DQCOEFF(in[12], input);
3274     LOAD_DQCOEFF(in[20], input);
3275     LOAD_DQCOEFF(in[28], input);
3276     LOAD_DQCOEFF(in[5], input);
3277     LOAD_DQCOEFF(in[13], input);
3278     LOAD_DQCOEFF(in[21], input);
3279     LOAD_DQCOEFF(in[29], input);
3280     LOAD_DQCOEFF(in[6], input);
3281     LOAD_DQCOEFF(in[14], input);
3282     LOAD_DQCOEFF(in[22], input);
3283     LOAD_DQCOEFF(in[30], input);
3284     LOAD_DQCOEFF(in[7], input);
3285     LOAD_DQCOEFF(in[15], input);
3286     LOAD_DQCOEFF(in[23], input);
3287     LOAD_DQCOEFF(in[31], input);
3288 
3289     // checking if all entries are zero
3290     zero_idx[0] = _mm_or_si128(in[0], in[1]);
3291     zero_idx[1] = _mm_or_si128(in[2], in[3]);
3292     zero_idx[2] = _mm_or_si128(in[4], in[5]);
3293     zero_idx[3] = _mm_or_si128(in[6], in[7]);
3294     zero_idx[4] = _mm_or_si128(in[8], in[9]);
3295     zero_idx[5] = _mm_or_si128(in[10], in[11]);
3296     zero_idx[6] = _mm_or_si128(in[12], in[13]);
3297     zero_idx[7] = _mm_or_si128(in[14], in[15]);
3298     zero_idx[8] = _mm_or_si128(in[16], in[17]);
3299     zero_idx[9] = _mm_or_si128(in[18], in[19]);
3300     zero_idx[10] = _mm_or_si128(in[20], in[21]);
3301     zero_idx[11] = _mm_or_si128(in[22], in[23]);
3302     zero_idx[12] = _mm_or_si128(in[24], in[25]);
3303     zero_idx[13] = _mm_or_si128(in[26], in[27]);
3304     zero_idx[14] = _mm_or_si128(in[28], in[29]);
3305     zero_idx[15] = _mm_or_si128(in[30], in[31]);
3306 
3307     zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3308     zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3309     zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3310     zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3311     zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3312     zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3313     zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3314     zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3315 
3316     zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3317     zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3318     zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3319     zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3320     zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3321     zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3322     zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3323 
3324     if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3325       col[i32 + 0] = _mm_setzero_si128();
3326       col[i32 + 1] = _mm_setzero_si128();
3327       col[i32 + 2] = _mm_setzero_si128();
3328       col[i32 + 3] = _mm_setzero_si128();
3329       col[i32 + 4] = _mm_setzero_si128();
3330       col[i32 + 5] = _mm_setzero_si128();
3331       col[i32 + 6] = _mm_setzero_si128();
3332       col[i32 + 7] = _mm_setzero_si128();
3333       col[i32 + 8] = _mm_setzero_si128();
3334       col[i32 + 9] = _mm_setzero_si128();
3335       col[i32 + 10] = _mm_setzero_si128();
3336       col[i32 + 11] = _mm_setzero_si128();
3337       col[i32 + 12] = _mm_setzero_si128();
3338       col[i32 + 13] = _mm_setzero_si128();
3339       col[i32 + 14] = _mm_setzero_si128();
3340       col[i32 + 15] = _mm_setzero_si128();
3341       col[i32 + 16] = _mm_setzero_si128();
3342       col[i32 + 17] = _mm_setzero_si128();
3343       col[i32 + 18] = _mm_setzero_si128();
3344       col[i32 + 19] = _mm_setzero_si128();
3345       col[i32 + 20] = _mm_setzero_si128();
3346       col[i32 + 21] = _mm_setzero_si128();
3347       col[i32 + 22] = _mm_setzero_si128();
3348       col[i32 + 23] = _mm_setzero_si128();
3349       col[i32 + 24] = _mm_setzero_si128();
3350       col[i32 + 25] = _mm_setzero_si128();
3351       col[i32 + 26] = _mm_setzero_si128();
3352       col[i32 + 27] = _mm_setzero_si128();
3353       col[i32 + 28] = _mm_setzero_si128();
3354       col[i32 + 29] = _mm_setzero_si128();
3355       col[i32 + 30] = _mm_setzero_si128();
3356       col[i32 + 31] = _mm_setzero_si128();
3357       continue;
3358     }
3359 
3360     // Transpose 32x8 block to 8x32 block
3361     array_transpose_8x8(in, in);
3362     array_transpose_8x8(in + 8, in + 8);
3363     array_transpose_8x8(in + 16, in + 16);
3364     array_transpose_8x8(in + 24, in + 24);
3365 
3366     IDCT32
3367 
3368     // 1_D: Store 32 intermediate results for each 8x32 block.
3369     col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3370     col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3371     col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3372     col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3373     col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3374     col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3375     col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3376     col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3377     col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3378     col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3379     col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3380     col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3381     col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3382     col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3383     col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3384     col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3385     col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3386     col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3387     col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3388     col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3389     col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3390     col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3391     col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3392     col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3393     col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3394     col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3395     col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3396     col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3397     col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3398     col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3399     col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3400     col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3401   }
3402   for (i = 0; i < 4; i++) {
3403     // Second 1-D idct
3404     j = i << 3;
3405 
3406     // Transpose 32x8 block to 8x32 block
3407     array_transpose_8x8(col + j, in);
3408     array_transpose_8x8(col + j + 32, in + 8);
3409     array_transpose_8x8(col + j + 64, in + 16);
3410     array_transpose_8x8(col + j + 96, in + 24);
3411 
3412     IDCT32
3413 
3414     // 2_D: Calculate the results and store them to destination.
3415     in[0] = _mm_add_epi16(stp1_0, stp1_31);
3416     in[1] = _mm_add_epi16(stp1_1, stp1_30);
3417     in[2] = _mm_add_epi16(stp1_2, stp1_29);
3418     in[3] = _mm_add_epi16(stp1_3, stp1_28);
3419     in[4] = _mm_add_epi16(stp1_4, stp1_27);
3420     in[5] = _mm_add_epi16(stp1_5, stp1_26);
3421     in[6] = _mm_add_epi16(stp1_6, stp1_25);
3422     in[7] = _mm_add_epi16(stp1_7, stp1_24);
3423     in[8] = _mm_add_epi16(stp1_8, stp1_23);
3424     in[9] = _mm_add_epi16(stp1_9, stp1_22);
3425     in[10] = _mm_add_epi16(stp1_10, stp1_21);
3426     in[11] = _mm_add_epi16(stp1_11, stp1_20);
3427     in[12] = _mm_add_epi16(stp1_12, stp1_19);
3428     in[13] = _mm_add_epi16(stp1_13, stp1_18);
3429     in[14] = _mm_add_epi16(stp1_14, stp1_17);
3430     in[15] = _mm_add_epi16(stp1_15, stp1_16);
3431     in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3432     in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3433     in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3434     in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3435     in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3436     in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3437     in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3438     in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3439     in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3440     in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3441     in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3442     in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3443     in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3444     in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3445     in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3446     in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3447 
3448     for (j = 0; j < 32; ++j) {
3449       // Final rounding and shift
3450       in[j] = _mm_adds_epi16(in[j], final_rounding);
3451       in[j] = _mm_srai_epi16(in[j], 6);
3452       RECON_AND_STORE(dest + j * stride, in[j]);
3453     }
3454 
3455     dest += 8;
3456   }
3457 }
3458 
vpx_idct32x32_1_add_sse2(const tran_low_t * input,uint8_t * dest,int stride)3459 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
3460                               int stride) {
3461   __m128i dc_value;
3462   const __m128i zero = _mm_setzero_si128();
3463   int a, j;
3464 
3465   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
3466   a = (int)dct_const_round_shift(a * cospi_16_64);
3467   a = ROUND_POWER_OF_TWO(a, 6);
3468 
3469   dc_value = _mm_set1_epi16(a);
3470 
3471   for (j = 0; j < 32; ++j) {
3472     RECON_AND_STORE(dest +  0 + j * stride, dc_value);
3473     RECON_AND_STORE(dest +  8 + j * stride, dc_value);
3474     RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3475     RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3476   }
3477 }
3478 
3479 #if CONFIG_VP9_HIGHBITDEPTH
clamp_high_sse2(__m128i value,int bd)3480 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3481   __m128i ubounded, retval;
3482   const __m128i zero = _mm_set1_epi16(0);
3483   const __m128i one = _mm_set1_epi16(1);
3484   const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
3485   ubounded = _mm_cmpgt_epi16(value, max);
3486   retval = _mm_andnot_si128(ubounded, value);
3487   ubounded = _mm_and_si128(ubounded, max);
3488   retval = _mm_or_si128(retval, ubounded);
3489   retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
3490   return retval;
3491 }
3492 
vpx_highbd_idct4x4_16_add_sse2(const tran_low_t * input,uint8_t * dest8,int stride,int bd)3493 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3494                                     int stride, int bd) {
3495   tran_low_t out[4 * 4];
3496   tran_low_t *outptr = out;
3497   int i, j;
3498   __m128i inptr[4];
3499   __m128i sign_bits[2];
3500   __m128i temp_mm, min_input, max_input;
3501   int test;
3502   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3503   int optimised_cols = 0;
3504   const __m128i zero = _mm_set1_epi16(0);
3505   const __m128i eight = _mm_set1_epi16(8);
3506   const __m128i max = _mm_set1_epi16(12043);
3507   const __m128i min = _mm_set1_epi16(-12043);
3508   // Load input into __m128i
3509   inptr[0] = _mm_loadu_si128((const __m128i *)input);
3510   inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3511   inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3512   inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
3513 
3514   // Pack to 16 bits
3515   inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3516   inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
3517 
3518   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3519   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3520   max_input = _mm_cmpgt_epi16(max_input, max);
3521   min_input = _mm_cmplt_epi16(min_input, min);
3522   temp_mm = _mm_or_si128(max_input, min_input);
3523   test = _mm_movemask_epi8(temp_mm);
3524 
3525   if (!test) {
3526     // Do the row transform
3527     idct4_sse2(inptr);
3528 
3529     // Check the min & max values
3530     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3531     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3532     max_input = _mm_cmpgt_epi16(max_input, max);
3533     min_input = _mm_cmplt_epi16(min_input, min);
3534     temp_mm = _mm_or_si128(max_input, min_input);
3535     test = _mm_movemask_epi8(temp_mm);
3536 
3537     if (test) {
3538       transpose_4x4(inptr);
3539       sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3540       sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3541       inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3542       inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3543       inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3544       inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3545       _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3546       _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3547       _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3548       _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3549     } else {
3550       // Set to use the optimised transform for the column
3551       optimised_cols = 1;
3552     }
3553   } else {
3554     // Run the un-optimised row transform
3555     for (i = 0; i < 4; ++i) {
3556       vpx_highbd_idct4_c(input, outptr, bd);
3557       input += 4;
3558       outptr += 4;
3559     }
3560   }
3561 
3562   if (optimised_cols) {
3563     idct4_sse2(inptr);
3564 
3565     // Final round and shift
3566     inptr[0] = _mm_add_epi16(inptr[0], eight);
3567     inptr[1] = _mm_add_epi16(inptr[1], eight);
3568 
3569     inptr[0] = _mm_srai_epi16(inptr[0], 4);
3570     inptr[1] = _mm_srai_epi16(inptr[1], 4);
3571 
3572     // Reconstruction and Store
3573     {
3574       __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3575       __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3576       d0 = _mm_unpacklo_epi64(
3577           d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3578       d2 = _mm_unpacklo_epi64(
3579           d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3580       d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3581       d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
3582       // store input0
3583       _mm_storel_epi64((__m128i *)dest, d0);
3584       // store input1
3585       d0 = _mm_srli_si128(d0, 8);
3586       _mm_storel_epi64((__m128i *)(dest + stride), d0);
3587       // store input2
3588       _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3589       // store input3
3590       d2 = _mm_srli_si128(d2, 8);
3591       _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3592     }
3593   } else {
3594     // Run the un-optimised column transform
3595     tran_low_t temp_in[4], temp_out[4];
3596     // Columns
3597     for (i = 0; i < 4; ++i) {
3598       for (j = 0; j < 4; ++j)
3599         temp_in[j] = out[j * 4 + i];
3600       vpx_highbd_idct4_c(temp_in, temp_out, bd);
3601       for (j = 0; j < 4; ++j) {
3602         dest[j * stride + i] = highbd_clip_pixel_add(
3603             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
3604       }
3605     }
3606   }
3607 }
3608 
vpx_highbd_idct8x8_64_add_sse2(const tran_low_t * input,uint8_t * dest8,int stride,int bd)3609 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3610                                     int stride, int bd) {
3611   tran_low_t out[8 * 8];
3612   tran_low_t *outptr = out;
3613   int i, j, test;
3614   __m128i inptr[8];
3615   __m128i min_input, max_input, temp1, temp2, sign_bits;
3616   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3617   const __m128i zero = _mm_set1_epi16(0);
3618   const __m128i sixteen = _mm_set1_epi16(16);
3619   const __m128i max = _mm_set1_epi16(6201);
3620   const __m128i min = _mm_set1_epi16(-6201);
3621   int optimised_cols = 0;
3622 
3623   // Load input into __m128i & pack to 16 bits
3624   for (i = 0; i < 8; i++) {
3625     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3626     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3627     inptr[i] = _mm_packs_epi32(temp1, temp2);
3628   }
3629 
3630   // Find the min & max for the row transform
3631   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3632   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3633   for (i = 2; i < 8; i++) {
3634     max_input = _mm_max_epi16(max_input, inptr[i]);
3635     min_input = _mm_min_epi16(min_input, inptr[i]);
3636   }
3637   max_input = _mm_cmpgt_epi16(max_input, max);
3638   min_input = _mm_cmplt_epi16(min_input, min);
3639   temp1 = _mm_or_si128(max_input, min_input);
3640   test = _mm_movemask_epi8(temp1);
3641 
3642   if (!test) {
3643     // Do the row transform
3644     idct8_sse2(inptr);
3645 
3646     // Find the min & max for the column transform
3647     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3648     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3649     for (i = 2; i < 8; i++) {
3650       max_input = _mm_max_epi16(max_input, inptr[i]);
3651       min_input = _mm_min_epi16(min_input, inptr[i]);
3652     }
3653     max_input = _mm_cmpgt_epi16(max_input, max);
3654     min_input = _mm_cmplt_epi16(min_input, min);
3655     temp1 = _mm_or_si128(max_input, min_input);
3656     test = _mm_movemask_epi8(temp1);
3657 
3658     if (test) {
3659       array_transpose_8x8(inptr, inptr);
3660       for (i = 0; i < 8; i++) {
3661         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3662         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3663         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3664         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3665         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3666       }
3667     } else {
3668       // Set to use the optimised transform for the column
3669       optimised_cols = 1;
3670     }
3671   } else {
3672     // Run the un-optimised row transform
3673     for (i = 0; i < 8; ++i) {
3674       vpx_highbd_idct8_c(input, outptr, bd);
3675       input += 8;
3676       outptr += 8;
3677     }
3678   }
3679 
3680   if (optimised_cols) {
3681     idct8_sse2(inptr);
3682 
3683     // Final round & shift and Reconstruction and Store
3684     {
3685       __m128i d[8];
3686       for (i = 0; i < 8; i++) {
3687         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3688         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3689         inptr[i] = _mm_srai_epi16(inptr[i], 5);
3690         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3691         // Store
3692         _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3693       }
3694     }
3695   } else {
3696     // Run the un-optimised column transform
3697     tran_low_t temp_in[8], temp_out[8];
3698     for (i = 0; i < 8; ++i) {
3699       for (j = 0; j < 8; ++j)
3700         temp_in[j] = out[j * 8 + i];
3701       vpx_highbd_idct8_c(temp_in, temp_out, bd);
3702       for (j = 0; j < 8; ++j) {
3703         dest[j * stride + i] = highbd_clip_pixel_add(
3704             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3705       }
3706     }
3707   }
3708 }
3709 
vpx_highbd_idct8x8_10_add_sse2(const tran_low_t * input,uint8_t * dest8,int stride,int bd)3710 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3711                                     int stride, int bd) {
3712   tran_low_t out[8 * 8] = { 0 };
3713   tran_low_t *outptr = out;
3714   int i, j, test;
3715   __m128i inptr[8];
3716   __m128i min_input, max_input, temp1, temp2, sign_bits;
3717   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3718   const __m128i zero = _mm_set1_epi16(0);
3719   const __m128i sixteen = _mm_set1_epi16(16);
3720   const __m128i max = _mm_set1_epi16(6201);
3721   const __m128i min = _mm_set1_epi16(-6201);
3722   int optimised_cols = 0;
3723 
3724   // Load input into __m128i & pack to 16 bits
3725   for (i = 0; i < 8; i++) {
3726     temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3727     temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3728     inptr[i] = _mm_packs_epi32(temp1, temp2);
3729   }
3730 
3731   // Find the min & max for the row transform
3732   // only first 4 row has non-zero coefs
3733   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3734   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3735   for (i = 2; i < 4; i++) {
3736     max_input = _mm_max_epi16(max_input, inptr[i]);
3737     min_input = _mm_min_epi16(min_input, inptr[i]);
3738   }
3739   max_input = _mm_cmpgt_epi16(max_input, max);
3740   min_input = _mm_cmplt_epi16(min_input, min);
3741   temp1 = _mm_or_si128(max_input, min_input);
3742   test = _mm_movemask_epi8(temp1);
3743 
3744   if (!test) {
3745     // Do the row transform
3746     idct8_sse2(inptr);
3747 
3748     // Find the min & max for the column transform
3749     // N.B. Only first 4 cols contain non-zero coeffs
3750     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3751     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3752     for (i = 2; i < 8; i++) {
3753       max_input = _mm_max_epi16(max_input, inptr[i]);
3754       min_input = _mm_min_epi16(min_input, inptr[i]);
3755     }
3756     max_input = _mm_cmpgt_epi16(max_input, max);
3757     min_input = _mm_cmplt_epi16(min_input, min);
3758     temp1 = _mm_or_si128(max_input, min_input);
3759     test = _mm_movemask_epi8(temp1);
3760 
3761     if (test) {
3762       // Use fact only first 4 rows contain non-zero coeffs
3763       array_transpose_4X8(inptr, inptr);
3764       for (i = 0; i < 4; i++) {
3765         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3766         temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3767         temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3768         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3769         _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3770       }
3771     } else {
3772       // Set to use the optimised transform for the column
3773       optimised_cols = 1;
3774     }
3775   } else {
3776     // Run the un-optimised row transform
3777     for (i = 0; i < 4; ++i) {
3778       vpx_highbd_idct8_c(input, outptr, bd);
3779       input += 8;
3780       outptr += 8;
3781     }
3782   }
3783 
3784   if (optimised_cols) {
3785     idct8_sse2(inptr);
3786 
3787     // Final round & shift and Reconstruction and Store
3788     {
3789       __m128i d[8];
3790       for (i = 0; i < 8; i++) {
3791         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3792         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3793         inptr[i] = _mm_srai_epi16(inptr[i], 5);
3794         d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3795         // Store
3796         _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
3797       }
3798     }
3799   } else {
3800     // Run the un-optimised column transform
3801     tran_low_t temp_in[8], temp_out[8];
3802     for (i = 0; i < 8; ++i) {
3803       for (j = 0; j < 8; ++j)
3804         temp_in[j] = out[j * 8 + i];
3805       vpx_highbd_idct8_c(temp_in, temp_out, bd);
3806       for (j = 0; j < 8; ++j) {
3807         dest[j * stride + i] = highbd_clip_pixel_add(
3808             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
3809       }
3810     }
3811   }
3812 }
3813 
vpx_highbd_idct16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest8,int stride,int bd)3814 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3815                                        int stride, int bd) {
3816   tran_low_t out[16 * 16];
3817   tran_low_t *outptr = out;
3818   int i, j, test;
3819   __m128i inptr[32];
3820   __m128i min_input, max_input, temp1, temp2, sign_bits;
3821   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3822   const __m128i zero = _mm_set1_epi16(0);
3823   const __m128i rounding = _mm_set1_epi16(32);
3824   const __m128i max = _mm_set1_epi16(3155);
3825   const __m128i min = _mm_set1_epi16(-3155);
3826   int optimised_cols = 0;
3827 
3828   // Load input into __m128i & pack to 16 bits
3829   for (i = 0; i < 16; i++) {
3830     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3831     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3832     inptr[i] = _mm_packs_epi32(temp1, temp2);
3833     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3834     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3835     inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3836   }
3837 
3838   // Find the min & max for the row transform
3839   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3840   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3841   for (i = 2; i < 32; i++) {
3842     max_input = _mm_max_epi16(max_input, inptr[i]);
3843     min_input = _mm_min_epi16(min_input, inptr[i]);
3844   }
3845   max_input = _mm_cmpgt_epi16(max_input, max);
3846   min_input = _mm_cmplt_epi16(min_input, min);
3847   temp1 = _mm_or_si128(max_input, min_input);
3848   test = _mm_movemask_epi8(temp1);
3849 
3850   if (!test) {
3851     // Do the row transform
3852     idct16_sse2(inptr, inptr + 16);
3853 
3854     // Find the min & max for the column transform
3855     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3856     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3857     for (i = 2; i < 32; i++) {
3858       max_input = _mm_max_epi16(max_input, inptr[i]);
3859       min_input = _mm_min_epi16(min_input, inptr[i]);
3860     }
3861     max_input = _mm_cmpgt_epi16(max_input, max);
3862     min_input = _mm_cmplt_epi16(min_input, min);
3863     temp1 = _mm_or_si128(max_input, min_input);
3864     test = _mm_movemask_epi8(temp1);
3865 
3866     if (test) {
3867       array_transpose_16x16(inptr, inptr + 16);
3868       for (i = 0; i < 16; i++) {
3869         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3870         temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3871         temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3872         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3873         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3874         sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3875         temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3876         temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3877         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3878         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3879       }
3880     } else {
3881       // Set to use the optimised transform for the column
3882       optimised_cols = 1;
3883     }
3884   } else {
3885     // Run the un-optimised row transform
3886     for (i = 0; i < 16; ++i) {
3887       vpx_highbd_idct16_c(input, outptr, bd);
3888       input += 16;
3889       outptr += 16;
3890     }
3891   }
3892 
3893   if (optimised_cols) {
3894     idct16_sse2(inptr, inptr + 16);
3895 
3896     // Final round & shift and Reconstruction and Store
3897     {
3898       __m128i d[2];
3899       for (i = 0; i < 16; i++) {
3900         inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
3901         inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
3902         d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
3903         d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
3904         inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
3905         inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
3906         d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
3907         d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
3908         // Store
3909         _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
3910         _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
3911       }
3912     }
3913   } else {
3914     // Run the un-optimised column transform
3915     tran_low_t temp_in[16], temp_out[16];
3916     for (i = 0; i < 16; ++i) {
3917       for (j = 0; j < 16; ++j)
3918         temp_in[j] = out[j * 16 + i];
3919       vpx_highbd_idct16_c(temp_in, temp_out, bd);
3920       for (j = 0; j < 16; ++j) {
3921         dest[j * stride + i] = highbd_clip_pixel_add(
3922             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
3923       }
3924     }
3925   }
3926 }
3927 
vpx_highbd_idct16x16_10_add_sse2(const tran_low_t * input,uint8_t * dest8,int stride,int bd)3928 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3929                                       int stride, int bd) {
3930   tran_low_t out[16 * 16] = { 0 };
3931   tran_low_t *outptr = out;
3932   int i, j, test;
3933   __m128i inptr[32];
3934   __m128i min_input, max_input, temp1, temp2, sign_bits;
3935   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3936   const __m128i zero = _mm_set1_epi16(0);
3937   const __m128i rounding = _mm_set1_epi16(32);
3938   const __m128i max = _mm_set1_epi16(3155);
3939   const __m128i min = _mm_set1_epi16(-3155);
3940   int optimised_cols = 0;
3941 
3942   // Load input into __m128i & pack to 16 bits
3943   for (i = 0; i < 16; i++) {
3944     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3945     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3946     inptr[i] = _mm_packs_epi32(temp1, temp2);
3947     temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3948     temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3949     inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3950   }
3951 
3952   // Find the min & max for the row transform
3953   // Since all non-zero dct coefficients are in upper-left 4x4 area,
3954   // we only need to consider first 4 rows here.
3955   max_input = _mm_max_epi16(inptr[0], inptr[1]);
3956   min_input = _mm_min_epi16(inptr[0], inptr[1]);
3957   for (i = 2; i < 4; i++) {
3958     max_input = _mm_max_epi16(max_input, inptr[i]);
3959     min_input = _mm_min_epi16(min_input, inptr[i]);
3960   }
3961   max_input = _mm_cmpgt_epi16(max_input, max);
3962   min_input = _mm_cmplt_epi16(min_input, min);
3963   temp1 = _mm_or_si128(max_input, min_input);
3964   test = _mm_movemask_epi8(temp1);
3965 
3966   if (!test) {
3967     // Do the row transform (N.B. This transposes inptr)
3968     idct16_sse2(inptr, inptr + 16);
3969 
3970     // Find the min & max for the column transform
3971     // N.B. Only first 4 cols contain non-zero coeffs
3972     max_input = _mm_max_epi16(inptr[0], inptr[1]);
3973     min_input = _mm_min_epi16(inptr[0], inptr[1]);
3974     for (i = 2; i < 16; i++) {
3975       max_input = _mm_max_epi16(max_input, inptr[i]);
3976       min_input = _mm_min_epi16(min_input, inptr[i]);
3977     }
3978     max_input = _mm_cmpgt_epi16(max_input, max);
3979     min_input = _mm_cmplt_epi16(min_input, min);
3980     temp1 = _mm_or_si128(max_input, min_input);
3981     test = _mm_movemask_epi8(temp1);
3982 
3983     if (test) {
3984       // Use fact only first 4 rows contain non-zero coeffs
3985       array_transpose_8x8(inptr, inptr);
3986       array_transpose_8x8(inptr + 8, inptr + 16);
3987       for (i = 0; i < 4; i++) {
3988         sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3989         temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3990         temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3991         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3992         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3993         sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3994         temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3995         temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3996         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3997         _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3998       }
3999     } else {
4000       // Set to use the optimised transform for the column
4001       optimised_cols = 1;
4002     }
4003   } else {
4004     // Run the un-optimised row transform
4005     for (i = 0; i < 4; ++i) {
4006       vpx_highbd_idct16_c(input, outptr, bd);
4007       input += 16;
4008       outptr += 16;
4009     }
4010   }
4011 
4012   if (optimised_cols) {
4013     idct16_sse2(inptr, inptr + 16);
4014 
4015     // Final round & shift and Reconstruction and Store
4016     {
4017       __m128i d[2];
4018       for (i = 0; i < 16; i++) {
4019         inptr[i   ] = _mm_add_epi16(inptr[i   ], rounding);
4020         inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
4021         d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
4022         d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
4023         inptr[i   ] = _mm_srai_epi16(inptr[i   ], 6);
4024         inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
4025         d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i   ]), bd);
4026         d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
4027         // Store
4028         _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
4029         _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
4030       }
4031     }
4032   } else {
4033     // Run the un-optimised column transform
4034     tran_low_t temp_in[16], temp_out[16];
4035     for (i = 0; i < 16; ++i) {
4036       for (j = 0; j < 16; ++j)
4037         temp_in[j] = out[j * 16 + i];
4038       vpx_highbd_idct16_c(temp_in, temp_out, bd);
4039       for (j = 0; j < 16; ++j) {
4040         dest[j * stride + i] = highbd_clip_pixel_add(
4041             dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4042       }
4043     }
4044   }
4045 }
4046 #endif  // CONFIG_VP9_HIGHBITDEPTH
4047