1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>  // SSE2
13 
14 #include "aom_dsp/fwd_txfm.h"
15 #include "aom_dsp/txfm_common.h"
16 #include "aom_dsp/x86/txfm_common_sse2.h"
17 
18 // TODO(jingning) The high bit-depth version needs re-work for performance.
19 // The current SSE2 implementation also causes cross reference to the static
20 // functions in the C implementation file.
21 #if DCT_HIGH_BIT_DEPTH
22 #define ADD_EPI16 _mm_adds_epi16
23 #define SUB_EPI16 _mm_subs_epi16
24 #if FDCT32x32_HIGH_PRECISION
// C row pass for the 32x32 forward DCT (FDCT32x32_HIGH_PRECISION build).
// Reached through HIGH_FDCT32x32_2D_ROWS_C when the SSE2 code reports a
// 16-bit overflow.
//
// intermediate: 32x32 values, read as intermediate[r * 32 + c].
// out:          32x32 results, written as out[c * 32 + k].
//
// For every c in [0, 32) the 32 values intermediate[r * 32 + c] are gathered,
// passed to aom_fdct32() with a last argument of 0, and each result x is
// stored as (x + 1 + (x < 0)) >> 2.
void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
  int col;
  for (col = 0; col < 32; ++col) {
    tran_high_t column_in[32];
    tran_high_t column_out[32];
    tran_low_t *const dst = out + col * 32;
    int k;

    // Gather column `col` of the stride-32 intermediate buffer.
    for (k = 0; k < 32; ++k) {
      column_in[k] = intermediate[k * 32 + col];
    }

    aom_fdct32(column_in, column_out, 0);

    // Add 1 (plus 1 more for negative values), then shift right by 2.
    for (k = 0; k < 32; ++k) {
      const tran_high_t v = column_out[k];
      dst[k] = (tran_low_t)((v + 1 + (v < 0)) >> 2);
    }
  }
}
36 #define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c
37 #define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c
38 #else
// C row pass for the 32x32 forward DCT (build without
// FDCT32x32_HIGH_PRECISION). Reached through HIGH_FDCT32x32_2D_ROWS_C when
// the SSE2 code reports a 16-bit overflow.
//
// intermediate: 32x32 values, read as intermediate[r * 32 + c].
// out:          32x32 results, written as out[c * 32 + k].
//
// For every c in [0, 32) the 32 values intermediate[r * 32 + c] are gathered,
// passed to aom_fdct32() with a last argument of 1, and the results are
// stored unchanged apart from the cast to tran_low_t.
void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
  int col;
  for (col = 0; col < 32; ++col) {
    tran_high_t column_in[32];
    tran_high_t column_out[32];
    tran_low_t *const dst = out + col * 32;
    int k;

    // Gather column `col` of the stride-32 intermediate buffer.
    for (k = 0; k < 32; ++k) {
      column_in[k] = intermediate[k * 32 + col];
    }

    aom_fdct32(column_in, column_out, 1);

    // No extra scaling here: the transform output is stored as-is.
    for (k = 0; k < 32; ++k) {
      dst[k] = (tran_low_t)column_out[k];
    }
  }
}
48 #define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c
49 #define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c
50 #endif  // FDCT32x32_HIGH_PRECISION
51 #else
52 #define ADD_EPI16 _mm_add_epi16
53 #define SUB_EPI16 _mm_sub_epi16
54 #endif  // DCT_HIGH_BIT_DEPTH
55 
FDCT32x32_2D(const int16_t * input,tran_low_t * output_org,int stride)56 void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
57   // Calculate pre-multiplied strides
58   const int str1 = stride;
59   const int str2 = 2 * stride;
60   const int str3 = 2 * stride + str1;
61   // We need an intermediate buffer between passes.
62   DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
63   // Constants
64   //    When we use them, in one case, they are all the same. In all others
65   //    it's a pair of them that we need to repeat four times. This is done
66   //    by constructing the 32 bit constant corresponding to that pair.
67   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
68   const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
69   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
70   const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
71   const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
72   const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
73   const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
74   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
75   const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
76   const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
77   const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
78   const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
79   const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
80   const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
81   const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
82   const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
83   const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
84   const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
85   const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
86   const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
87   const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
88   const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
89   const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
90   const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
91   const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
92   const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
93   const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
94   const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
95   const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
96   const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
97   const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
98   const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
99   const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
100   const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
101   const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
102   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
103   const __m128i kZero = _mm_set1_epi16(0);
104   const __m128i kOne = _mm_set1_epi16(1);
105   // Do the two transform/transpose passes
106   int pass;
107 #if DCT_HIGH_BIT_DEPTH
108   int overflow;
109 #endif
110   for (pass = 0; pass < 2; ++pass) {
111     // We process eight columns (transposed rows in second pass) at a time.
112     int column_start;
113     for (column_start = 0; column_start < 32; column_start += 8) {
114       __m128i step1[32];
115       __m128i step2[32];
116       __m128i step3[32];
117       __m128i out[32];
118       // Stage 1
      // Note: even though all the loads below are aligned, using the aligned
      //       intrinsic makes the code slightly slower.
121       if (0 == pass) {
122         const int16_t *in = &input[column_start];
123         // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
124         // Note: the next four blocks could be in a loop. That would help the
125         //       instruction cache but is actually slower.
126         {
127           const int16_t *ina = in + 0 * str1;
128           const int16_t *inb = in + 31 * str1;
129           __m128i *step1a = &step1[0];
130           __m128i *step1b = &step1[31];
131           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
132           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
133           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
134           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
135           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
136           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
137           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
138           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
139           step1a[0] = _mm_add_epi16(ina0, inb0);
140           step1a[1] = _mm_add_epi16(ina1, inb1);
141           step1a[2] = _mm_add_epi16(ina2, inb2);
142           step1a[3] = _mm_add_epi16(ina3, inb3);
143           step1b[-3] = _mm_sub_epi16(ina3, inb3);
144           step1b[-2] = _mm_sub_epi16(ina2, inb2);
145           step1b[-1] = _mm_sub_epi16(ina1, inb1);
146           step1b[-0] = _mm_sub_epi16(ina0, inb0);
147           step1a[0] = _mm_slli_epi16(step1a[0], 2);
148           step1a[1] = _mm_slli_epi16(step1a[1], 2);
149           step1a[2] = _mm_slli_epi16(step1a[2], 2);
150           step1a[3] = _mm_slli_epi16(step1a[3], 2);
151           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
152           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
153           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
154           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
155         }
156         {
157           const int16_t *ina = in + 4 * str1;
158           const int16_t *inb = in + 27 * str1;
159           __m128i *step1a = &step1[4];
160           __m128i *step1b = &step1[27];
161           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
162           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
163           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
164           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
165           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
166           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
167           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
168           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
169           step1a[0] = _mm_add_epi16(ina0, inb0);
170           step1a[1] = _mm_add_epi16(ina1, inb1);
171           step1a[2] = _mm_add_epi16(ina2, inb2);
172           step1a[3] = _mm_add_epi16(ina3, inb3);
173           step1b[-3] = _mm_sub_epi16(ina3, inb3);
174           step1b[-2] = _mm_sub_epi16(ina2, inb2);
175           step1b[-1] = _mm_sub_epi16(ina1, inb1);
176           step1b[-0] = _mm_sub_epi16(ina0, inb0);
177           step1a[0] = _mm_slli_epi16(step1a[0], 2);
178           step1a[1] = _mm_slli_epi16(step1a[1], 2);
179           step1a[2] = _mm_slli_epi16(step1a[2], 2);
180           step1a[3] = _mm_slli_epi16(step1a[3], 2);
181           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
182           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
183           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
184           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
185         }
186         {
187           const int16_t *ina = in + 8 * str1;
188           const int16_t *inb = in + 23 * str1;
189           __m128i *step1a = &step1[8];
190           __m128i *step1b = &step1[23];
191           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
192           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
193           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
194           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
195           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
196           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
197           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
198           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
199           step1a[0] = _mm_add_epi16(ina0, inb0);
200           step1a[1] = _mm_add_epi16(ina1, inb1);
201           step1a[2] = _mm_add_epi16(ina2, inb2);
202           step1a[3] = _mm_add_epi16(ina3, inb3);
203           step1b[-3] = _mm_sub_epi16(ina3, inb3);
204           step1b[-2] = _mm_sub_epi16(ina2, inb2);
205           step1b[-1] = _mm_sub_epi16(ina1, inb1);
206           step1b[-0] = _mm_sub_epi16(ina0, inb0);
207           step1a[0] = _mm_slli_epi16(step1a[0], 2);
208           step1a[1] = _mm_slli_epi16(step1a[1], 2);
209           step1a[2] = _mm_slli_epi16(step1a[2], 2);
210           step1a[3] = _mm_slli_epi16(step1a[3], 2);
211           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
212           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
213           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
214           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
215         }
216         {
217           const int16_t *ina = in + 12 * str1;
218           const int16_t *inb = in + 19 * str1;
219           __m128i *step1a = &step1[12];
220           __m128i *step1b = &step1[19];
221           const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
222           const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
223           const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
224           const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
225           const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
226           const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
227           const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
228           const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
229           step1a[0] = _mm_add_epi16(ina0, inb0);
230           step1a[1] = _mm_add_epi16(ina1, inb1);
231           step1a[2] = _mm_add_epi16(ina2, inb2);
232           step1a[3] = _mm_add_epi16(ina3, inb3);
233           step1b[-3] = _mm_sub_epi16(ina3, inb3);
234           step1b[-2] = _mm_sub_epi16(ina2, inb2);
235           step1b[-1] = _mm_sub_epi16(ina1, inb1);
236           step1b[-0] = _mm_sub_epi16(ina0, inb0);
237           step1a[0] = _mm_slli_epi16(step1a[0], 2);
238           step1a[1] = _mm_slli_epi16(step1a[1], 2);
239           step1a[2] = _mm_slli_epi16(step1a[2], 2);
240           step1a[3] = _mm_slli_epi16(step1a[3], 2);
241           step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
242           step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
243           step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
244           step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
245         }
246       } else {
247         int16_t *in = &intermediate[column_start];
248         // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
249         // Note: using the same approach as above to have common offset is
250         //       counter-productive as all offsets can be calculated at compile
251         //       time.
252         // Note: the next four blocks could be in a loop. That would help the
253         //       instruction cache but is actually slower.
254         {
255           __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
256           __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
257           __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
258           __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
259           __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
260           __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
261           __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
262           __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
263           step1[0] = ADD_EPI16(in00, in31);
264           step1[1] = ADD_EPI16(in01, in30);
265           step1[2] = ADD_EPI16(in02, in29);
266           step1[3] = ADD_EPI16(in03, in28);
267           step1[28] = SUB_EPI16(in03, in28);
268           step1[29] = SUB_EPI16(in02, in29);
269           step1[30] = SUB_EPI16(in01, in30);
270           step1[31] = SUB_EPI16(in00, in31);
271 #if DCT_HIGH_BIT_DEPTH
272           overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
273                                              &step1[3], &step1[28], &step1[29],
274                                              &step1[30], &step1[31]);
275           if (overflow) {
276             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
277             return;
278           }
279 #endif  // DCT_HIGH_BIT_DEPTH
280         }
281         {
282           __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
283           __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
284           __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
285           __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
286           __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
287           __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
288           __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
289           __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
290           step1[4] = ADD_EPI16(in04, in27);
291           step1[5] = ADD_EPI16(in05, in26);
292           step1[6] = ADD_EPI16(in06, in25);
293           step1[7] = ADD_EPI16(in07, in24);
294           step1[24] = SUB_EPI16(in07, in24);
295           step1[25] = SUB_EPI16(in06, in25);
296           step1[26] = SUB_EPI16(in05, in26);
297           step1[27] = SUB_EPI16(in04, in27);
298 #if DCT_HIGH_BIT_DEPTH
299           overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
300                                              &step1[7], &step1[24], &step1[25],
301                                              &step1[26], &step1[27]);
302           if (overflow) {
303             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
304             return;
305           }
306 #endif  // DCT_HIGH_BIT_DEPTH
307         }
308         {
309           __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
310           __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
311           __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
312           __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
313           __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
314           __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
315           __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
316           __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
317           step1[8] = ADD_EPI16(in08, in23);
318           step1[9] = ADD_EPI16(in09, in22);
319           step1[10] = ADD_EPI16(in10, in21);
320           step1[11] = ADD_EPI16(in11, in20);
321           step1[20] = SUB_EPI16(in11, in20);
322           step1[21] = SUB_EPI16(in10, in21);
323           step1[22] = SUB_EPI16(in09, in22);
324           step1[23] = SUB_EPI16(in08, in23);
325 #if DCT_HIGH_BIT_DEPTH
326           overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
327                                              &step1[11], &step1[20], &step1[21],
328                                              &step1[22], &step1[23]);
329           if (overflow) {
330             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
331             return;
332           }
333 #endif  // DCT_HIGH_BIT_DEPTH
334         }
335         {
336           __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
337           __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
338           __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
339           __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
340           __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
341           __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
342           __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
343           __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
344           step1[12] = ADD_EPI16(in12, in19);
345           step1[13] = ADD_EPI16(in13, in18);
346           step1[14] = ADD_EPI16(in14, in17);
347           step1[15] = ADD_EPI16(in15, in16);
348           step1[16] = SUB_EPI16(in15, in16);
349           step1[17] = SUB_EPI16(in14, in17);
350           step1[18] = SUB_EPI16(in13, in18);
351           step1[19] = SUB_EPI16(in12, in19);
352 #if DCT_HIGH_BIT_DEPTH
353           overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
354                                              &step1[15], &step1[16], &step1[17],
355                                              &step1[18], &step1[19]);
356           if (overflow) {
357             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
358             return;
359           }
360 #endif  // DCT_HIGH_BIT_DEPTH
361         }
362       }
363       // Stage 2
364       {
365         step2[0] = ADD_EPI16(step1[0], step1[15]);
366         step2[1] = ADD_EPI16(step1[1], step1[14]);
367         step2[2] = ADD_EPI16(step1[2], step1[13]);
368         step2[3] = ADD_EPI16(step1[3], step1[12]);
369         step2[4] = ADD_EPI16(step1[4], step1[11]);
370         step2[5] = ADD_EPI16(step1[5], step1[10]);
371         step2[6] = ADD_EPI16(step1[6], step1[9]);
372         step2[7] = ADD_EPI16(step1[7], step1[8]);
373         step2[8] = SUB_EPI16(step1[7], step1[8]);
374         step2[9] = SUB_EPI16(step1[6], step1[9]);
375         step2[10] = SUB_EPI16(step1[5], step1[10]);
376         step2[11] = SUB_EPI16(step1[4], step1[11]);
377         step2[12] = SUB_EPI16(step1[3], step1[12]);
378         step2[13] = SUB_EPI16(step1[2], step1[13]);
379         step2[14] = SUB_EPI16(step1[1], step1[14]);
380         step2[15] = SUB_EPI16(step1[0], step1[15]);
381 #if DCT_HIGH_BIT_DEPTH
382         overflow = check_epi16_overflow_x16(
383             &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
384             &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
385             &step2[12], &step2[13], &step2[14], &step2[15]);
386         if (overflow) {
387           if (pass == 0)
388             HIGH_FDCT32x32_2D_C(input, output_org, stride);
389           else
390             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
391           return;
392         }
393 #endif  // DCT_HIGH_BIT_DEPTH
394       }
395       {
396         const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
397         const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
398         const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
399         const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
400         const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
401         const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
402         const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
403         const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
404         const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
405         const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
406         const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
407         const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
408         const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
409         const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
410         const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
411         const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
412         const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
413         const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
414         const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
415         const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
416         const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
417         const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
418         const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
419         const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
420         // dct_const_round_shift
421         const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
422         const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
423         const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
424         const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
425         const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
426         const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
427         const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
428         const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
429         const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
430         const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
431         const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
432         const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
433         const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
434         const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
435         const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
436         const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
437         const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
438         const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
439         const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
440         const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
441         const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
442         const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
443         const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
444         const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
445         const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
446         const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
447         const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
448         const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
449         const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
450         const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
451         const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
452         const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
453         // Combine
454         step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
455         step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
456         step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
457         step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
458         step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
459         step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
460         step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
461         step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
462 #if DCT_HIGH_BIT_DEPTH
463         overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
464                                            &step2[23], &step2[24], &step2[25],
465                                            &step2[26], &step2[27]);
466         if (overflow) {
467           if (pass == 0)
468             HIGH_FDCT32x32_2D_C(input, output_org, stride);
469           else
470             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
471           return;
472         }
473 #endif  // DCT_HIGH_BIT_DEPTH
474       }
475 
476 #if !FDCT32x32_HIGH_PRECISION
477       // dump the magnitude by half, hence the intermediate values are within
478       // the range of 16 bits.
479       if (1 == pass) {
480         __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
481         __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
482         __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
483         __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
484         __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
485         __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
486         __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
487         __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
488         __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
489         __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
490         __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
491         __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
492         __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
493         __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
494         __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
495         __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
496         __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
497         __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
498         __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
499         __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
500         __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
501         __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
502         __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
503         __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
504         __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
505         __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
506         __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
507         __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
508         __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
509         __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
510         __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
511         __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
512 
513         step2[0] = SUB_EPI16(step2[0], s3_00_0);
514         step2[1] = SUB_EPI16(step2[1], s3_01_0);
515         step2[2] = SUB_EPI16(step2[2], s3_02_0);
516         step2[3] = SUB_EPI16(step2[3], s3_03_0);
517         step2[4] = SUB_EPI16(step2[4], s3_04_0);
518         step2[5] = SUB_EPI16(step2[5], s3_05_0);
519         step2[6] = SUB_EPI16(step2[6], s3_06_0);
520         step2[7] = SUB_EPI16(step2[7], s3_07_0);
521         step2[8] = SUB_EPI16(step2[8], s2_08_0);
522         step2[9] = SUB_EPI16(step2[9], s2_09_0);
523         step2[10] = SUB_EPI16(step2[10], s3_10_0);
524         step2[11] = SUB_EPI16(step2[11], s3_11_0);
525         step2[12] = SUB_EPI16(step2[12], s3_12_0);
526         step2[13] = SUB_EPI16(step2[13], s3_13_0);
527         step2[14] = SUB_EPI16(step2[14], s2_14_0);
528         step2[15] = SUB_EPI16(step2[15], s2_15_0);
529         step1[16] = SUB_EPI16(step1[16], s3_16_0);
530         step1[17] = SUB_EPI16(step1[17], s3_17_0);
531         step1[18] = SUB_EPI16(step1[18], s3_18_0);
532         step1[19] = SUB_EPI16(step1[19], s3_19_0);
533         step2[20] = SUB_EPI16(step2[20], s3_20_0);
534         step2[21] = SUB_EPI16(step2[21], s3_21_0);
535         step2[22] = SUB_EPI16(step2[22], s3_22_0);
536         step2[23] = SUB_EPI16(step2[23], s3_23_0);
537         step2[24] = SUB_EPI16(step2[24], s3_24_0);
538         step2[25] = SUB_EPI16(step2[25], s3_25_0);
539         step2[26] = SUB_EPI16(step2[26], s3_26_0);
540         step2[27] = SUB_EPI16(step2[27], s3_27_0);
541         step1[28] = SUB_EPI16(step1[28], s3_28_0);
542         step1[29] = SUB_EPI16(step1[29], s3_29_0);
543         step1[30] = SUB_EPI16(step1[30], s3_30_0);
544         step1[31] = SUB_EPI16(step1[31], s3_31_0);
545 #if DCT_HIGH_BIT_DEPTH
546         overflow = check_epi16_overflow_x32(
547             &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
548             &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
549             &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
550             &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
551             &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
552             &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
553         if (overflow) {
554           HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
555           return;
556         }
557 #endif  // DCT_HIGH_BIT_DEPTH
558         step2[0] = _mm_add_epi16(step2[0], kOne);
559         step2[1] = _mm_add_epi16(step2[1], kOne);
560         step2[2] = _mm_add_epi16(step2[2], kOne);
561         step2[3] = _mm_add_epi16(step2[3], kOne);
562         step2[4] = _mm_add_epi16(step2[4], kOne);
563         step2[5] = _mm_add_epi16(step2[5], kOne);
564         step2[6] = _mm_add_epi16(step2[6], kOne);
565         step2[7] = _mm_add_epi16(step2[7], kOne);
566         step2[8] = _mm_add_epi16(step2[8], kOne);
567         step2[9] = _mm_add_epi16(step2[9], kOne);
568         step2[10] = _mm_add_epi16(step2[10], kOne);
569         step2[11] = _mm_add_epi16(step2[11], kOne);
570         step2[12] = _mm_add_epi16(step2[12], kOne);
571         step2[13] = _mm_add_epi16(step2[13], kOne);
572         step2[14] = _mm_add_epi16(step2[14], kOne);
573         step2[15] = _mm_add_epi16(step2[15], kOne);
574         step1[16] = _mm_add_epi16(step1[16], kOne);
575         step1[17] = _mm_add_epi16(step1[17], kOne);
576         step1[18] = _mm_add_epi16(step1[18], kOne);
577         step1[19] = _mm_add_epi16(step1[19], kOne);
578         step2[20] = _mm_add_epi16(step2[20], kOne);
579         step2[21] = _mm_add_epi16(step2[21], kOne);
580         step2[22] = _mm_add_epi16(step2[22], kOne);
581         step2[23] = _mm_add_epi16(step2[23], kOne);
582         step2[24] = _mm_add_epi16(step2[24], kOne);
583         step2[25] = _mm_add_epi16(step2[25], kOne);
584         step2[26] = _mm_add_epi16(step2[26], kOne);
585         step2[27] = _mm_add_epi16(step2[27], kOne);
586         step1[28] = _mm_add_epi16(step1[28], kOne);
587         step1[29] = _mm_add_epi16(step1[29], kOne);
588         step1[30] = _mm_add_epi16(step1[30], kOne);
589         step1[31] = _mm_add_epi16(step1[31], kOne);
590 
591         step2[0] = _mm_srai_epi16(step2[0], 2);
592         step2[1] = _mm_srai_epi16(step2[1], 2);
593         step2[2] = _mm_srai_epi16(step2[2], 2);
594         step2[3] = _mm_srai_epi16(step2[3], 2);
595         step2[4] = _mm_srai_epi16(step2[4], 2);
596         step2[5] = _mm_srai_epi16(step2[5], 2);
597         step2[6] = _mm_srai_epi16(step2[6], 2);
598         step2[7] = _mm_srai_epi16(step2[7], 2);
599         step2[8] = _mm_srai_epi16(step2[8], 2);
600         step2[9] = _mm_srai_epi16(step2[9], 2);
601         step2[10] = _mm_srai_epi16(step2[10], 2);
602         step2[11] = _mm_srai_epi16(step2[11], 2);
603         step2[12] = _mm_srai_epi16(step2[12], 2);
604         step2[13] = _mm_srai_epi16(step2[13], 2);
605         step2[14] = _mm_srai_epi16(step2[14], 2);
606         step2[15] = _mm_srai_epi16(step2[15], 2);
607         step1[16] = _mm_srai_epi16(step1[16], 2);
608         step1[17] = _mm_srai_epi16(step1[17], 2);
609         step1[18] = _mm_srai_epi16(step1[18], 2);
610         step1[19] = _mm_srai_epi16(step1[19], 2);
611         step2[20] = _mm_srai_epi16(step2[20], 2);
612         step2[21] = _mm_srai_epi16(step2[21], 2);
613         step2[22] = _mm_srai_epi16(step2[22], 2);
614         step2[23] = _mm_srai_epi16(step2[23], 2);
615         step2[24] = _mm_srai_epi16(step2[24], 2);
616         step2[25] = _mm_srai_epi16(step2[25], 2);
617         step2[26] = _mm_srai_epi16(step2[26], 2);
618         step2[27] = _mm_srai_epi16(step2[27], 2);
619         step1[28] = _mm_srai_epi16(step1[28], 2);
620         step1[29] = _mm_srai_epi16(step1[29], 2);
621         step1[30] = _mm_srai_epi16(step1[30], 2);
622         step1[31] = _mm_srai_epi16(step1[31], 2);
623       }
624 #endif  // !FDCT32x32_HIGH_PRECISION
625 
626 #if FDCT32x32_HIGH_PRECISION
627       if (pass == 0) {
628 #endif
629         // Stage 3
630         {
631           step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
632           step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
633           step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
634           step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
635           step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
636           step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
637           step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
638           step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
639 #if DCT_HIGH_BIT_DEPTH
640           overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
641                                              &step3[3], &step3[4], &step3[5],
642                                              &step3[6], &step3[7]);
643           if (overflow) {
644             if (pass == 0)
645               HIGH_FDCT32x32_2D_C(input, output_org, stride);
646             else
647               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
648             return;
649           }
650 #endif  // DCT_HIGH_BIT_DEPTH
651         }
652         {
653           const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
654           const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
655           const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
656           const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
657           const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
658           const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
659           const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
660           const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
661           const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
662           const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
663           const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
664           const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
665           // dct_const_round_shift
666           const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
667           const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
668           const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
669           const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
670           const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
671           const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
672           const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
673           const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
674           const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
675           const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
676           const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
677           const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
678           const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
679           const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
680           const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
681           const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
682           // Combine
683           step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
684           step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
685           step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
686           step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
687 #if DCT_HIGH_BIT_DEPTH
688           overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
689                                              &step3[13]);
690           if (overflow) {
691             if (pass == 0)
692               HIGH_FDCT32x32_2D_C(input, output_org, stride);
693             else
694               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
695             return;
696           }
697 #endif  // DCT_HIGH_BIT_DEPTH
698         }
699         {
700           step3[16] = ADD_EPI16(step2[23], step1[16]);
701           step3[17] = ADD_EPI16(step2[22], step1[17]);
702           step3[18] = ADD_EPI16(step2[21], step1[18]);
703           step3[19] = ADD_EPI16(step2[20], step1[19]);
704           step3[20] = SUB_EPI16(step1[19], step2[20]);
705           step3[21] = SUB_EPI16(step1[18], step2[21]);
706           step3[22] = SUB_EPI16(step1[17], step2[22]);
707           step3[23] = SUB_EPI16(step1[16], step2[23]);
708           step3[24] = SUB_EPI16(step1[31], step2[24]);
709           step3[25] = SUB_EPI16(step1[30], step2[25]);
710           step3[26] = SUB_EPI16(step1[29], step2[26]);
711           step3[27] = SUB_EPI16(step1[28], step2[27]);
712           step3[28] = ADD_EPI16(step2[27], step1[28]);
713           step3[29] = ADD_EPI16(step2[26], step1[29]);
714           step3[30] = ADD_EPI16(step2[25], step1[30]);
715           step3[31] = ADD_EPI16(step2[24], step1[31]);
716 #if DCT_HIGH_BIT_DEPTH
717           overflow = check_epi16_overflow_x16(
718               &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
719               &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
720               &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
721               &step3[31]);
722           if (overflow) {
723             if (pass == 0)
724               HIGH_FDCT32x32_2D_C(input, output_org, stride);
725             else
726               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
727             return;
728           }
729 #endif  // DCT_HIGH_BIT_DEPTH
730         }
731 
732         // Stage 4
733         {
734           step1[0] = ADD_EPI16(step3[3], step3[0]);
735           step1[1] = ADD_EPI16(step3[2], step3[1]);
736           step1[2] = SUB_EPI16(step3[1], step3[2]);
737           step1[3] = SUB_EPI16(step3[0], step3[3]);
738           step1[8] = ADD_EPI16(step3[11], step2[8]);
739           step1[9] = ADD_EPI16(step3[10], step2[9]);
740           step1[10] = SUB_EPI16(step2[9], step3[10]);
741           step1[11] = SUB_EPI16(step2[8], step3[11]);
742           step1[12] = SUB_EPI16(step2[15], step3[12]);
743           step1[13] = SUB_EPI16(step2[14], step3[13]);
744           step1[14] = ADD_EPI16(step3[13], step2[14]);
745           step1[15] = ADD_EPI16(step3[12], step2[15]);
746 #if DCT_HIGH_BIT_DEPTH
747           overflow = check_epi16_overflow_x16(
748               &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
749               &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
750               &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
751           if (overflow) {
752             if (pass == 0)
753               HIGH_FDCT32x32_2D_C(input, output_org, stride);
754             else
755               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
756             return;
757           }
758 #endif  // DCT_HIGH_BIT_DEPTH
759         }
760         {
761           const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
762           const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
763           const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
764           const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
765           const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
766           const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
767           // dct_const_round_shift
768           const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
769           const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
770           const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
771           const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
772           const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
773           const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
774           const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
775           const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
776           // Combine
777           step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
778           step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
779 #if DCT_HIGH_BIT_DEPTH
780           overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
781           if (overflow) {
782             if (pass == 0)
783               HIGH_FDCT32x32_2D_C(input, output_org, stride);
784             else
785               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
786             return;
787           }
788 #endif  // DCT_HIGH_BIT_DEPTH
789         }
790         {
791           const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
792           const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
793           const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
794           const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
795           const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
796           const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
797           const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
798           const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
799           const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
800           const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
801           const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
802           const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
803           const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
804           const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
805           const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
806           const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
807           const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
808           const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
809           const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
810           const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
811           const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
812           const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
813           const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
814           const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
815           // dct_const_round_shift
816           const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
817           const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
818           const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
819           const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
820           const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
821           const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
822           const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
823           const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
824           const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
825           const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
826           const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
827           const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
828           const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
829           const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
830           const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
831           const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
832           const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
833           const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
834           const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
835           const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
836           const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
837           const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
838           const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
839           const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
840           const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
841           const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
842           const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
843           const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
844           const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
845           const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
846           const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
847           const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
848           // Combine
849           step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
850           step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
851           step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
852           step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
853           step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
854           step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
855           step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
856           step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
857 #if DCT_HIGH_BIT_DEPTH
858           overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
859                                              &step1[21], &step1[26], &step1[27],
860                                              &step1[28], &step1[29]);
861           if (overflow) {
862             if (pass == 0)
863               HIGH_FDCT32x32_2D_C(input, output_org, stride);
864             else
865               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
866             return;
867           }
868 #endif  // DCT_HIGH_BIT_DEPTH
869         }
870         // Stage 5
871         {
872           step2[4] = ADD_EPI16(step1[5], step3[4]);
873           step2[5] = SUB_EPI16(step3[4], step1[5]);
874           step2[6] = SUB_EPI16(step3[7], step1[6]);
875           step2[7] = ADD_EPI16(step1[6], step3[7]);
876 #if DCT_HIGH_BIT_DEPTH
877           overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
878                                              &step2[7]);
879           if (overflow) {
880             if (pass == 0)
881               HIGH_FDCT32x32_2D_C(input, output_org, stride);
882             else
883               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
884             return;
885           }
886 #endif  // DCT_HIGH_BIT_DEPTH
887         }
888         {
889           const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
890           const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
891           const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
892           const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
893           const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
894           const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
895           const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
896           const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
897           const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
898           const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
899           const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
900           const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
901           // dct_const_round_shift
902           const __m128i out_00_4 =
903               _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
904           const __m128i out_00_5 =
905               _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
906           const __m128i out_16_4 =
907               _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
908           const __m128i out_16_5 =
909               _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
910           const __m128i out_08_4 =
911               _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
912           const __m128i out_08_5 =
913               _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
914           const __m128i out_24_4 =
915               _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
916           const __m128i out_24_5 =
917               _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
918           const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
919           const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
920           const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
921           const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
922           const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
923           const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
924           const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
925           const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
926           // Combine
927           out[0] = _mm_packs_epi32(out_00_6, out_00_7);
928           out[16] = _mm_packs_epi32(out_16_6, out_16_7);
929           out[8] = _mm_packs_epi32(out_08_6, out_08_7);
930           out[24] = _mm_packs_epi32(out_24_6, out_24_7);
931 #if DCT_HIGH_BIT_DEPTH
932           overflow =
933               check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
934           if (overflow) {
935             if (pass == 0)
936               HIGH_FDCT32x32_2D_C(input, output_org, stride);
937             else
938               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
939             return;
940           }
941 #endif  // DCT_HIGH_BIT_DEPTH
942         }
943         {
944           const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
945           const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
946           const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
947           const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
948           const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
949           const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
950           const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
951           const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
952           const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
953           const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
954           const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
955           const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
956           // dct_const_round_shift
957           const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
958           const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
959           const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
960           const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
961           const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
962           const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
963           const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
964           const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
965           const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
966           const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
967           const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
968           const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
969           const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
970           const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
971           const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
972           const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
973           // Combine
974           step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
975           step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
976           step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
977           step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
978 #if DCT_HIGH_BIT_DEPTH
979           overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
980                                              &step2[14]);
981           if (overflow) {
982             if (pass == 0)
983               HIGH_FDCT32x32_2D_C(input, output_org, stride);
984             else
985               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
986             return;
987           }
988 #endif  // DCT_HIGH_BIT_DEPTH
989         }
990         {
991           step2[16] = ADD_EPI16(step1[19], step3[16]);
992           step2[17] = ADD_EPI16(step1[18], step3[17]);
993           step2[18] = SUB_EPI16(step3[17], step1[18]);
994           step2[19] = SUB_EPI16(step3[16], step1[19]);
995           step2[20] = SUB_EPI16(step3[23], step1[20]);
996           step2[21] = SUB_EPI16(step3[22], step1[21]);
997           step2[22] = ADD_EPI16(step1[21], step3[22]);
998           step2[23] = ADD_EPI16(step1[20], step3[23]);
999           step2[24] = ADD_EPI16(step1[27], step3[24]);
1000           step2[25] = ADD_EPI16(step1[26], step3[25]);
1001           step2[26] = SUB_EPI16(step3[25], step1[26]);
1002           step2[27] = SUB_EPI16(step3[24], step1[27]);
1003           step2[28] = SUB_EPI16(step3[31], step1[28]);
1004           step2[29] = SUB_EPI16(step3[30], step1[29]);
1005           step2[30] = ADD_EPI16(step1[29], step3[30]);
1006           step2[31] = ADD_EPI16(step1[28], step3[31]);
1007 #if DCT_HIGH_BIT_DEPTH
1008           overflow = check_epi16_overflow_x16(
1009               &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
1010               &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
1011               &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
1012               &step2[31]);
1013           if (overflow) {
1014             if (pass == 0)
1015               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1016             else
1017               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1018             return;
1019           }
1020 #endif  // DCT_HIGH_BIT_DEPTH
1021         }
1022         // Stage 6
1023         {
1024           const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
1025           const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
1026           const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
1027           const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
1028           const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
1029           const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
1030           const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
1031           const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
1032           const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
1033           const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
1034           const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
1035           const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
1036           const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
1037           const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
1038           const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
1039           const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
1040           // dct_const_round_shift
1041           const __m128i out_04_4 =
1042               _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
1043           const __m128i out_04_5 =
1044               _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
1045           const __m128i out_20_4 =
1046               _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
1047           const __m128i out_20_5 =
1048               _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
1049           const __m128i out_12_4 =
1050               _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
1051           const __m128i out_12_5 =
1052               _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
1053           const __m128i out_28_4 =
1054               _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
1055           const __m128i out_28_5 =
1056               _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
1057           const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
1058           const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
1059           const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
1060           const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
1061           const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
1062           const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
1063           const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
1064           const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
1065           // Combine
1066           out[4] = _mm_packs_epi32(out_04_6, out_04_7);
1067           out[20] = _mm_packs_epi32(out_20_6, out_20_7);
1068           out[12] = _mm_packs_epi32(out_12_6, out_12_7);
1069           out[28] = _mm_packs_epi32(out_28_6, out_28_7);
1070 #if DCT_HIGH_BIT_DEPTH
1071           overflow =
1072               check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
1073           if (overflow) {
1074             if (pass == 0)
1075               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1076             else
1077               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1078             return;
1079           }
1080 #endif  // DCT_HIGH_BIT_DEPTH
1081         }
1082         {
1083           step3[8] = ADD_EPI16(step2[9], step1[8]);
1084           step3[9] = SUB_EPI16(step1[8], step2[9]);
1085           step3[10] = SUB_EPI16(step1[11], step2[10]);
1086           step3[11] = ADD_EPI16(step2[10], step1[11]);
1087           step3[12] = ADD_EPI16(step2[13], step1[12]);
1088           step3[13] = SUB_EPI16(step1[12], step2[13]);
1089           step3[14] = SUB_EPI16(step1[15], step2[14]);
1090           step3[15] = ADD_EPI16(step2[14], step1[15]);
1091 #if DCT_HIGH_BIT_DEPTH
1092           overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
1093                                              &step3[11], &step3[12], &step3[13],
1094                                              &step3[14], &step3[15]);
1095           if (overflow) {
1096             if (pass == 0)
1097               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1098             else
1099               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1100             return;
1101           }
1102 #endif  // DCT_HIGH_BIT_DEPTH
1103         }
1104         {
1105           const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
1106           const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
1107           const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
1108           const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
1109           const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
1110           const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
1111           const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
1112           const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
1113           const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
1114           const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
1115           const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
1116           const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
1117           const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
1118           const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
1119           const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
1120           const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
1121           const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
1122           const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
1123           const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
1124           const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
1125           const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
1126           const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
1127           const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
1128           const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
1129           // dct_const_round_shift
1130           const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
1131           const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
1132           const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
1133           const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
1134           const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
1135           const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
1136           const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
1137           const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
1138           const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
1139           const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
1140           const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
1141           const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
1142           const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
1143           const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
1144           const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
1145           const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
1146           const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
1147           const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
1148           const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
1149           const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
1150           const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
1151           const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
1152           const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
1153           const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
1154           const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
1155           const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
1156           const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
1157           const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
1158           const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
1159           const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
1160           const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
1161           const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
1162           // Combine
1163           step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
1164           step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
1165           step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
1166           step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
1167           // Combine
1168           step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
1169           step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
1170           step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
1171           step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
1172 #if DCT_HIGH_BIT_DEPTH
1173           overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
1174                                              &step3[22], &step3[25], &step3[26],
1175                                              &step3[29], &step3[30]);
1176           if (overflow) {
1177             if (pass == 0)
1178               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1179             else
1180               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1181             return;
1182           }
1183 #endif  // DCT_HIGH_BIT_DEPTH
1184         }
1185         // Stage 7
1186         {
1187           const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
1188           const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
1189           const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
1190           const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
1191           const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
1192           const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
1193           const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
1194           const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
1195           const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
1196           const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
1197           const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
1198           const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
1199           const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
1200           const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
1201           const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
1202           const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
1203           const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
1204           const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
1205           const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
1206           const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
1207           const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
1208           const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
1209           const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
1210           const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
1211           // dct_const_round_shift
1212           const __m128i out_02_4 =
1213               _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
1214           const __m128i out_02_5 =
1215               _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
1216           const __m128i out_18_4 =
1217               _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
1218           const __m128i out_18_5 =
1219               _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
1220           const __m128i out_10_4 =
1221               _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
1222           const __m128i out_10_5 =
1223               _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
1224           const __m128i out_26_4 =
1225               _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
1226           const __m128i out_26_5 =
1227               _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
1228           const __m128i out_06_4 =
1229               _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
1230           const __m128i out_06_5 =
1231               _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
1232           const __m128i out_22_4 =
1233               _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
1234           const __m128i out_22_5 =
1235               _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
1236           const __m128i out_14_4 =
1237               _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
1238           const __m128i out_14_5 =
1239               _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
1240           const __m128i out_30_4 =
1241               _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
1242           const __m128i out_30_5 =
1243               _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
1244           const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
1245           const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
1246           const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
1247           const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
1248           const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
1249           const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
1250           const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
1251           const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
1252           const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
1253           const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
1254           const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
1255           const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
1256           const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
1257           const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
1258           const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
1259           const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
1260           // Combine
1261           out[2] = _mm_packs_epi32(out_02_6, out_02_7);
1262           out[18] = _mm_packs_epi32(out_18_6, out_18_7);
1263           out[10] = _mm_packs_epi32(out_10_6, out_10_7);
1264           out[26] = _mm_packs_epi32(out_26_6, out_26_7);
1265           out[6] = _mm_packs_epi32(out_06_6, out_06_7);
1266           out[22] = _mm_packs_epi32(out_22_6, out_22_7);
1267           out[14] = _mm_packs_epi32(out_14_6, out_14_7);
1268           out[30] = _mm_packs_epi32(out_30_6, out_30_7);
1269 #if DCT_HIGH_BIT_DEPTH
1270           overflow =
1271               check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
1272                                       &out[6], &out[22], &out[14], &out[30]);
1273           if (overflow) {
1274             if (pass == 0)
1275               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1276             else
1277               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1278             return;
1279           }
1280 #endif  // DCT_HIGH_BIT_DEPTH
1281         }
1282         {
1283           step1[16] = ADD_EPI16(step3[17], step2[16]);
1284           step1[17] = SUB_EPI16(step2[16], step3[17]);
1285           step1[18] = SUB_EPI16(step2[19], step3[18]);
1286           step1[19] = ADD_EPI16(step3[18], step2[19]);
1287           step1[20] = ADD_EPI16(step3[21], step2[20]);
1288           step1[21] = SUB_EPI16(step2[20], step3[21]);
1289           step1[22] = SUB_EPI16(step2[23], step3[22]);
1290           step1[23] = ADD_EPI16(step3[22], step2[23]);
1291           step1[24] = ADD_EPI16(step3[25], step2[24]);
1292           step1[25] = SUB_EPI16(step2[24], step3[25]);
1293           step1[26] = SUB_EPI16(step2[27], step3[26]);
1294           step1[27] = ADD_EPI16(step3[26], step2[27]);
1295           step1[28] = ADD_EPI16(step3[29], step2[28]);
1296           step1[29] = SUB_EPI16(step2[28], step3[29]);
1297           step1[30] = SUB_EPI16(step2[31], step3[30]);
1298           step1[31] = ADD_EPI16(step3[30], step2[31]);
1299 #if DCT_HIGH_BIT_DEPTH
1300           overflow = check_epi16_overflow_x16(
1301               &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
1302               &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
1303               &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
1304               &step1[31]);
1305           if (overflow) {
1306             if (pass == 0)
1307               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1308             else
1309               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1310             return;
1311           }
1312 #endif  // DCT_HIGH_BIT_DEPTH
1313         }
1314         // Final stage --- output indices are bit-reversed.
1315         {
1316           const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
1317           const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
1318           const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
1319           const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
1320           const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
1321           const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
1322           const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
1323           const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
1324           const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
1325           const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
1326           const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
1327           const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
1328           const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
1329           const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
1330           const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
1331           const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
1332           const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
1333           const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
1334           const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
1335           const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
1336           const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
1337           const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
1338           const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
1339           const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
1340           // dct_const_round_shift
1341           const __m128i out_01_4 =
1342               _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
1343           const __m128i out_01_5 =
1344               _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
1345           const __m128i out_17_4 =
1346               _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
1347           const __m128i out_17_5 =
1348               _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
1349           const __m128i out_09_4 =
1350               _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
1351           const __m128i out_09_5 =
1352               _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
1353           const __m128i out_25_4 =
1354               _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
1355           const __m128i out_25_5 =
1356               _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
1357           const __m128i out_07_4 =
1358               _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
1359           const __m128i out_07_5 =
1360               _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
1361           const __m128i out_23_4 =
1362               _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
1363           const __m128i out_23_5 =
1364               _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
1365           const __m128i out_15_4 =
1366               _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
1367           const __m128i out_15_5 =
1368               _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
1369           const __m128i out_31_4 =
1370               _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
1371           const __m128i out_31_5 =
1372               _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
1373           const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
1374           const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
1375           const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
1376           const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
1377           const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
1378           const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
1379           const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
1380           const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
1381           const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
1382           const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
1383           const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
1384           const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
1385           const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
1386           const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
1387           const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
1388           const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
1389           // Combine
1390           out[1] = _mm_packs_epi32(out_01_6, out_01_7);
1391           out[17] = _mm_packs_epi32(out_17_6, out_17_7);
1392           out[9] = _mm_packs_epi32(out_09_6, out_09_7);
1393           out[25] = _mm_packs_epi32(out_25_6, out_25_7);
1394           out[7] = _mm_packs_epi32(out_07_6, out_07_7);
1395           out[23] = _mm_packs_epi32(out_23_6, out_23_7);
1396           out[15] = _mm_packs_epi32(out_15_6, out_15_7);
1397           out[31] = _mm_packs_epi32(out_31_6, out_31_7);
1398 #if DCT_HIGH_BIT_DEPTH
1399           overflow =
1400               check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
1401                                       &out[7], &out[23], &out[15], &out[31]);
1402           if (overflow) {
1403             if (pass == 0)
1404               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1405             else
1406               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1407             return;
1408           }
1409 #endif  // DCT_HIGH_BIT_DEPTH
1410         }
1411         {
1412           const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
1413           const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
1414           const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
1415           const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
1416           const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
1417           const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
1418           const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
1419           const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
1420           const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
1421           const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
1422           const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
1423           const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
1424           const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
1425           const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
1426           const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
1427           const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
1428           const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
1429           const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
1430           const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
1431           const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
1432           const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
1433           const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
1434           const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
1435           const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
1436           // dct_const_round_shift
1437           const __m128i out_05_4 =
1438               _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
1439           const __m128i out_05_5 =
1440               _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
1441           const __m128i out_21_4 =
1442               _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
1443           const __m128i out_21_5 =
1444               _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
1445           const __m128i out_13_4 =
1446               _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
1447           const __m128i out_13_5 =
1448               _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
1449           const __m128i out_29_4 =
1450               _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
1451           const __m128i out_29_5 =
1452               _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
1453           const __m128i out_03_4 =
1454               _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
1455           const __m128i out_03_5 =
1456               _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
1457           const __m128i out_19_4 =
1458               _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
1459           const __m128i out_19_5 =
1460               _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
1461           const __m128i out_11_4 =
1462               _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
1463           const __m128i out_11_5 =
1464               _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
1465           const __m128i out_27_4 =
1466               _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
1467           const __m128i out_27_5 =
1468               _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
1469           const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
1470           const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
1471           const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
1472           const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
1473           const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
1474           const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
1475           const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
1476           const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
1477           const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
1478           const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
1479           const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
1480           const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
1481           const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
1482           const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
1483           const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
1484           const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
1485           // Combine
1486           out[5] = _mm_packs_epi32(out_05_6, out_05_7);
1487           out[21] = _mm_packs_epi32(out_21_6, out_21_7);
1488           out[13] = _mm_packs_epi32(out_13_6, out_13_7);
1489           out[29] = _mm_packs_epi32(out_29_6, out_29_7);
1490           out[3] = _mm_packs_epi32(out_03_6, out_03_7);
1491           out[19] = _mm_packs_epi32(out_19_6, out_19_7);
1492           out[11] = _mm_packs_epi32(out_11_6, out_11_7);
1493           out[27] = _mm_packs_epi32(out_27_6, out_27_7);
1494 #if DCT_HIGH_BIT_DEPTH
1495           overflow =
1496               check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
1497                                       &out[3], &out[19], &out[11], &out[27]);
1498           if (overflow) {
1499             if (pass == 0)
1500               HIGH_FDCT32x32_2D_C(input, output_org, stride);
1501             else
1502               HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1503             return;
1504           }
1505 #endif  // DCT_HIGH_BIT_DEPTH
1506         }
1507 #if FDCT32x32_HIGH_PRECISION
1508       } else {
1509         __m128i lstep1[64], lstep2[64], lstep3[64];
1510         __m128i u[32], v[32], sign[16];
1511         const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
1512         // start using 32-bit operations
1513         // stage 3
1514         {
1515           // expanding to 32-bit length prior to addition operations
1516           lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
1517           lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
1518           lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
1519           lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
1520           lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
1521           lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
1522           lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
1523           lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
1524           lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
1525           lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
1526           lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
1527           lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
1528           lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
1529           lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
1530           lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
1531           lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
1532           lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
1533           lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
1534           lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
1535           lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
1536           lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
1537           lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
1538           lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
1539           lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
1540           lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
1541           lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
1542           lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
1543           lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
1544           lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
1545           lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
1546           lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
1547           lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
1548 
1549           lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
1550           lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
1551           lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
1552           lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
1553           lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
1554           lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
1555           lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
1556           lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
1557           lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
1558           lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
1559           lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
1560           lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
1561           lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
1562           lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
1563           lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
1564           lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
1565         }
1566         {
1567           const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
1568           const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
1569           const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
1570           const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
1571           const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
1572           const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
1573           const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
1574           const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
1575           const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
1576           const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
1577           const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
1578           const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
1579           // dct_const_round_shift
1580           const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
1581           const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
1582           const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
1583           const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
1584           const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
1585           const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
1586           const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
1587           const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
1588           lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
1589           lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
1590           lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
1591           lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
1592           lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
1593           lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
1594           lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
1595           lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
1596         }
1597         {
1598           lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
1599           lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
1600           lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
1601           lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
1602           lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
1603           lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
1604           lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
1605           lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
1606           lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
1607           lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
1608           lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
1609           lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
1610           lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
1611           lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
1612           lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
1613           lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
1614           lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
1615           lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
1616           lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
1617           lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
1618           lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
1619           lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
1620           lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
1621           lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
1622           lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
1623           lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
1624           lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
1625           lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
1626           lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
1627           lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
1628           lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
1629           lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
1630 
1631           lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
1632           lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
1633           lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
1634           lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
1635           lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
1636           lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
1637           lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
1638           lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
1639           lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
1640           lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
1641           lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
1642           lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
1643           lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
1644           lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
1645           lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
1646           lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
1647           lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
1648           lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
1649           lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
1650           lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
1651           lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
1652           lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
1653           lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
1654           lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
1655           lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
1656           lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
1657           lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
1658           lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
1659           lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
1660           lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
1661           lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
1662           lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
1663 
1664           lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
1665           lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
1666 
1667           lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
1668           lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
1669           lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
1670           lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
1671           lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
1672           lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
1673           lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
1674           lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
1675           lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
1676           lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
1677           lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
1678           lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
1679           lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
1680           lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
1681           lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
1682           lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
1683           lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
1684           lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
1685           lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
1686           lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
1687           lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
1688           lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
1689           lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
1690           lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
1691           lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
1692           lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
1693           lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
1694           lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
1695           lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
1696           lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
1697         }
1698 
1699         // stage 4
1700         {
1701           // expanding to 32-bit length prior to addition operations
1702           lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
1703           lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
1704           lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
1705           lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
1706           lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
1707           lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
1708           lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
1709           lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
1710           lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
1711           lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
1712           lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
1713           lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
1714           lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
1715           lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
1716           lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
1717           lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
1718 
1719           lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
1720           lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
1721           lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
1722           lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
1723           lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
1724           lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
1725           lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
1726           lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
1727           lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
1728           lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
1729           lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
1730           lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
1731           lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
1732           lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
1733           lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
1734           lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
1735           lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
1736           lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
1737           lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
1738           lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
1739           lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
1740           lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
1741           lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
1742           lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
1743         }
1744         {
1745           // to be continued...
1746           //
1747           const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
1748           const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
1749 
1750           u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
1751           u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
1752           u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
1753           u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
1754 
1755           // TODO(jingning): manually inline k_madd_epi32_ to further hide
1756           // instruction latency.
1757           v[0] = k_madd_epi32(u[0], k32_p16_m16);
1758           v[1] = k_madd_epi32(u[1], k32_p16_m16);
1759           v[2] = k_madd_epi32(u[2], k32_p16_m16);
1760           v[3] = k_madd_epi32(u[3], k32_p16_m16);
1761           v[4] = k_madd_epi32(u[0], k32_p16_p16);
1762           v[5] = k_madd_epi32(u[1], k32_p16_p16);
1763           v[6] = k_madd_epi32(u[2], k32_p16_p16);
1764           v[7] = k_madd_epi32(u[3], k32_p16_p16);
1765 #if DCT_HIGH_BIT_DEPTH
1766           overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
1767                                               &v[5], &v[6], &v[7], &kZero);
1768           if (overflow) {
1769             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1770             return;
1771           }
1772 #endif  // DCT_HIGH_BIT_DEPTH
1773           u[0] = k_packs_epi64(v[0], v[1]);
1774           u[1] = k_packs_epi64(v[2], v[3]);
1775           u[2] = k_packs_epi64(v[4], v[5]);
1776           u[3] = k_packs_epi64(v[6], v[7]);
1777 
1778           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1779           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1780           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1781           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1782 
1783           lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1784           lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1785           lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1786           lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1787         }
1788         {
1789           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1790           const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
1791           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1792 
1793           u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
1794           u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
1795           u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
1796           u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
1797           u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
1798           u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
1799           u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
1800           u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
1801           u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
1802           u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
1803           u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
1804           u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
1805           u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
1806           u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
1807           u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
1808           u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
1809 
1810           v[0] = k_madd_epi32(u[0], k32_m08_p24);
1811           v[1] = k_madd_epi32(u[1], k32_m08_p24);
1812           v[2] = k_madd_epi32(u[2], k32_m08_p24);
1813           v[3] = k_madd_epi32(u[3], k32_m08_p24);
1814           v[4] = k_madd_epi32(u[4], k32_m08_p24);
1815           v[5] = k_madd_epi32(u[5], k32_m08_p24);
1816           v[6] = k_madd_epi32(u[6], k32_m08_p24);
1817           v[7] = k_madd_epi32(u[7], k32_m08_p24);
1818           v[8] = k_madd_epi32(u[8], k32_m24_m08);
1819           v[9] = k_madd_epi32(u[9], k32_m24_m08);
1820           v[10] = k_madd_epi32(u[10], k32_m24_m08);
1821           v[11] = k_madd_epi32(u[11], k32_m24_m08);
1822           v[12] = k_madd_epi32(u[12], k32_m24_m08);
1823           v[13] = k_madd_epi32(u[13], k32_m24_m08);
1824           v[14] = k_madd_epi32(u[14], k32_m24_m08);
1825           v[15] = k_madd_epi32(u[15], k32_m24_m08);
1826           v[16] = k_madd_epi32(u[12], k32_m08_p24);
1827           v[17] = k_madd_epi32(u[13], k32_m08_p24);
1828           v[18] = k_madd_epi32(u[14], k32_m08_p24);
1829           v[19] = k_madd_epi32(u[15], k32_m08_p24);
1830           v[20] = k_madd_epi32(u[8], k32_m08_p24);
1831           v[21] = k_madd_epi32(u[9], k32_m08_p24);
1832           v[22] = k_madd_epi32(u[10], k32_m08_p24);
1833           v[23] = k_madd_epi32(u[11], k32_m08_p24);
1834           v[24] = k_madd_epi32(u[4], k32_p24_p08);
1835           v[25] = k_madd_epi32(u[5], k32_p24_p08);
1836           v[26] = k_madd_epi32(u[6], k32_p24_p08);
1837           v[27] = k_madd_epi32(u[7], k32_p24_p08);
1838           v[28] = k_madd_epi32(u[0], k32_p24_p08);
1839           v[29] = k_madd_epi32(u[1], k32_p24_p08);
1840           v[30] = k_madd_epi32(u[2], k32_p24_p08);
1841           v[31] = k_madd_epi32(u[3], k32_p24_p08);
1842 
1843 #if DCT_HIGH_BIT_DEPTH
1844           overflow = k_check_epi32_overflow_32(
1845               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
1846               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
1847               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
1848               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
1849           if (overflow) {
1850             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1851             return;
1852           }
1853 #endif  // DCT_HIGH_BIT_DEPTH
1854           u[0] = k_packs_epi64(v[0], v[1]);
1855           u[1] = k_packs_epi64(v[2], v[3]);
1856           u[2] = k_packs_epi64(v[4], v[5]);
1857           u[3] = k_packs_epi64(v[6], v[7]);
1858           u[4] = k_packs_epi64(v[8], v[9]);
1859           u[5] = k_packs_epi64(v[10], v[11]);
1860           u[6] = k_packs_epi64(v[12], v[13]);
1861           u[7] = k_packs_epi64(v[14], v[15]);
1862           u[8] = k_packs_epi64(v[16], v[17]);
1863           u[9] = k_packs_epi64(v[18], v[19]);
1864           u[10] = k_packs_epi64(v[20], v[21]);
1865           u[11] = k_packs_epi64(v[22], v[23]);
1866           u[12] = k_packs_epi64(v[24], v[25]);
1867           u[13] = k_packs_epi64(v[26], v[27]);
1868           u[14] = k_packs_epi64(v[28], v[29]);
1869           u[15] = k_packs_epi64(v[30], v[31]);
1870 
1871           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1872           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1873           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1874           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1875           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1876           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1877           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1878           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1879           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1880           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1881           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1882           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1883           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1884           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1885           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1886           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1887 
1888           lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1889           lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1890           lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1891           lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1892           lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1893           lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1894           lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1895           lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1896           lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1897           lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1898           lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1899           lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1900           lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1901           lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1902           lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1903           lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1904         }
1905         // stage 5
1906         {
1907           lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
1908           lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
1909           lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
1910           lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
1911           lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
1912           lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
1913           lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
1914           lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
1915         }
1916         {
1917           const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
1918           const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
1919           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
1920           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
1921 
1922           u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
1923           u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
1924           u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
1925           u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
1926           u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
1927           u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
1928           u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
1929           u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
1930 
1931           // TODO(jingning): manually inline k_madd_epi32_ to further hide
1932           // instruction latency.
1933           v[0] = k_madd_epi32(u[0], k32_p16_p16);
1934           v[1] = k_madd_epi32(u[1], k32_p16_p16);
1935           v[2] = k_madd_epi32(u[2], k32_p16_p16);
1936           v[3] = k_madd_epi32(u[3], k32_p16_p16);
1937           v[4] = k_madd_epi32(u[0], k32_p16_m16);
1938           v[5] = k_madd_epi32(u[1], k32_p16_m16);
1939           v[6] = k_madd_epi32(u[2], k32_p16_m16);
1940           v[7] = k_madd_epi32(u[3], k32_p16_m16);
1941           v[8] = k_madd_epi32(u[4], k32_p24_p08);
1942           v[9] = k_madd_epi32(u[5], k32_p24_p08);
1943           v[10] = k_madd_epi32(u[6], k32_p24_p08);
1944           v[11] = k_madd_epi32(u[7], k32_p24_p08);
1945           v[12] = k_madd_epi32(u[4], k32_m08_p24);
1946           v[13] = k_madd_epi32(u[5], k32_m08_p24);
1947           v[14] = k_madd_epi32(u[6], k32_m08_p24);
1948           v[15] = k_madd_epi32(u[7], k32_m08_p24);
1949 
1950 #if DCT_HIGH_BIT_DEPTH
1951           overflow = k_check_epi32_overflow_16(
1952               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
1953               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
1954           if (overflow) {
1955             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
1956             return;
1957           }
1958 #endif  // DCT_HIGH_BIT_DEPTH
1959           u[0] = k_packs_epi64(v[0], v[1]);
1960           u[1] = k_packs_epi64(v[2], v[3]);
1961           u[2] = k_packs_epi64(v[4], v[5]);
1962           u[3] = k_packs_epi64(v[6], v[7]);
1963           u[4] = k_packs_epi64(v[8], v[9]);
1964           u[5] = k_packs_epi64(v[10], v[11]);
1965           u[6] = k_packs_epi64(v[12], v[13]);
1966           u[7] = k_packs_epi64(v[14], v[15]);
1967 
1968           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1969           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1970           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1971           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1972           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1973           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1974           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1975           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1976 
1977           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1978           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1979           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1980           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1981           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1982           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1983           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1984           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1985 
1986           sign[0] = _mm_cmplt_epi32(u[0], kZero);
1987           sign[1] = _mm_cmplt_epi32(u[1], kZero);
1988           sign[2] = _mm_cmplt_epi32(u[2], kZero);
1989           sign[3] = _mm_cmplt_epi32(u[3], kZero);
1990           sign[4] = _mm_cmplt_epi32(u[4], kZero);
1991           sign[5] = _mm_cmplt_epi32(u[5], kZero);
1992           sign[6] = _mm_cmplt_epi32(u[6], kZero);
1993           sign[7] = _mm_cmplt_epi32(u[7], kZero);
1994 
1995           u[0] = _mm_sub_epi32(u[0], sign[0]);
1996           u[1] = _mm_sub_epi32(u[1], sign[1]);
1997           u[2] = _mm_sub_epi32(u[2], sign[2]);
1998           u[3] = _mm_sub_epi32(u[3], sign[3]);
1999           u[4] = _mm_sub_epi32(u[4], sign[4]);
2000           u[5] = _mm_sub_epi32(u[5], sign[5]);
2001           u[6] = _mm_sub_epi32(u[6], sign[6]);
2002           u[7] = _mm_sub_epi32(u[7], sign[7]);
2003 
2004           u[0] = _mm_add_epi32(u[0], K32One);
2005           u[1] = _mm_add_epi32(u[1], K32One);
2006           u[2] = _mm_add_epi32(u[2], K32One);
2007           u[3] = _mm_add_epi32(u[3], K32One);
2008           u[4] = _mm_add_epi32(u[4], K32One);
2009           u[5] = _mm_add_epi32(u[5], K32One);
2010           u[6] = _mm_add_epi32(u[6], K32One);
2011           u[7] = _mm_add_epi32(u[7], K32One);
2012 
2013           u[0] = _mm_srai_epi32(u[0], 2);
2014           u[1] = _mm_srai_epi32(u[1], 2);
2015           u[2] = _mm_srai_epi32(u[2], 2);
2016           u[3] = _mm_srai_epi32(u[3], 2);
2017           u[4] = _mm_srai_epi32(u[4], 2);
2018           u[5] = _mm_srai_epi32(u[5], 2);
2019           u[6] = _mm_srai_epi32(u[6], 2);
2020           u[7] = _mm_srai_epi32(u[7], 2);
2021 
2022           // Combine
2023           out[0] = _mm_packs_epi32(u[0], u[1]);
2024           out[16] = _mm_packs_epi32(u[2], u[3]);
2025           out[8] = _mm_packs_epi32(u[4], u[5]);
2026           out[24] = _mm_packs_epi32(u[6], u[7]);
2027 #if DCT_HIGH_BIT_DEPTH
2028           overflow =
2029               check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
2030           if (overflow) {
2031             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2032             return;
2033           }
2034 #endif  // DCT_HIGH_BIT_DEPTH
2035         }
2036         {
2037           const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
2038           const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
2039           const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
2040 
2041           u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
2042           u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
2043           u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
2044           u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
2045           u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
2046           u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
2047           u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
2048           u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
2049 
2050           v[0] = k_madd_epi32(u[0], k32_m08_p24);
2051           v[1] = k_madd_epi32(u[1], k32_m08_p24);
2052           v[2] = k_madd_epi32(u[2], k32_m08_p24);
2053           v[3] = k_madd_epi32(u[3], k32_m08_p24);
2054           v[4] = k_madd_epi32(u[4], k32_m24_m08);
2055           v[5] = k_madd_epi32(u[5], k32_m24_m08);
2056           v[6] = k_madd_epi32(u[6], k32_m24_m08);
2057           v[7] = k_madd_epi32(u[7], k32_m24_m08);
2058           v[8] = k_madd_epi32(u[4], k32_m08_p24);
2059           v[9] = k_madd_epi32(u[5], k32_m08_p24);
2060           v[10] = k_madd_epi32(u[6], k32_m08_p24);
2061           v[11] = k_madd_epi32(u[7], k32_m08_p24);
2062           v[12] = k_madd_epi32(u[0], k32_p24_p08);
2063           v[13] = k_madd_epi32(u[1], k32_p24_p08);
2064           v[14] = k_madd_epi32(u[2], k32_p24_p08);
2065           v[15] = k_madd_epi32(u[3], k32_p24_p08);
2066 
2067 #if DCT_HIGH_BIT_DEPTH
2068           overflow = k_check_epi32_overflow_16(
2069               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2070               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
2071           if (overflow) {
2072             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2073             return;
2074           }
2075 #endif  // DCT_HIGH_BIT_DEPTH
2076           u[0] = k_packs_epi64(v[0], v[1]);
2077           u[1] = k_packs_epi64(v[2], v[3]);
2078           u[2] = k_packs_epi64(v[4], v[5]);
2079           u[3] = k_packs_epi64(v[6], v[7]);
2080           u[4] = k_packs_epi64(v[8], v[9]);
2081           u[5] = k_packs_epi64(v[10], v[11]);
2082           u[6] = k_packs_epi64(v[12], v[13]);
2083           u[7] = k_packs_epi64(v[14], v[15]);
2084 
2085           u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2086           u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2087           u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2088           u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2089           u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2090           u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2091           u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2092           u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2093 
2094           lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
2095           lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
2096           lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
2097           lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
2098           lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
2099           lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
2100           lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
2101           lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
2102         }
2103         {
2104           lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
2105           lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
2106           lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
2107           lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
2108           lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
2109           lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
2110           lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
2111           lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
2112           lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
2113           lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
2114           lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
2115           lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
2116           lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
2117           lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
2118           lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
2119           lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
2120           lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
2121           lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
2122           lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
2123           lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
2124           lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
2125           lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
2126           lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
2127           lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
2128           lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
2129           lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
2130           lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
2131           lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
2132           lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
2133           lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
2134           lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
2135           lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
2136         }
2137         // stage 6
2138         {
2139           const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
2140           const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
2141           const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
2142           const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
2143 
2144           u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
2145           u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
2146           u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
2147           u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
2148           u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
2149           u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
2150           u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
2151           u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
2152           u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
2153           u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
2154           u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
2155           u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
2156           u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
2157           u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
2158           u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
2159           u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
2160 
2161           v[0] = k_madd_epi32(u[0], k32_p28_p04);
2162           v[1] = k_madd_epi32(u[1], k32_p28_p04);
2163           v[2] = k_madd_epi32(u[2], k32_p28_p04);
2164           v[3] = k_madd_epi32(u[3], k32_p28_p04);
2165           v[4] = k_madd_epi32(u[4], k32_p12_p20);
2166           v[5] = k_madd_epi32(u[5], k32_p12_p20);
2167           v[6] = k_madd_epi32(u[6], k32_p12_p20);
2168           v[7] = k_madd_epi32(u[7], k32_p12_p20);
2169           v[8] = k_madd_epi32(u[8], k32_m20_p12);
2170           v[9] = k_madd_epi32(u[9], k32_m20_p12);
2171           v[10] = k_madd_epi32(u[10], k32_m20_p12);
2172           v[11] = k_madd_epi32(u[11], k32_m20_p12);
2173           v[12] = k_madd_epi32(u[12], k32_m04_p28);
2174           v[13] = k_madd_epi32(u[13], k32_m04_p28);
2175           v[14] = k_madd_epi32(u[14], k32_m04_p28);
2176           v[15] = k_madd_epi32(u[15], k32_m04_p28);
2177 
2178 #if DCT_HIGH_BIT_DEPTH
2179           overflow = k_check_epi32_overflow_16(
2180               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2181               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
2182           if (overflow) {
2183             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2184             return;
2185           }
2186 #endif  // DCT_HIGH_BIT_DEPTH
2187           u[0] = k_packs_epi64(v[0], v[1]);
2188           u[1] = k_packs_epi64(v[2], v[3]);
2189           u[2] = k_packs_epi64(v[4], v[5]);
2190           u[3] = k_packs_epi64(v[6], v[7]);
2191           u[4] = k_packs_epi64(v[8], v[9]);
2192           u[5] = k_packs_epi64(v[10], v[11]);
2193           u[6] = k_packs_epi64(v[12], v[13]);
2194           u[7] = k_packs_epi64(v[14], v[15]);
2195 
2196           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2197           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2198           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2199           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2200           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2201           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2202           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2203           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2204 
2205           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2206           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2207           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2208           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2209           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2210           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2211           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2212           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2213 
2214           sign[0] = _mm_cmplt_epi32(u[0], kZero);
2215           sign[1] = _mm_cmplt_epi32(u[1], kZero);
2216           sign[2] = _mm_cmplt_epi32(u[2], kZero);
2217           sign[3] = _mm_cmplt_epi32(u[3], kZero);
2218           sign[4] = _mm_cmplt_epi32(u[4], kZero);
2219           sign[5] = _mm_cmplt_epi32(u[5], kZero);
2220           sign[6] = _mm_cmplt_epi32(u[6], kZero);
2221           sign[7] = _mm_cmplt_epi32(u[7], kZero);
2222 
2223           u[0] = _mm_sub_epi32(u[0], sign[0]);
2224           u[1] = _mm_sub_epi32(u[1], sign[1]);
2225           u[2] = _mm_sub_epi32(u[2], sign[2]);
2226           u[3] = _mm_sub_epi32(u[3], sign[3]);
2227           u[4] = _mm_sub_epi32(u[4], sign[4]);
2228           u[5] = _mm_sub_epi32(u[5], sign[5]);
2229           u[6] = _mm_sub_epi32(u[6], sign[6]);
2230           u[7] = _mm_sub_epi32(u[7], sign[7]);
2231 
2232           u[0] = _mm_add_epi32(u[0], K32One);
2233           u[1] = _mm_add_epi32(u[1], K32One);
2234           u[2] = _mm_add_epi32(u[2], K32One);
2235           u[3] = _mm_add_epi32(u[3], K32One);
2236           u[4] = _mm_add_epi32(u[4], K32One);
2237           u[5] = _mm_add_epi32(u[5], K32One);
2238           u[6] = _mm_add_epi32(u[6], K32One);
2239           u[7] = _mm_add_epi32(u[7], K32One);
2240 
2241           u[0] = _mm_srai_epi32(u[0], 2);
2242           u[1] = _mm_srai_epi32(u[1], 2);
2243           u[2] = _mm_srai_epi32(u[2], 2);
2244           u[3] = _mm_srai_epi32(u[3], 2);
2245           u[4] = _mm_srai_epi32(u[4], 2);
2246           u[5] = _mm_srai_epi32(u[5], 2);
2247           u[6] = _mm_srai_epi32(u[6], 2);
2248           u[7] = _mm_srai_epi32(u[7], 2);
2249 
2250           out[4] = _mm_packs_epi32(u[0], u[1]);
2251           out[20] = _mm_packs_epi32(u[2], u[3]);
2252           out[12] = _mm_packs_epi32(u[4], u[5]);
2253           out[28] = _mm_packs_epi32(u[6], u[7]);
2254 #if DCT_HIGH_BIT_DEPTH
2255           overflow =
2256               check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
2257           if (overflow) {
2258             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2259             return;
2260           }
2261 #endif  // DCT_HIGH_BIT_DEPTH
2262         }
2263         {
               // Butterfly step producing lstep3[16..31]. Every output is the
               // lane-wise 32-bit sum or difference of one lstep1 entry and one
               // lstep2 entry; the index pairs combined are 16/18, 17/19, 22/20,
               // 23/21, 24/26, 25/27, 30/28 and 31/29.
               // _mm_add_epi32/_mm_sub_epi32 are plain (wrapping) operations and
               // this scope performs no overflow check of its own.
               // NOTE(review): adjacent indices (2k, 2k+1) always receive the same
               // operation -- presumably they are the two 32-bit halves of one
               // widened 16-bit row; confirm against the widening code earlier in
               // this function.
2264           lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
2265           lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
2266           lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
2267           lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
2268           lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
2269           lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
2270           lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
2271           lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
2272           lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
2273           lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
2274           lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
2275           lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
2276           lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
2277           lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
2278           lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
2279           lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
2280         }
2281         {
               // Rotation step producing lstep3[34..37], [42..45], [50..53] and
               // [58..61] from lstep2. Eight lstep2 index pairs are interleaved,
               // multiplied by cosine constant pairs, rounded and scaled back by
               // DCT_CONST_BITS. Each interleaved pair is used twice, with two
               // different constant pairs, to form two outputs.
               // Constant pairs used below (built with pair_set_epi32, which is
               // defined outside this chunk).
2282           const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
2283           const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
2284           const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
2285           const __m128i k32_m12_m20 =
2286               pair_set_epi32(-cospi_12_64, -cospi_20_64);
2287           const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
2288           const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
2289 
               // Interleave the 32-bit lanes of each input pair so that the two
               // values to be combined sit next to each other:
               //   u[0..3]   <- lstep2[34]/[60], lstep2[35]/[61]
               //   u[4..7]   <- lstep2[36]/[58], lstep2[37]/[59]
               //   u[8..11]  <- lstep2[42]/[52], lstep2[43]/[53]
               //   u[12..15] <- lstep2[44]/[50], lstep2[45]/[51]
2290           u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
2291           u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
2292           u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
2293           u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
2294           u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
2295           u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
2296           u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
2297           u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
2298           u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
2299           u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
2300           u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
2301           u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
2302           u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
2303           u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
2304           u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
2305           u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
2306 
               // v[0..15] apply the first constant pair to each interleaved
               // group; v[16..31] revisit the groups in reverse order
               // (u[12..15], u[8..11], u[4..7], u[0..3]) with a second pair.
               // NOTE(review): k_madd_epi32 is defined outside this chunk;
               // presumably it multiplies adjacent 32-bit lanes by the constant
               // pair and sums them into 64-bit lanes -- confirm in the helper
               // header.
2307           v[0] = k_madd_epi32(u[0], k32_m04_p28);
2308           v[1] = k_madd_epi32(u[1], k32_m04_p28);
2309           v[2] = k_madd_epi32(u[2], k32_m04_p28);
2310           v[3] = k_madd_epi32(u[3], k32_m04_p28);
2311           v[4] = k_madd_epi32(u[4], k32_m28_m04);
2312           v[5] = k_madd_epi32(u[5], k32_m28_m04);
2313           v[6] = k_madd_epi32(u[6], k32_m28_m04);
2314           v[7] = k_madd_epi32(u[7], k32_m28_m04);
2315           v[8] = k_madd_epi32(u[8], k32_m20_p12);
2316           v[9] = k_madd_epi32(u[9], k32_m20_p12);
2317           v[10] = k_madd_epi32(u[10], k32_m20_p12);
2318           v[11] = k_madd_epi32(u[11], k32_m20_p12);
2319           v[12] = k_madd_epi32(u[12], k32_m12_m20);
2320           v[13] = k_madd_epi32(u[13], k32_m12_m20);
2321           v[14] = k_madd_epi32(u[14], k32_m12_m20);
2322           v[15] = k_madd_epi32(u[15], k32_m12_m20);
2323           v[16] = k_madd_epi32(u[12], k32_m20_p12);
2324           v[17] = k_madd_epi32(u[13], k32_m20_p12);
2325           v[18] = k_madd_epi32(u[14], k32_m20_p12);
2326           v[19] = k_madd_epi32(u[15], k32_m20_p12);
2327           v[20] = k_madd_epi32(u[8], k32_p12_p20);
2328           v[21] = k_madd_epi32(u[9], k32_p12_p20);
2329           v[22] = k_madd_epi32(u[10], k32_p12_p20);
2330           v[23] = k_madd_epi32(u[11], k32_p12_p20);
2331           v[24] = k_madd_epi32(u[4], k32_m04_p28);
2332           v[25] = k_madd_epi32(u[5], k32_m04_p28);
2333           v[26] = k_madd_epi32(u[6], k32_m04_p28);
2334           v[27] = k_madd_epi32(u[7], k32_m04_p28);
2335           v[28] = k_madd_epi32(u[0], k32_p28_p04);
2336           v[29] = k_madd_epi32(u[1], k32_p28_p04);
2337           v[30] = k_madd_epi32(u[2], k32_p28_p04);
2338           v[31] = k_madd_epi32(u[3], k32_p28_p04);
2339 
               // High-bit-depth builds only: if any of the 32 products is flagged
               // as overflowing, abandon the SIMD path, hand the work to
               // HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org) and return
               // from the enclosing function.
2340 #if DCT_HIGH_BIT_DEPTH
2341           overflow = k_check_epi32_overflow_32(
2342               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2343               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2344               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2345               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2346           if (overflow) {
2347             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2348             return;
2349           }
2350 #endif  // DCT_HIGH_BIT_DEPTH
               // Merge each pair of product vectors (v[2k], v[2k+1]) into a
               // single vector u[k] via k_packs_epi64 (helper defined outside
               // this chunk).
2351           u[0] = k_packs_epi64(v[0], v[1]);
2352           u[1] = k_packs_epi64(v[2], v[3]);
2353           u[2] = k_packs_epi64(v[4], v[5]);
2354           u[3] = k_packs_epi64(v[6], v[7]);
2355           u[4] = k_packs_epi64(v[8], v[9]);
2356           u[5] = k_packs_epi64(v[10], v[11]);
2357           u[6] = k_packs_epi64(v[12], v[13]);
2358           u[7] = k_packs_epi64(v[14], v[15]);
2359           u[8] = k_packs_epi64(v[16], v[17]);
2360           u[9] = k_packs_epi64(v[18], v[19]);
2361           u[10] = k_packs_epi64(v[20], v[21]);
2362           u[11] = k_packs_epi64(v[22], v[23]);
2363           u[12] = k_packs_epi64(v[24], v[25]);
2364           u[13] = k_packs_epi64(v[26], v[27]);
2365           u[14] = k_packs_epi64(v[28], v[29]);
2366           u[15] = k_packs_epi64(v[30], v[31]);
2367 
               // Add the rounding term before the DCT_CONST_BITS shift below.
               // NOTE(review): assumes k__DCT_CONST_ROUNDING is half of
               // 1 << DCT_CONST_BITS (round to nearest); it is defined outside
               // this chunk.
2368           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2369           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2370           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2371           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2372           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2373           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2374           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2375           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2376           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2377           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2378           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2379           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2380           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2381           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2382           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2383           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2384 
               // Arithmetic shift right by DCT_CONST_BITS and store. Note the
               // destination indices are not contiguous: results land in
               // lstep3[34..37], [42..45], [50..53] and [58..61]; the entries in
               // between are not written by this scope.
2385           lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2386           lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2387           lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2388           lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2389           lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2390           lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2391           lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2392           lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2393           lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2394           lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2395           lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2396           lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2397           lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2398           lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2399           lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2400           lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2401         }
2402         // stage 7
2403         {
               // Stage 7 rotations: turn lstep3[16..31] into eight final 16-bit
               // output vectors out[2], out[6], out[10], out[14], out[18],
               // out[22], out[26] and out[30]. Steps: interleave input pairs,
               // multiply by cosine constant pairs, round and shift by
               // DCT_CONST_BITS, apply the final sign-aware divide-by-4 rounding,
               // then pack to 16 bits with saturation.
2404           const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
2405           const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
2406           const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
2407           const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
2408           const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
2409           const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
2410           const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
2411           const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
2412 
               // Interleave the 32-bit lanes of each input pair:
               //   u[0..3]   <- lstep3[16]/[30], lstep3[17]/[31]
               //   u[4..7]   <- lstep3[18]/[28], lstep3[19]/[29]
               //   u[8..11]  <- lstep3[20]/[26], lstep3[21]/[27]
               //   u[12..15] <- lstep3[22]/[24], lstep3[23]/[25]
2413           u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
2414           u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
2415           u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
2416           u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
2417           u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
2418           u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
2419           u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
2420           u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
2421           u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
2422           u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
2423           u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
2424           u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
2425           u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
2426           u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
2427           u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
2428           u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
2429 
               // v[0..15] use the all-positive constant pairs; v[16..31] revisit
               // the same interleaved groups in reverse order (u[12..15] first,
               // u[0..3] last) with the pairs whose first term is negated.
               // NOTE(review): k_madd_epi32 is defined outside this chunk;
               // presumably a 32-bit multiply-add into 64-bit lanes -- confirm.
2430           v[0] = k_madd_epi32(u[0], k32_p30_p02);
2431           v[1] = k_madd_epi32(u[1], k32_p30_p02);
2432           v[2] = k_madd_epi32(u[2], k32_p30_p02);
2433           v[3] = k_madd_epi32(u[3], k32_p30_p02);
2434           v[4] = k_madd_epi32(u[4], k32_p14_p18);
2435           v[5] = k_madd_epi32(u[5], k32_p14_p18);
2436           v[6] = k_madd_epi32(u[6], k32_p14_p18);
2437           v[7] = k_madd_epi32(u[7], k32_p14_p18);
2438           v[8] = k_madd_epi32(u[8], k32_p22_p10);
2439           v[9] = k_madd_epi32(u[9], k32_p22_p10);
2440           v[10] = k_madd_epi32(u[10], k32_p22_p10);
2441           v[11] = k_madd_epi32(u[11], k32_p22_p10);
2442           v[12] = k_madd_epi32(u[12], k32_p06_p26);
2443           v[13] = k_madd_epi32(u[13], k32_p06_p26);
2444           v[14] = k_madd_epi32(u[14], k32_p06_p26);
2445           v[15] = k_madd_epi32(u[15], k32_p06_p26);
2446           v[16] = k_madd_epi32(u[12], k32_m26_p06);
2447           v[17] = k_madd_epi32(u[13], k32_m26_p06);
2448           v[18] = k_madd_epi32(u[14], k32_m26_p06);
2449           v[19] = k_madd_epi32(u[15], k32_m26_p06);
2450           v[20] = k_madd_epi32(u[8], k32_m10_p22);
2451           v[21] = k_madd_epi32(u[9], k32_m10_p22);
2452           v[22] = k_madd_epi32(u[10], k32_m10_p22);
2453           v[23] = k_madd_epi32(u[11], k32_m10_p22);
2454           v[24] = k_madd_epi32(u[4], k32_m18_p14);
2455           v[25] = k_madd_epi32(u[5], k32_m18_p14);
2456           v[26] = k_madd_epi32(u[6], k32_m18_p14);
2457           v[27] = k_madd_epi32(u[7], k32_m18_p14);
2458           v[28] = k_madd_epi32(u[0], k32_m02_p30);
2459           v[29] = k_madd_epi32(u[1], k32_m02_p30);
2460           v[30] = k_madd_epi32(u[2], k32_m02_p30);
2461           v[31] = k_madd_epi32(u[3], k32_m02_p30);
2462 
               // High-bit-depth builds only: on a flagged overflow in any of the
               // 32 products, fall back to
               // HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org) and return
               // from the enclosing function.
2463 #if DCT_HIGH_BIT_DEPTH
2464           overflow = k_check_epi32_overflow_32(
2465               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2466               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2467               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2468               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2469           if (overflow) {
2470             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2471             return;
2472           }
2473 #endif  // DCT_HIGH_BIT_DEPTH
               // Merge each pair of product vectors (v[2k], v[2k+1]) into u[k].
2474           u[0] = k_packs_epi64(v[0], v[1]);
2475           u[1] = k_packs_epi64(v[2], v[3]);
2476           u[2] = k_packs_epi64(v[4], v[5]);
2477           u[3] = k_packs_epi64(v[6], v[7]);
2478           u[4] = k_packs_epi64(v[8], v[9]);
2479           u[5] = k_packs_epi64(v[10], v[11]);
2480           u[6] = k_packs_epi64(v[12], v[13]);
2481           u[7] = k_packs_epi64(v[14], v[15]);
2482           u[8] = k_packs_epi64(v[16], v[17]);
2483           u[9] = k_packs_epi64(v[18], v[19]);
2484           u[10] = k_packs_epi64(v[20], v[21]);
2485           u[11] = k_packs_epi64(v[22], v[23]);
2486           u[12] = k_packs_epi64(v[24], v[25]);
2487           u[13] = k_packs_epi64(v[26], v[27]);
2488           u[14] = k_packs_epi64(v[28], v[29]);
2489           u[15] = k_packs_epi64(v[30], v[31]);
2490 
               // Fixed-point rescale: add the rounding term, then arithmetic
               // shift right by DCT_CONST_BITS.
2491           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2492           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2493           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2494           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2495           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2496           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2497           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2498           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2499           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2500           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2501           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2502           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2503           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2504           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2505           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2506           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2507 
2508           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2509           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2510           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2511           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2512           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2513           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2514           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2515           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2516           u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2517           u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2518           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2519           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2520           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2521           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2522           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2523           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2524 
               // Final output rounding in four steps. The compare yields an
               // all-ones mask (-1) in every negative lane, so subtracting the
               // mask adds 1 to negative values only; K32One is then added and
               // the sum is shifted right by 2.
               // NOTE(review): assuming K32One holds 1 in each 32-bit lane, this
               // matches the (x + 1 + (x < 0)) >> 2 rounding used by the C row
               // transform near the top of this file -- confirm K32One's
               // definition.
2525           v[0] = _mm_cmplt_epi32(u[0], kZero);
2526           v[1] = _mm_cmplt_epi32(u[1], kZero);
2527           v[2] = _mm_cmplt_epi32(u[2], kZero);
2528           v[3] = _mm_cmplt_epi32(u[3], kZero);
2529           v[4] = _mm_cmplt_epi32(u[4], kZero);
2530           v[5] = _mm_cmplt_epi32(u[5], kZero);
2531           v[6] = _mm_cmplt_epi32(u[6], kZero);
2532           v[7] = _mm_cmplt_epi32(u[7], kZero);
2533           v[8] = _mm_cmplt_epi32(u[8], kZero);
2534           v[9] = _mm_cmplt_epi32(u[9], kZero);
2535           v[10] = _mm_cmplt_epi32(u[10], kZero);
2536           v[11] = _mm_cmplt_epi32(u[11], kZero);
2537           v[12] = _mm_cmplt_epi32(u[12], kZero);
2538           v[13] = _mm_cmplt_epi32(u[13], kZero);
2539           v[14] = _mm_cmplt_epi32(u[14], kZero);
2540           v[15] = _mm_cmplt_epi32(u[15], kZero);
2541 
2542           u[0] = _mm_sub_epi32(u[0], v[0]);
2543           u[1] = _mm_sub_epi32(u[1], v[1]);
2544           u[2] = _mm_sub_epi32(u[2], v[2]);
2545           u[3] = _mm_sub_epi32(u[3], v[3]);
2546           u[4] = _mm_sub_epi32(u[4], v[4]);
2547           u[5] = _mm_sub_epi32(u[5], v[5]);
2548           u[6] = _mm_sub_epi32(u[6], v[6]);
2549           u[7] = _mm_sub_epi32(u[7], v[7]);
2550           u[8] = _mm_sub_epi32(u[8], v[8]);
2551           u[9] = _mm_sub_epi32(u[9], v[9]);
2552           u[10] = _mm_sub_epi32(u[10], v[10]);
2553           u[11] = _mm_sub_epi32(u[11], v[11]);
2554           u[12] = _mm_sub_epi32(u[12], v[12]);
2555           u[13] = _mm_sub_epi32(u[13], v[13]);
2556           u[14] = _mm_sub_epi32(u[14], v[14]);
2557           u[15] = _mm_sub_epi32(u[15], v[15]);
2558 
2559           v[0] = _mm_add_epi32(u[0], K32One);
2560           v[1] = _mm_add_epi32(u[1], K32One);
2561           v[2] = _mm_add_epi32(u[2], K32One);
2562           v[3] = _mm_add_epi32(u[3], K32One);
2563           v[4] = _mm_add_epi32(u[4], K32One);
2564           v[5] = _mm_add_epi32(u[5], K32One);
2565           v[6] = _mm_add_epi32(u[6], K32One);
2566           v[7] = _mm_add_epi32(u[7], K32One);
2567           v[8] = _mm_add_epi32(u[8], K32One);
2568           v[9] = _mm_add_epi32(u[9], K32One);
2569           v[10] = _mm_add_epi32(u[10], K32One);
2570           v[11] = _mm_add_epi32(u[11], K32One);
2571           v[12] = _mm_add_epi32(u[12], K32One);
2572           v[13] = _mm_add_epi32(u[13], K32One);
2573           v[14] = _mm_add_epi32(u[14], K32One);
2574           v[15] = _mm_add_epi32(u[15], K32One);
2575 
2576           u[0] = _mm_srai_epi32(v[0], 2);
2577           u[1] = _mm_srai_epi32(v[1], 2);
2578           u[2] = _mm_srai_epi32(v[2], 2);
2579           u[3] = _mm_srai_epi32(v[3], 2);
2580           u[4] = _mm_srai_epi32(v[4], 2);
2581           u[5] = _mm_srai_epi32(v[5], 2);
2582           u[6] = _mm_srai_epi32(v[6], 2);
2583           u[7] = _mm_srai_epi32(v[7], 2);
2584           u[8] = _mm_srai_epi32(v[8], 2);
2585           u[9] = _mm_srai_epi32(v[9], 2);
2586           u[10] = _mm_srai_epi32(v[10], 2);
2587           u[11] = _mm_srai_epi32(v[11], 2);
2588           u[12] = _mm_srai_epi32(v[12], 2);
2589           u[13] = _mm_srai_epi32(v[13], 2);
2590           u[14] = _mm_srai_epi32(v[14], 2);
2591           u[15] = _mm_srai_epi32(v[15], 2);
2592 
               // Narrow each pair of 32-bit vectors into one 16-bit vector with
               // signed saturation. The out[] slots are written in a permuted
               // order (2, 18, 10, 26, 6, 22, 14, 30), not ascending.
2593           out[2] = _mm_packs_epi32(u[0], u[1]);
2594           out[18] = _mm_packs_epi32(u[2], u[3]);
2595           out[10] = _mm_packs_epi32(u[4], u[5]);
2596           out[26] = _mm_packs_epi32(u[6], u[7]);
2597           out[6] = _mm_packs_epi32(u[8], u[9]);
2598           out[22] = _mm_packs_epi32(u[10], u[11]);
2599           out[14] = _mm_packs_epi32(u[12], u[13]);
2600           out[30] = _mm_packs_epi32(u[14], u[15]);
               // High-bit-depth builds only: check the eight packed 16-bit
               // results and take the same C fallback if any is flagged.
2601 #if DCT_HIGH_BIT_DEPTH
2602           overflow =
2603               check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
2604                                       &out[6], &out[22], &out[14], &out[30]);
2605           if (overflow) {
2606             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2607             return;
2608           }
2609 #endif  // DCT_HIGH_BIT_DEPTH
2610         }
2611         {
               // Butterfly step that overwrites lstep1[32..63]. Every output is
               // the lane-wise 32-bit sum or difference of one lstep2 entry and
               // one lstep3 entry. The lstep3 operands are exactly the entries
               // [34..37], [42..45], [50..53] and [58..61]; the lstep2 operands
               // are the remaining indices in 32..63.
               // The pattern repeats every eight outputs: two sums, four
               // differences, two sums (e.g. 32,33 = add; 34..37 = sub;
               // 38,39 = add).
               // _mm_add_epi32/_mm_sub_epi32 wrap on overflow and no overflow
               // check is made in this scope.
2612           lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
2613           lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
2614           lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
2615           lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
2616           lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
2617           lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
2618           lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
2619           lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
2620           lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
2621           lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
2622           lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
2623           lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
2624           lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
2625           lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
2626           lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
2627           lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
2628           lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
2629           lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
2630           lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
2631           lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
2632           lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
2633           lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
2634           lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
2635           lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
2636           lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
2637           lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
2638           lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
2639           lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
2640           lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
2641           lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
2642           lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
2643           lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
2644         }
2645         // stage 8
2646         {
2647           const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
2648           const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
2649           const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
2650           const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
2651           const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
2652           const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
2653           const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
2654           const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
2655 
2656           u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
2657           u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
2658           u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
2659           u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
2660           u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
2661           u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
2662           u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
2663           u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
2664           u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
2665           u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
2666           u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
2667           u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
2668           u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
2669           u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
2670           u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
2671           u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
2672 
2673           v[0] = k_madd_epi32(u[0], k32_p31_p01);
2674           v[1] = k_madd_epi32(u[1], k32_p31_p01);
2675           v[2] = k_madd_epi32(u[2], k32_p31_p01);
2676           v[3] = k_madd_epi32(u[3], k32_p31_p01);
2677           v[4] = k_madd_epi32(u[4], k32_p15_p17);
2678           v[5] = k_madd_epi32(u[5], k32_p15_p17);
2679           v[6] = k_madd_epi32(u[6], k32_p15_p17);
2680           v[7] = k_madd_epi32(u[7], k32_p15_p17);
2681           v[8] = k_madd_epi32(u[8], k32_p23_p09);
2682           v[9] = k_madd_epi32(u[9], k32_p23_p09);
2683           v[10] = k_madd_epi32(u[10], k32_p23_p09);
2684           v[11] = k_madd_epi32(u[11], k32_p23_p09);
2685           v[12] = k_madd_epi32(u[12], k32_p07_p25);
2686           v[13] = k_madd_epi32(u[13], k32_p07_p25);
2687           v[14] = k_madd_epi32(u[14], k32_p07_p25);
2688           v[15] = k_madd_epi32(u[15], k32_p07_p25);
2689           v[16] = k_madd_epi32(u[12], k32_m25_p07);
2690           v[17] = k_madd_epi32(u[13], k32_m25_p07);
2691           v[18] = k_madd_epi32(u[14], k32_m25_p07);
2692           v[19] = k_madd_epi32(u[15], k32_m25_p07);
2693           v[20] = k_madd_epi32(u[8], k32_m09_p23);
2694           v[21] = k_madd_epi32(u[9], k32_m09_p23);
2695           v[22] = k_madd_epi32(u[10], k32_m09_p23);
2696           v[23] = k_madd_epi32(u[11], k32_m09_p23);
2697           v[24] = k_madd_epi32(u[4], k32_m17_p15);
2698           v[25] = k_madd_epi32(u[5], k32_m17_p15);
2699           v[26] = k_madd_epi32(u[6], k32_m17_p15);
2700           v[27] = k_madd_epi32(u[7], k32_m17_p15);
2701           v[28] = k_madd_epi32(u[0], k32_m01_p31);
2702           v[29] = k_madd_epi32(u[1], k32_m01_p31);
2703           v[30] = k_madd_epi32(u[2], k32_m01_p31);
2704           v[31] = k_madd_epi32(u[3], k32_m01_p31);
2705 
2706 #if DCT_HIGH_BIT_DEPTH
2707           overflow = k_check_epi32_overflow_32(
2708               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2709               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2710               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2711               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2712           if (overflow) {
2713             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2714             return;
2715           }
2716 #endif  // DCT_HIGH_BIT_DEPTH
2717           u[0] = k_packs_epi64(v[0], v[1]);
2718           u[1] = k_packs_epi64(v[2], v[3]);
2719           u[2] = k_packs_epi64(v[4], v[5]);
2720           u[3] = k_packs_epi64(v[6], v[7]);
2721           u[4] = k_packs_epi64(v[8], v[9]);
2722           u[5] = k_packs_epi64(v[10], v[11]);
2723           u[6] = k_packs_epi64(v[12], v[13]);
2724           u[7] = k_packs_epi64(v[14], v[15]);
2725           u[8] = k_packs_epi64(v[16], v[17]);
2726           u[9] = k_packs_epi64(v[18], v[19]);
2727           u[10] = k_packs_epi64(v[20], v[21]);
2728           u[11] = k_packs_epi64(v[22], v[23]);
2729           u[12] = k_packs_epi64(v[24], v[25]);
2730           u[13] = k_packs_epi64(v[26], v[27]);
2731           u[14] = k_packs_epi64(v[28], v[29]);
2732           u[15] = k_packs_epi64(v[30], v[31]);
2733 
2734           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2735           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2736           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2737           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2738           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2739           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2740           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2741           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2742           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2743           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2744           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2745           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2746           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2747           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2748           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2749           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2750 
2751           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2752           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2753           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2754           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2755           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2756           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2757           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2758           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2759           u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2760           u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2761           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2762           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2763           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2764           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2765           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2766           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2767 
2768           v[0] = _mm_cmplt_epi32(u[0], kZero);
2769           v[1] = _mm_cmplt_epi32(u[1], kZero);
2770           v[2] = _mm_cmplt_epi32(u[2], kZero);
2771           v[3] = _mm_cmplt_epi32(u[3], kZero);
2772           v[4] = _mm_cmplt_epi32(u[4], kZero);
2773           v[5] = _mm_cmplt_epi32(u[5], kZero);
2774           v[6] = _mm_cmplt_epi32(u[6], kZero);
2775           v[7] = _mm_cmplt_epi32(u[7], kZero);
2776           v[8] = _mm_cmplt_epi32(u[8], kZero);
2777           v[9] = _mm_cmplt_epi32(u[9], kZero);
2778           v[10] = _mm_cmplt_epi32(u[10], kZero);
2779           v[11] = _mm_cmplt_epi32(u[11], kZero);
2780           v[12] = _mm_cmplt_epi32(u[12], kZero);
2781           v[13] = _mm_cmplt_epi32(u[13], kZero);
2782           v[14] = _mm_cmplt_epi32(u[14], kZero);
2783           v[15] = _mm_cmplt_epi32(u[15], kZero);
2784 
2785           u[0] = _mm_sub_epi32(u[0], v[0]);
2786           u[1] = _mm_sub_epi32(u[1], v[1]);
2787           u[2] = _mm_sub_epi32(u[2], v[2]);
2788           u[3] = _mm_sub_epi32(u[3], v[3]);
2789           u[4] = _mm_sub_epi32(u[4], v[4]);
2790           u[5] = _mm_sub_epi32(u[5], v[5]);
2791           u[6] = _mm_sub_epi32(u[6], v[6]);
2792           u[7] = _mm_sub_epi32(u[7], v[7]);
2793           u[8] = _mm_sub_epi32(u[8], v[8]);
2794           u[9] = _mm_sub_epi32(u[9], v[9]);
2795           u[10] = _mm_sub_epi32(u[10], v[10]);
2796           u[11] = _mm_sub_epi32(u[11], v[11]);
2797           u[12] = _mm_sub_epi32(u[12], v[12]);
2798           u[13] = _mm_sub_epi32(u[13], v[13]);
2799           u[14] = _mm_sub_epi32(u[14], v[14]);
2800           u[15] = _mm_sub_epi32(u[15], v[15]);
2801 
2802           v[0] = _mm_add_epi32(u[0], K32One);
2803           v[1] = _mm_add_epi32(u[1], K32One);
2804           v[2] = _mm_add_epi32(u[2], K32One);
2805           v[3] = _mm_add_epi32(u[3], K32One);
2806           v[4] = _mm_add_epi32(u[4], K32One);
2807           v[5] = _mm_add_epi32(u[5], K32One);
2808           v[6] = _mm_add_epi32(u[6], K32One);
2809           v[7] = _mm_add_epi32(u[7], K32One);
2810           v[8] = _mm_add_epi32(u[8], K32One);
2811           v[9] = _mm_add_epi32(u[9], K32One);
2812           v[10] = _mm_add_epi32(u[10], K32One);
2813           v[11] = _mm_add_epi32(u[11], K32One);
2814           v[12] = _mm_add_epi32(u[12], K32One);
2815           v[13] = _mm_add_epi32(u[13], K32One);
2816           v[14] = _mm_add_epi32(u[14], K32One);
2817           v[15] = _mm_add_epi32(u[15], K32One);
2818 
2819           u[0] = _mm_srai_epi32(v[0], 2);
2820           u[1] = _mm_srai_epi32(v[1], 2);
2821           u[2] = _mm_srai_epi32(v[2], 2);
2822           u[3] = _mm_srai_epi32(v[3], 2);
2823           u[4] = _mm_srai_epi32(v[4], 2);
2824           u[5] = _mm_srai_epi32(v[5], 2);
2825           u[6] = _mm_srai_epi32(v[6], 2);
2826           u[7] = _mm_srai_epi32(v[7], 2);
2827           u[8] = _mm_srai_epi32(v[8], 2);
2828           u[9] = _mm_srai_epi32(v[9], 2);
2829           u[10] = _mm_srai_epi32(v[10], 2);
2830           u[11] = _mm_srai_epi32(v[11], 2);
2831           u[12] = _mm_srai_epi32(v[12], 2);
2832           u[13] = _mm_srai_epi32(v[13], 2);
2833           u[14] = _mm_srai_epi32(v[14], 2);
2834           u[15] = _mm_srai_epi32(v[15], 2);
2835 
2836           out[1] = _mm_packs_epi32(u[0], u[1]);
2837           out[17] = _mm_packs_epi32(u[2], u[3]);
2838           out[9] = _mm_packs_epi32(u[4], u[5]);
2839           out[25] = _mm_packs_epi32(u[6], u[7]);
2840           out[7] = _mm_packs_epi32(u[8], u[9]);
2841           out[23] = _mm_packs_epi32(u[10], u[11]);
2842           out[15] = _mm_packs_epi32(u[12], u[13]);
2843           out[31] = _mm_packs_epi32(u[14], u[15]);
2844 #if DCT_HIGH_BIT_DEPTH
2845           overflow =
2846               check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
2847                                       &out[7], &out[23], &out[15], &out[31]);
2848           if (overflow) {
2849             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2850             return;
2851           }
2852 #endif  // DCT_HIGH_BIT_DEPTH
2853         }
2854         {
2855           const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
2856           const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
2857           const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
2858           const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
2859           const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
2860           const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
2861           const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
2862           const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
2863 
2864           u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
2865           u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
2866           u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
2867           u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
2868           u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
2869           u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
2870           u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
2871           u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
2872           u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
2873           u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
2874           u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
2875           u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
2876           u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
2877           u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
2878           u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
2879           u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
2880 
2881           v[0] = k_madd_epi32(u[0], k32_p27_p05);
2882           v[1] = k_madd_epi32(u[1], k32_p27_p05);
2883           v[2] = k_madd_epi32(u[2], k32_p27_p05);
2884           v[3] = k_madd_epi32(u[3], k32_p27_p05);
2885           v[4] = k_madd_epi32(u[4], k32_p11_p21);
2886           v[5] = k_madd_epi32(u[5], k32_p11_p21);
2887           v[6] = k_madd_epi32(u[6], k32_p11_p21);
2888           v[7] = k_madd_epi32(u[7], k32_p11_p21);
2889           v[8] = k_madd_epi32(u[8], k32_p19_p13);
2890           v[9] = k_madd_epi32(u[9], k32_p19_p13);
2891           v[10] = k_madd_epi32(u[10], k32_p19_p13);
2892           v[11] = k_madd_epi32(u[11], k32_p19_p13);
2893           v[12] = k_madd_epi32(u[12], k32_p03_p29);
2894           v[13] = k_madd_epi32(u[13], k32_p03_p29);
2895           v[14] = k_madd_epi32(u[14], k32_p03_p29);
2896           v[15] = k_madd_epi32(u[15], k32_p03_p29);
2897           v[16] = k_madd_epi32(u[12], k32_m29_p03);
2898           v[17] = k_madd_epi32(u[13], k32_m29_p03);
2899           v[18] = k_madd_epi32(u[14], k32_m29_p03);
2900           v[19] = k_madd_epi32(u[15], k32_m29_p03);
2901           v[20] = k_madd_epi32(u[8], k32_m13_p19);
2902           v[21] = k_madd_epi32(u[9], k32_m13_p19);
2903           v[22] = k_madd_epi32(u[10], k32_m13_p19);
2904           v[23] = k_madd_epi32(u[11], k32_m13_p19);
2905           v[24] = k_madd_epi32(u[4], k32_m21_p11);
2906           v[25] = k_madd_epi32(u[5], k32_m21_p11);
2907           v[26] = k_madd_epi32(u[6], k32_m21_p11);
2908           v[27] = k_madd_epi32(u[7], k32_m21_p11);
2909           v[28] = k_madd_epi32(u[0], k32_m05_p27);
2910           v[29] = k_madd_epi32(u[1], k32_m05_p27);
2911           v[30] = k_madd_epi32(u[2], k32_m05_p27);
2912           v[31] = k_madd_epi32(u[3], k32_m05_p27);
2913 
2914 #if DCT_HIGH_BIT_DEPTH
2915           overflow = k_check_epi32_overflow_32(
2916               &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
2917               &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
2918               &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
2919               &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
2920           if (overflow) {
2921             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
2922             return;
2923           }
2924 #endif  // DCT_HIGH_BIT_DEPTH
2925           u[0] = k_packs_epi64(v[0], v[1]);
2926           u[1] = k_packs_epi64(v[2], v[3]);
2927           u[2] = k_packs_epi64(v[4], v[5]);
2928           u[3] = k_packs_epi64(v[6], v[7]);
2929           u[4] = k_packs_epi64(v[8], v[9]);
2930           u[5] = k_packs_epi64(v[10], v[11]);
2931           u[6] = k_packs_epi64(v[12], v[13]);
2932           u[7] = k_packs_epi64(v[14], v[15]);
2933           u[8] = k_packs_epi64(v[16], v[17]);
2934           u[9] = k_packs_epi64(v[18], v[19]);
2935           u[10] = k_packs_epi64(v[20], v[21]);
2936           u[11] = k_packs_epi64(v[22], v[23]);
2937           u[12] = k_packs_epi64(v[24], v[25]);
2938           u[13] = k_packs_epi64(v[26], v[27]);
2939           u[14] = k_packs_epi64(v[28], v[29]);
2940           u[15] = k_packs_epi64(v[30], v[31]);
2941 
2942           v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
2943           v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
2944           v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
2945           v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
2946           v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
2947           v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
2948           v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
2949           v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
2950           v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
2951           v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
2952           v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
2953           v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
2954           v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
2955           v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
2956           v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
2957           v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
2958 
2959           u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
2960           u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
2961           u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
2962           u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
2963           u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
2964           u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
2965           u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
2966           u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
2967           u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
2968           u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
2969           u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
2970           u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
2971           u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
2972           u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
2973           u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
2974           u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
2975 
2976           v[0] = _mm_cmplt_epi32(u[0], kZero);
2977           v[1] = _mm_cmplt_epi32(u[1], kZero);
2978           v[2] = _mm_cmplt_epi32(u[2], kZero);
2979           v[3] = _mm_cmplt_epi32(u[3], kZero);
2980           v[4] = _mm_cmplt_epi32(u[4], kZero);
2981           v[5] = _mm_cmplt_epi32(u[5], kZero);
2982           v[6] = _mm_cmplt_epi32(u[6], kZero);
2983           v[7] = _mm_cmplt_epi32(u[7], kZero);
2984           v[8] = _mm_cmplt_epi32(u[8], kZero);
2985           v[9] = _mm_cmplt_epi32(u[9], kZero);
2986           v[10] = _mm_cmplt_epi32(u[10], kZero);
2987           v[11] = _mm_cmplt_epi32(u[11], kZero);
2988           v[12] = _mm_cmplt_epi32(u[12], kZero);
2989           v[13] = _mm_cmplt_epi32(u[13], kZero);
2990           v[14] = _mm_cmplt_epi32(u[14], kZero);
2991           v[15] = _mm_cmplt_epi32(u[15], kZero);
2992 
2993           u[0] = _mm_sub_epi32(u[0], v[0]);
2994           u[1] = _mm_sub_epi32(u[1], v[1]);
2995           u[2] = _mm_sub_epi32(u[2], v[2]);
2996           u[3] = _mm_sub_epi32(u[3], v[3]);
2997           u[4] = _mm_sub_epi32(u[4], v[4]);
2998           u[5] = _mm_sub_epi32(u[5], v[5]);
2999           u[6] = _mm_sub_epi32(u[6], v[6]);
3000           u[7] = _mm_sub_epi32(u[7], v[7]);
3001           u[8] = _mm_sub_epi32(u[8], v[8]);
3002           u[9] = _mm_sub_epi32(u[9], v[9]);
3003           u[10] = _mm_sub_epi32(u[10], v[10]);
3004           u[11] = _mm_sub_epi32(u[11], v[11]);
3005           u[12] = _mm_sub_epi32(u[12], v[12]);
3006           u[13] = _mm_sub_epi32(u[13], v[13]);
3007           u[14] = _mm_sub_epi32(u[14], v[14]);
3008           u[15] = _mm_sub_epi32(u[15], v[15]);
3009 
3010           v[0] = _mm_add_epi32(u[0], K32One);
3011           v[1] = _mm_add_epi32(u[1], K32One);
3012           v[2] = _mm_add_epi32(u[2], K32One);
3013           v[3] = _mm_add_epi32(u[3], K32One);
3014           v[4] = _mm_add_epi32(u[4], K32One);
3015           v[5] = _mm_add_epi32(u[5], K32One);
3016           v[6] = _mm_add_epi32(u[6], K32One);
3017           v[7] = _mm_add_epi32(u[7], K32One);
3018           v[8] = _mm_add_epi32(u[8], K32One);
3019           v[9] = _mm_add_epi32(u[9], K32One);
3020           v[10] = _mm_add_epi32(u[10], K32One);
3021           v[11] = _mm_add_epi32(u[11], K32One);
3022           v[12] = _mm_add_epi32(u[12], K32One);
3023           v[13] = _mm_add_epi32(u[13], K32One);
3024           v[14] = _mm_add_epi32(u[14], K32One);
3025           v[15] = _mm_add_epi32(u[15], K32One);
3026 
3027           u[0] = _mm_srai_epi32(v[0], 2);
3028           u[1] = _mm_srai_epi32(v[1], 2);
3029           u[2] = _mm_srai_epi32(v[2], 2);
3030           u[3] = _mm_srai_epi32(v[3], 2);
3031           u[4] = _mm_srai_epi32(v[4], 2);
3032           u[5] = _mm_srai_epi32(v[5], 2);
3033           u[6] = _mm_srai_epi32(v[6], 2);
3034           u[7] = _mm_srai_epi32(v[7], 2);
3035           u[8] = _mm_srai_epi32(v[8], 2);
3036           u[9] = _mm_srai_epi32(v[9], 2);
3037           u[10] = _mm_srai_epi32(v[10], 2);
3038           u[11] = _mm_srai_epi32(v[11], 2);
3039           u[12] = _mm_srai_epi32(v[12], 2);
3040           u[13] = _mm_srai_epi32(v[13], 2);
3041           u[14] = _mm_srai_epi32(v[14], 2);
3042           u[15] = _mm_srai_epi32(v[15], 2);
3043 
3044           out[5] = _mm_packs_epi32(u[0], u[1]);
3045           out[21] = _mm_packs_epi32(u[2], u[3]);
3046           out[13] = _mm_packs_epi32(u[4], u[5]);
3047           out[29] = _mm_packs_epi32(u[6], u[7]);
3048           out[3] = _mm_packs_epi32(u[8], u[9]);
3049           out[19] = _mm_packs_epi32(u[10], u[11]);
3050           out[11] = _mm_packs_epi32(u[12], u[13]);
3051           out[27] = _mm_packs_epi32(u[14], u[15]);
3052 #if DCT_HIGH_BIT_DEPTH
3053           overflow =
3054               check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
3055                                       &out[3], &out[19], &out[11], &out[27]);
3056           if (overflow) {
3057             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
3058             return;
3059           }
3060 #endif  // DCT_HIGH_BIT_DEPTH
3061         }
3062       }
3063 #endif  // FDCT32x32_HIGH_PRECISION
3064       // Transpose the results, do it as four 8x8 transposes.
3065       {
3066         int transpose_block;
3067         int16_t *output0 = &intermediate[column_start * 32];
3068         tran_low_t *output1 = &output_org[column_start * 32];
3069         for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
3070           __m128i *this_out = &out[8 * transpose_block];
3071           // 00 01 02 03 04 05 06 07
3072           // 10 11 12 13 14 15 16 17
3073           // 20 21 22 23 24 25 26 27
3074           // 30 31 32 33 34 35 36 37
3075           // 40 41 42 43 44 45 46 47
3076           // 50 51 52 53 54 55 56 57
3077           // 60 61 62 63 64 65 66 67
3078           // 70 71 72 73 74 75 76 77
3079           const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
3080           const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
3081           const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
3082           const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
3083           const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
3084           const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
3085           const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
3086           const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
3087           // 00 10 01 11 02 12 03 13
3088           // 20 30 21 31 22 32 23 33
3089           // 04 14 05 15 06 16 07 17
3090           // 24 34 25 35 26 36 27 37
3091           // 40 50 41 51 42 52 43 53
3092           // 60 70 61 71 62 72 63 73
3093           // 54 54 55 55 56 56 57 57
3094           // 64 74 65 75 66 76 67 77
3095           const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
3096           const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
3097           const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
3098           const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
3099           const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
3100           const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
3101           const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
3102           const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
3103           // 00 10 20 30 01 11 21 31
3104           // 40 50 60 70 41 51 61 71
3105           // 02 12 22 32 03 13 23 33
3106           // 42 52 62 72 43 53 63 73
3107           // 04 14 24 34 05 15 21 36
3108           // 44 54 64 74 45 55 61 76
3109           // 06 16 26 36 07 17 27 37
3110           // 46 56 66 76 47 57 67 77
3111           __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
3112           __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
3113           __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
3114           __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
3115           __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
3116           __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
3117           __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
3118           __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
3119           // 00 10 20 30 40 50 60 70
3120           // 01 11 21 31 41 51 61 71
3121           // 02 12 22 32 42 52 62 72
3122           // 03 13 23 33 43 53 63 73
3123           // 04 14 24 34 44 54 64 74
3124           // 05 15 25 35 45 55 65 75
3125           // 06 16 26 36 46 56 66 76
3126           // 07 17 27 37 47 57 67 77
3127           if (0 == pass) {
3128             // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
3129             // TODO(cd): see quality impact of only doing
3130             //           output[j] = (output[j] + 1) >> 2;
3131             //           which would remove the code between here ...
3132             __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
3133             __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
3134             __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
3135             __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
3136             __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
3137             __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
3138             __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
3139             __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
3140             tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
3141             tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
3142             tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
3143             tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
3144             tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
3145             tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
3146             tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
3147             tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
3148             //           ... and here.
3149             //           PS: also change code in av1/encoder/av1_dct.c
3150             tr2_0 = _mm_add_epi16(tr2_0, kOne);
3151             tr2_1 = _mm_add_epi16(tr2_1, kOne);
3152             tr2_2 = _mm_add_epi16(tr2_2, kOne);
3153             tr2_3 = _mm_add_epi16(tr2_3, kOne);
3154             tr2_4 = _mm_add_epi16(tr2_4, kOne);
3155             tr2_5 = _mm_add_epi16(tr2_5, kOne);
3156             tr2_6 = _mm_add_epi16(tr2_6, kOne);
3157             tr2_7 = _mm_add_epi16(tr2_7, kOne);
3158             tr2_0 = _mm_srai_epi16(tr2_0, 2);
3159             tr2_1 = _mm_srai_epi16(tr2_1, 2);
3160             tr2_2 = _mm_srai_epi16(tr2_2, 2);
3161             tr2_3 = _mm_srai_epi16(tr2_3, 2);
3162             tr2_4 = _mm_srai_epi16(tr2_4, 2);
3163             tr2_5 = _mm_srai_epi16(tr2_5, 2);
3164             tr2_6 = _mm_srai_epi16(tr2_6, 2);
3165             tr2_7 = _mm_srai_epi16(tr2_7, 2);
3166           }
          // Note: even though all these stores are aligned, using the aligned
          //       intrinsic makes the code slightly slower.
3169           if (pass == 0) {
3170             _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
3171             _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
3172             _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
3173             _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
3174             _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
3175             _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
3176             _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
3177             _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
3178             // Process next 8x8
3179             output0 += 8;
3180           } else {
3181             storeu_output(&tr2_0, (output1 + 0 * 32));
3182             storeu_output(&tr2_1, (output1 + 1 * 32));
3183             storeu_output(&tr2_2, (output1 + 2 * 32));
3184             storeu_output(&tr2_3, (output1 + 3 * 32));
3185             storeu_output(&tr2_4, (output1 + 4 * 32));
3186             storeu_output(&tr2_5, (output1 + 5 * 32));
3187             storeu_output(&tr2_6, (output1 + 6 * 32));
3188             storeu_output(&tr2_7, (output1 + 7 * 32));
3189             // Process next 8x8
3190             output1 += 8;
3191           }
3192         }
3193       }
3194     }
3195   }
3196 }  // NOLINT
3197 
// Undefine the helper macros that were set up near the top of this file
// (saturating/plain 16-bit add and subtract, and the names of the C fallback
// routines used when an overflow is detected) so they do not leak past this
// point.
// NOTE(review): presumably this also lets the file be included again with a
// different DCT_HIGH_BIT_DEPTH / FDCT32x32_HIGH_PRECISION configuration --
// confirm against the including translation unit.
#undef ADD_EPI16
#undef SUB_EPI16
#undef HIGH_FDCT32x32_2D_C
#undef HIGH_FDCT32x32_2D_ROWS_C
3202