1 /*
2  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/vp9_idct.h"
13 #include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
14 #include "vpx_dsp/x86/inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/transpose_sse2.h"
16 #include "vpx_dsp/x86/txfm_common_sse2.h"
17 
highbd_iadst_half_butterfly_sse4_1(const __m128i in,const int c,__m128i * const s)18 static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
19                                                       const int c,
20                                                       __m128i *const s) {
21   const __m128i pair_c = pair_set_epi32(4 * c, 0);
22   __m128i x[2];
23 
24   extend_64bit(in, x);
25   s[0] = _mm_mul_epi32(pair_c, x[0]);
26   s[1] = _mm_mul_epi32(pair_c, x[1]);
27 }
28 
highbd_iadst_butterfly_sse4_1(const __m128i in0,const __m128i in1,const int c0,const int c1,__m128i * const s0,__m128i * const s1)29 static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
30                                                  const __m128i in1,
31                                                  const int c0, const int c1,
32                                                  __m128i *const s0,
33                                                  __m128i *const s1) {
34   const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
35   const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
36   __m128i t00[2], t01[2], t10[2], t11[2];
37   __m128i x0[2], x1[2];
38 
39   extend_64bit(in0, x0);
40   extend_64bit(in1, x1);
41   t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
42   t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
43   t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
44   t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
45   t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
46   t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
47   t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
48   t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
49 
50   s0[0] = _mm_add_epi64(t00[0], t11[0]);
51   s0[1] = _mm_add_epi64(t00[1], t11[1]);
52   s1[0] = _mm_sub_epi64(t10[0], t01[0]);
53   s1[1] = _mm_sub_epi64(t10[1], t01[1]);
54 }
55 
highbd_iadst16_4col_sse4_1(__m128i * const io)56 static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
57   __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
58       s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
59   __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
60       x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
61 
62   // stage 1
63   highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
64   highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
65   highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
66   highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
67   highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
68   highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
69                                 s11);
70   highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
71                                 s13);
72   highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
73                                 s15);
74 
75   x0[0] = _mm_add_epi64(s0[0], s8[0]);
76   x0[1] = _mm_add_epi64(s0[1], s8[1]);
77   x1[0] = _mm_add_epi64(s1[0], s9[0]);
78   x1[1] = _mm_add_epi64(s1[1], s9[1]);
79   x2[0] = _mm_add_epi64(s2[0], s10[0]);
80   x2[1] = _mm_add_epi64(s2[1], s10[1]);
81   x3[0] = _mm_add_epi64(s3[0], s11[0]);
82   x3[1] = _mm_add_epi64(s3[1], s11[1]);
83   x4[0] = _mm_add_epi64(s4[0], s12[0]);
84   x4[1] = _mm_add_epi64(s4[1], s12[1]);
85   x5[0] = _mm_add_epi64(s5[0], s13[0]);
86   x5[1] = _mm_add_epi64(s5[1], s13[1]);
87   x6[0] = _mm_add_epi64(s6[0], s14[0]);
88   x6[1] = _mm_add_epi64(s6[1], s14[1]);
89   x7[0] = _mm_add_epi64(s7[0], s15[0]);
90   x7[1] = _mm_add_epi64(s7[1], s15[1]);
91   x8[0] = _mm_sub_epi64(s0[0], s8[0]);
92   x8[1] = _mm_sub_epi64(s0[1], s8[1]);
93   x9[0] = _mm_sub_epi64(s1[0], s9[0]);
94   x9[1] = _mm_sub_epi64(s1[1], s9[1]);
95   x10[0] = _mm_sub_epi64(s2[0], s10[0]);
96   x10[1] = _mm_sub_epi64(s2[1], s10[1]);
97   x11[0] = _mm_sub_epi64(s3[0], s11[0]);
98   x11[1] = _mm_sub_epi64(s3[1], s11[1]);
99   x12[0] = _mm_sub_epi64(s4[0], s12[0]);
100   x12[1] = _mm_sub_epi64(s4[1], s12[1]);
101   x13[0] = _mm_sub_epi64(s5[0], s13[0]);
102   x13[1] = _mm_sub_epi64(s5[1], s13[1]);
103   x14[0] = _mm_sub_epi64(s6[0], s14[0]);
104   x14[1] = _mm_sub_epi64(s6[1], s14[1]);
105   x15[0] = _mm_sub_epi64(s7[0], s15[0]);
106   x15[1] = _mm_sub_epi64(s7[1], s15[1]);
107 
108   x0[0] = dct_const_round_shift_64bit(x0[0]);
109   x0[1] = dct_const_round_shift_64bit(x0[1]);
110   x1[0] = dct_const_round_shift_64bit(x1[0]);
111   x1[1] = dct_const_round_shift_64bit(x1[1]);
112   x2[0] = dct_const_round_shift_64bit(x2[0]);
113   x2[1] = dct_const_round_shift_64bit(x2[1]);
114   x3[0] = dct_const_round_shift_64bit(x3[0]);
115   x3[1] = dct_const_round_shift_64bit(x3[1]);
116   x4[0] = dct_const_round_shift_64bit(x4[0]);
117   x4[1] = dct_const_round_shift_64bit(x4[1]);
118   x5[0] = dct_const_round_shift_64bit(x5[0]);
119   x5[1] = dct_const_round_shift_64bit(x5[1]);
120   x6[0] = dct_const_round_shift_64bit(x6[0]);
121   x6[1] = dct_const_round_shift_64bit(x6[1]);
122   x7[0] = dct_const_round_shift_64bit(x7[0]);
123   x7[1] = dct_const_round_shift_64bit(x7[1]);
124   x8[0] = dct_const_round_shift_64bit(x8[0]);
125   x8[1] = dct_const_round_shift_64bit(x8[1]);
126   x9[0] = dct_const_round_shift_64bit(x9[0]);
127   x9[1] = dct_const_round_shift_64bit(x9[1]);
128   x10[0] = dct_const_round_shift_64bit(x10[0]);
129   x10[1] = dct_const_round_shift_64bit(x10[1]);
130   x11[0] = dct_const_round_shift_64bit(x11[0]);
131   x11[1] = dct_const_round_shift_64bit(x11[1]);
132   x12[0] = dct_const_round_shift_64bit(x12[0]);
133   x12[1] = dct_const_round_shift_64bit(x12[1]);
134   x13[0] = dct_const_round_shift_64bit(x13[0]);
135   x13[1] = dct_const_round_shift_64bit(x13[1]);
136   x14[0] = dct_const_round_shift_64bit(x14[0]);
137   x14[1] = dct_const_round_shift_64bit(x14[1]);
138   x15[0] = dct_const_round_shift_64bit(x15[0]);
139   x15[1] = dct_const_round_shift_64bit(x15[1]);
140   x0[0] = pack_4(x0[0], x0[1]);
141   x1[0] = pack_4(x1[0], x1[1]);
142   x2[0] = pack_4(x2[0], x2[1]);
143   x3[0] = pack_4(x3[0], x3[1]);
144   x4[0] = pack_4(x4[0], x4[1]);
145   x5[0] = pack_4(x5[0], x5[1]);
146   x6[0] = pack_4(x6[0], x6[1]);
147   x7[0] = pack_4(x7[0], x7[1]);
148   x8[0] = pack_4(x8[0], x8[1]);
149   x9[0] = pack_4(x9[0], x9[1]);
150   x10[0] = pack_4(x10[0], x10[1]);
151   x11[0] = pack_4(x11[0], x11[1]);
152   x12[0] = pack_4(x12[0], x12[1]);
153   x13[0] = pack_4(x13[0], x13[1]);
154   x14[0] = pack_4(x14[0], x14[1]);
155   x15[0] = pack_4(x15[0], x15[1]);
156 
157   // stage 2
158   s0[0] = x0[0];
159   s1[0] = x1[0];
160   s2[0] = x2[0];
161   s3[0] = x3[0];
162   s4[0] = x4[0];
163   s5[0] = x5[0];
164   s6[0] = x6[0];
165   s7[0] = x7[0];
166   x0[0] = _mm_add_epi32(s0[0], s4[0]);
167   x1[0] = _mm_add_epi32(s1[0], s5[0]);
168   x2[0] = _mm_add_epi32(s2[0], s6[0]);
169   x3[0] = _mm_add_epi32(s3[0], s7[0]);
170   x4[0] = _mm_sub_epi32(s0[0], s4[0]);
171   x5[0] = _mm_sub_epi32(s1[0], s5[0]);
172   x6[0] = _mm_sub_epi32(s2[0], s6[0]);
173   x7[0] = _mm_sub_epi32(s3[0], s7[0]);
174 
175   highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
176   highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
177                                 s11);
178   highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
179                                 s12);
180   highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
181                                 s14);
182 
183   x8[0] = _mm_add_epi64(s8[0], s12[0]);
184   x8[1] = _mm_add_epi64(s8[1], s12[1]);
185   x9[0] = _mm_add_epi64(s9[0], s13[0]);
186   x9[1] = _mm_add_epi64(s9[1], s13[1]);
187   x10[0] = _mm_add_epi64(s10[0], s14[0]);
188   x10[1] = _mm_add_epi64(s10[1], s14[1]);
189   x11[0] = _mm_add_epi64(s11[0], s15[0]);
190   x11[1] = _mm_add_epi64(s11[1], s15[1]);
191   x12[0] = _mm_sub_epi64(s8[0], s12[0]);
192   x12[1] = _mm_sub_epi64(s8[1], s12[1]);
193   x13[0] = _mm_sub_epi64(s9[0], s13[0]);
194   x13[1] = _mm_sub_epi64(s9[1], s13[1]);
195   x14[0] = _mm_sub_epi64(s10[0], s14[0]);
196   x14[1] = _mm_sub_epi64(s10[1], s14[1]);
197   x15[0] = _mm_sub_epi64(s11[0], s15[0]);
198   x15[1] = _mm_sub_epi64(s11[1], s15[1]);
199   x8[0] = dct_const_round_shift_64bit(x8[0]);
200   x8[1] = dct_const_round_shift_64bit(x8[1]);
201   x9[0] = dct_const_round_shift_64bit(x9[0]);
202   x9[1] = dct_const_round_shift_64bit(x9[1]);
203   x10[0] = dct_const_round_shift_64bit(x10[0]);
204   x10[1] = dct_const_round_shift_64bit(x10[1]);
205   x11[0] = dct_const_round_shift_64bit(x11[0]);
206   x11[1] = dct_const_round_shift_64bit(x11[1]);
207   x12[0] = dct_const_round_shift_64bit(x12[0]);
208   x12[1] = dct_const_round_shift_64bit(x12[1]);
209   x13[0] = dct_const_round_shift_64bit(x13[0]);
210   x13[1] = dct_const_round_shift_64bit(x13[1]);
211   x14[0] = dct_const_round_shift_64bit(x14[0]);
212   x14[1] = dct_const_round_shift_64bit(x14[1]);
213   x15[0] = dct_const_round_shift_64bit(x15[0]);
214   x15[1] = dct_const_round_shift_64bit(x15[1]);
215   x8[0] = pack_4(x8[0], x8[1]);
216   x9[0] = pack_4(x9[0], x9[1]);
217   x10[0] = pack_4(x10[0], x10[1]);
218   x11[0] = pack_4(x11[0], x11[1]);
219   x12[0] = pack_4(x12[0], x12[1]);
220   x13[0] = pack_4(x13[0], x13[1]);
221   x14[0] = pack_4(x14[0], x14[1]);
222   x15[0] = pack_4(x15[0], x15[1]);
223 
224   // stage 3
225   s0[0] = x0[0];
226   s1[0] = x1[0];
227   s2[0] = x2[0];
228   s3[0] = x3[0];
229   highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
230   highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
231   s8[0] = x8[0];
232   s9[0] = x9[0];
233   s10[0] = x10[0];
234   s11[0] = x11[0];
235   highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
236                                 s13);
237   highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
238                                 s14);
239 
240   x0[0] = _mm_add_epi32(s0[0], s2[0]);
241   x1[0] = _mm_add_epi32(s1[0], s3[0]);
242   x2[0] = _mm_sub_epi32(s0[0], s2[0]);
243   x3[0] = _mm_sub_epi32(s1[0], s3[0]);
244   x4[0] = _mm_add_epi64(s4[0], s6[0]);
245   x4[1] = _mm_add_epi64(s4[1], s6[1]);
246   x5[0] = _mm_add_epi64(s5[0], s7[0]);
247   x5[1] = _mm_add_epi64(s5[1], s7[1]);
248   x6[0] = _mm_sub_epi64(s4[0], s6[0]);
249   x6[1] = _mm_sub_epi64(s4[1], s6[1]);
250   x7[0] = _mm_sub_epi64(s5[0], s7[0]);
251   x7[1] = _mm_sub_epi64(s5[1], s7[1]);
252   x4[0] = dct_const_round_shift_64bit(x4[0]);
253   x4[1] = dct_const_round_shift_64bit(x4[1]);
254   x5[0] = dct_const_round_shift_64bit(x5[0]);
255   x5[1] = dct_const_round_shift_64bit(x5[1]);
256   x6[0] = dct_const_round_shift_64bit(x6[0]);
257   x6[1] = dct_const_round_shift_64bit(x6[1]);
258   x7[0] = dct_const_round_shift_64bit(x7[0]);
259   x7[1] = dct_const_round_shift_64bit(x7[1]);
260   x4[0] = pack_4(x4[0], x4[1]);
261   x5[0] = pack_4(x5[0], x5[1]);
262   x6[0] = pack_4(x6[0], x6[1]);
263   x7[0] = pack_4(x7[0], x7[1]);
264   x8[0] = _mm_add_epi32(s8[0], s10[0]);
265   x9[0] = _mm_add_epi32(s9[0], s11[0]);
266   x10[0] = _mm_sub_epi32(s8[0], s10[0]);
267   x11[0] = _mm_sub_epi32(s9[0], s11[0]);
268   x12[0] = _mm_add_epi64(s12[0], s14[0]);
269   x12[1] = _mm_add_epi64(s12[1], s14[1]);
270   x13[0] = _mm_add_epi64(s13[0], s15[0]);
271   x13[1] = _mm_add_epi64(s13[1], s15[1]);
272   x14[0] = _mm_sub_epi64(s12[0], s14[0]);
273   x14[1] = _mm_sub_epi64(s12[1], s14[1]);
274   x15[0] = _mm_sub_epi64(s13[0], s15[0]);
275   x15[1] = _mm_sub_epi64(s13[1], s15[1]);
276   x12[0] = dct_const_round_shift_64bit(x12[0]);
277   x12[1] = dct_const_round_shift_64bit(x12[1]);
278   x13[0] = dct_const_round_shift_64bit(x13[0]);
279   x13[1] = dct_const_round_shift_64bit(x13[1]);
280   x14[0] = dct_const_round_shift_64bit(x14[0]);
281   x14[1] = dct_const_round_shift_64bit(x14[1]);
282   x15[0] = dct_const_round_shift_64bit(x15[0]);
283   x15[1] = dct_const_round_shift_64bit(x15[1]);
284   x12[0] = pack_4(x12[0], x12[1]);
285   x13[0] = pack_4(x13[0], x13[1]);
286   x14[0] = pack_4(x14[0], x14[1]);
287   x15[0] = pack_4(x15[0], x15[1]);
288 
289   // stage 4
290   s2[0] = _mm_add_epi32(x2[0], x3[0]);
291   s3[0] = _mm_sub_epi32(x2[0], x3[0]);
292   s6[0] = _mm_add_epi32(x7[0], x6[0]);
293   s7[0] = _mm_sub_epi32(x7[0], x6[0]);
294   s10[0] = _mm_add_epi32(x11[0], x10[0]);
295   s11[0] = _mm_sub_epi32(x11[0], x10[0]);
296   s14[0] = _mm_add_epi32(x14[0], x15[0]);
297   s15[0] = _mm_sub_epi32(x14[0], x15[0]);
298   highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
299   highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
300   highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
301   highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
302   highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
303   highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
304   highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
305   highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
306 
307   x2[0] = dct_const_round_shift_64bit(s2[0]);
308   x2[1] = dct_const_round_shift_64bit(s2[1]);
309   x3[0] = dct_const_round_shift_64bit(s3[0]);
310   x3[1] = dct_const_round_shift_64bit(s3[1]);
311   x6[0] = dct_const_round_shift_64bit(s6[0]);
312   x6[1] = dct_const_round_shift_64bit(s6[1]);
313   x7[0] = dct_const_round_shift_64bit(s7[0]);
314   x7[1] = dct_const_round_shift_64bit(s7[1]);
315   x10[0] = dct_const_round_shift_64bit(s10[0]);
316   x10[1] = dct_const_round_shift_64bit(s10[1]);
317   x11[0] = dct_const_round_shift_64bit(s11[0]);
318   x11[1] = dct_const_round_shift_64bit(s11[1]);
319   x14[0] = dct_const_round_shift_64bit(s14[0]);
320   x14[1] = dct_const_round_shift_64bit(s14[1]);
321   x15[0] = dct_const_round_shift_64bit(s15[0]);
322   x15[1] = dct_const_round_shift_64bit(s15[1]);
323   x2[0] = pack_4(x2[0], x2[1]);
324   x3[0] = pack_4(x3[0], x3[1]);
325   x6[0] = pack_4(x6[0], x6[1]);
326   x7[0] = pack_4(x7[0], x7[1]);
327   x10[0] = pack_4(x10[0], x10[1]);
328   x11[0] = pack_4(x11[0], x11[1]);
329   x14[0] = pack_4(x14[0], x14[1]);
330   x15[0] = pack_4(x15[0], x15[1]);
331 
332   io[0] = x0[0];
333   io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
334   io[2] = x12[0];
335   io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
336   io[4] = x6[0];
337   io[5] = x14[0];
338   io[6] = x10[0];
339   io[7] = x2[0];
340   io[8] = x3[0];
341   io[9] = x11[0];
342   io[10] = x15[0];
343   io[11] = x7[0];
344   io[12] = x5[0];
345   io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
346   io[14] = x9[0];
347   io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
348 }
349 
// 16x16 inverse hybrid transform whose result is written to |dest| through
// highbd_write_buffer_8() / highbd_write_buffer_4().
//   input:   256 coefficients, read with a stride of 16.
//   dest:    destination buffer; row j of the block is at dest + j * stride.
//   stride:  distance between destination rows, in uint16_t elements.
//   tx_type: selects DCT or ADST per pass. The first pass uses the DCT for
//            DCT_DCT and ADST_DCT, the second pass for DCT_DCT and DCT_ADST;
//            every other combination uses the ADST for that pass.
//   bd:      bit depth. bd == 8 takes the path built on the 8-column SSE2
//            kernels with packed loads; any other value takes the 4-column
//            SSE4.1 path on 32-bit data.
void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                        int stride, int tx_type, int bd) {
  const int first_pass_dct = (tx_type == DCT_DCT) || (tx_type == ADST_DCT);
  const int second_pass_dct = (tx_type == DCT_DCT) || (tx_type == DCT_ADST);
  __m128i out[16];
  int blk, col, row;

  if (bd == 8) {
    // half[0] and half[1] each hold the first-pass result for 128 input
    // coefficients.
    __m128i half[2][16];

    for (blk = 0; blk < 2; ++blk) {
      __m128i *const in = half[blk];

      highbd_load_pack_transpose_32bit_8x8(input, 16, in);
      highbd_load_pack_transpose_32bit_8x8(input + 8, 16, in + 8);
      if (first_pass_dct) {
        idct16_8col(in, in);
      } else {
        vpx_iadst16_8col_sse2(in);
      }
      input += 128;
    }

    // Second pass: 8 destination columns per iteration.
    for (col = 0; col < 16; col += 8) {
      transpose_16bit_8x8(half[0] + col, out);
      transpose_16bit_8x8(half[1] + col, out + 8);
      if (second_pass_dct) {
        idct16_8col(out, out);
      } else {
        vpx_iadst16_8col_sse2(out);
      }

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dest + row * stride, out[row], bd);
      }
      dest += 8;
    }
  } else {
    // all[k] holds the first-pass result for the k-th group of 64 input
    // coefficients.
    __m128i all[4][16];

    for (blk = 0; blk < 4; ++blk) {
      __m128i *const in = all[blk];

      highbd_load_transpose_32bit_8x4(input, 16, in);
      highbd_load_transpose_32bit_8x4(input + 8, 16, in + 8);
      if (first_pass_dct) {
        vpx_highbd_idct16_4col_sse4_1(in);
      } else {
        highbd_iadst16_4col_sse4_1(in);
      }
      input += 4 * 16;
    }

    // Second pass: 4 destination columns per iteration.
    for (col = 0; col < 16; col += 4) {
      transpose_32bit_4x4(all[0] + col, out + 0);
      transpose_32bit_4x4(all[1] + col, out + 4);
      transpose_32bit_4x4(all[2] + col, out + 8);
      transpose_32bit_4x4(all[3] + col, out + 12);
      if (second_pass_dct) {
        vpx_highbd_idct16_4col_sse4_1(out);
      } else {
        highbd_iadst16_4col_sse4_1(out);
      }

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dest + row * stride, out[row], bd);
      }
      dest += 4;
    }
  }
}
420