/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/arm/neon/vp9_iht_neon.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"

// Use macros to make sure argument lane is passed in as a constant integer.
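// The vmull/vmlal/vmlsl lane intrinsics require an immediate (compile-time
// constant) lane index, so these helpers cannot be plain functions taking the
// lane as a runtime argument. Each macro widens one int32x4x2_t (8 lanes of
// 32-bit data) into an array of two int64x2x2_t holding the 8 64-bit products.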

#define vmull_lane_s32_dual(in, c, lane, out)                          \
  do {                                                                 \
    out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane);  \
    out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane);  \
    out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \
    out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \
  } while (0)

#define vmlal_lane_s32_dual(in, c, lane, out)                             \
  do {                                                                    \
    out[0].val[0] =                                                       \
        vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane);  \
    out[0].val[1] =                                                       \
        vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane);  \
    out[1].val[0] =                                                       \
        vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \
    out[1].val[1] =                                                       \
        vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \
  } while (0)

#define vmlsl_lane_s32_dual(in, c, lane, out)                             \
  do {                                                                    \
    out[0].val[0] =                                                       \
        vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane);  \
    out[0].val[1] =                                                       \
        vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane);  \
    out[1].val[0] =                                                       \
        vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \
    out[1].val[1] =                                                       \
        vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \
  } while (0)

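// Narrow the 8 64-bit accumulators back to 32 bits, rounding by
// DCT_CONST_BITS.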
static INLINE int32x4x2_t
highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) {
  int32x4x2_t out;
  out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS),
                            vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS));
  out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS),
                            vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS));
  return out;
}

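// iadst butterflies. The half butterfly computes
//   out = round_shift(in * c[lane]),
// and the full butterfly computes
//   s0 = in0 * c[lane0] + in1 * c[lane1]
//   s1 = in0 * c[lane1] - in1 * c[lane0]
// leaving the results in 64-bit precision for later round shifting.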
#define highbd_iadst_half_butterfly(in, c, lane, out) \
  do {                                                \
    int64x2x2_t t[2];                                 \
    vmull_lane_s32_dual(in, c, lane, t);              \
    out = highbd_dct_const_round_shift_low_8(t);      \
  } while (0)

#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \
  do {                                                            \
    vmull_lane_s32_dual(in0, c, lane0, s0);                       \
    vmull_lane_s32_dual(in0, c, lane1, s1);                       \
    vmlal_lane_s32_dual(in1, c, lane1, s0);                       \
    vmlsl_lane_s32_dual(in1, c, lane0, s1);                       \
  } while (0)

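// Element-wise helpers operating on pairs of vectors (8 lanes at a time).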
static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0,
                                         const int32x4x2_t in1) {
  int32x4x2_t out;
  out.val[0] = vaddq_s32(in0.val[0], in1.val[0]);
  out.val[1] = vaddq_s32(in0.val[1], in1.val[1]);
  return out;
}

static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0,
                                         const int64x2x2_t in1) {
  int64x2x2_t out;
  out.val[0] = vaddq_s64(in0.val[0], in1.val[0]);
  out.val[1] = vaddq_s64(in0.val[1], in1.val[1]);
  return out;
}

static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0,
                                         const int32x4x2_t in1) {
  int32x4x2_t out;
  out.val[0] = vsubq_s32(in0.val[0], in1.val[0]);
  out.val[1] = vsubq_s32(in0.val[1], in1.val[1]);
  return out;
}

static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0,
                                         const int64x2x2_t in1) {
  int64x2x2_t out;
  out.val[0] = vsubq_s64(in0.val[0], in1.val[0]);
  out.val[1] = vsubq_s64(in0.val[1], in1.val[1]);
  return out;
}

static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0,
                                            const int32x2x2_t in1) {
  int32x4x2_t out;
  out.val[0] = vcombine_s32(in0.val[0], in1.val[0]);
  out.val[1] = vcombine_s32(in0.val[1], in1.val[1]);
  return out;
}

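// Add (or subtract) two 64-bit dual pairs and round-shift the results down to
// 32 bits in a single step.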
static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8(
    const int64x2x2_t *const in0, const int64x2x2_t *const in1) {
  const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]);
  const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]);
  int32x2x2_t out_lo, out_hi;

  out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS);
  out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS);
  out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS);
  out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS);
  return vcombine_s32_dual(out_lo, out_hi);
}

static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8(
    const int64x2x2_t *const in0, const int64x2x2_t *const in1) {
  const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]);
  const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]);
  int32x2x2_t out_lo, out_hi;

  out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS);
  out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS);
  out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS);
  out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS);
  return vcombine_s32_dual(out_lo, out_hi);
}

static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) {
  int32x4x2_t out;
  out.val[0] = vnegq_s32(in.val[0]);
  out.val[1] = vnegq_s32(in.val[1]);
  return out;
}

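// 1D high-bitdepth iadst16 over an 8-column half of a 16x16 block. When
// |output| is non-NULL the intermediate result is stored for the second pass;
// otherwise the result is added to |dest| using the bit depth |bd|.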
static void highbd_iadst16_neon(const int32_t *input, int32_t *output,
                                uint16_t *dest, const int stride,
                                const int bd) {
  const int32x4_t c_1_31_5_27 =
      create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64);
  const int32x4_t c_9_23_13_19 =
      create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64);
  const int32x4_t c_17_15_21_11 =
      create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64);
  const int32x4_t c_25_7_29_3 =
      create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64);
  const int32x4_t c_4_28_20_12 =
      create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64);
  const int32x4_t c_16_n16_8_24 =
      create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64);
  int32x4x2_t in[16], out[16];
  int32x4x2_t x[16], t[12];
  int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
  int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];

  // Load input (16x8)
  in[0].val[0] = vld1q_s32(input);
  in[0].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[8].val[0] = vld1q_s32(input);
  in[8].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[1].val[0] = vld1q_s32(input);
  in[1].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[9].val[0] = vld1q_s32(input);
  in[9].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[2].val[0] = vld1q_s32(input);
  in[2].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[10].val[0] = vld1q_s32(input);
  in[10].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[3].val[0] = vld1q_s32(input);
  in[3].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[11].val[0] = vld1q_s32(input);
  in[11].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[4].val[0] = vld1q_s32(input);
  in[4].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[12].val[0] = vld1q_s32(input);
  in[12].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[5].val[0] = vld1q_s32(input);
  in[5].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[13].val[0] = vld1q_s32(input);
  in[13].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[6].val[0] = vld1q_s32(input);
  in[6].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[14].val[0] = vld1q_s32(input);
  in[14].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[7].val[0] = vld1q_s32(input);
  in[7].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[15].val[0] = vld1q_s32(input);
  in[15].val[1] = vld1q_s32(input + 4);

  // Transpose
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

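  // Reorder the transposed inputs into iadst16 processing order.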
  x[0] = in[15];
  x[1] = in[0];
  x[2] = in[13];
  x[3] = in[2];
  x[4] = in[11];
  x[5] = in[4];
  x[6] = in[9];
  x[7] = in[6];
  x[8] = in[7];
  x[9] = in[8];
  x[10] = in[5];
  x[11] = in[10];
  x[12] = in[3];
  x[13] = in[12];
  x[14] = in[1];
  x[15] = in[14];

  // stage 1
  highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1);
  highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3);
  highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5);
  highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7);
  highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9);
  highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10,
                         s11);
  highbd_iadst_butterfly(x[12], x[13], vget_low_s32(c_25_7_29_3), 0, 1, s12,
                         s13);
  highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14,
                         s15);

  x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8);
  x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9);
  x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10);
  x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11);
  x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12);
  x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13);
  x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14);
  x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15);
  x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8);
  x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9);
  x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10);
  x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11);
  x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12);
  x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13);
  x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14);
  x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15);

  // stage 2
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9);
  highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10,
                         s11);
  highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13,
                         s12);
  highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15,
                         s14);

  x[0] = vaddq_s32_dual(t[0], t[4]);
  x[1] = vaddq_s32_dual(t[1], t[5]);
  x[2] = vaddq_s32_dual(t[2], t[6]);
  x[3] = vaddq_s32_dual(t[3], t[7]);
  x[4] = vsubq_s32_dual(t[0], t[4]);
  x[5] = vsubq_s32_dual(t[1], t[5]);
  x[6] = vsubq_s32_dual(t[2], t[6]);
  x[7] = vsubq_s32_dual(t[3], t[7]);
  x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12);
  x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13);
  x[10] = highbd_add_dct_const_round_shift_low_8(s10, s14);
  x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15);
  x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12);
  x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13);
  x[14] = highbd_sub_dct_const_round_shift_low_8(s10, s14);
  x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15);

  // stage 3
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4,
                         s5);
  highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7,
                         s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  highbd_iadst_butterfly(x[12], x[13], vget_high_s32(c_16_n16_8_24), 0, 1, s12,
                         s13);
  highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15,
                         s14);

  x[0] = vaddq_s32_dual(t[0], t[2]);
  x[1] = vaddq_s32_dual(t[1], t[3]);
  x[2] = vsubq_s32_dual(t[0], t[2]);
  x[3] = vsubq_s32_dual(t[1], t[3]);
  x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6);
  x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7);
  x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6);
  x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7);
  x[8] = vaddq_s32_dual(t[8], t[10]);
  x[9] = vaddq_s32_dual(t[9], t[11]);
  x[10] = vsubq_s32_dual(t[8], t[10]);
  x[11] = vsubq_s32_dual(t[9], t[11]);
  x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14);
  x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15);
  x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14);
  x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15);

  // stage 4
  {
    const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]);
    const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]);
    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]);
    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]);
  }
  {
    const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]);
    const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]);
    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]);
    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]);
  }
  {
    const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]);
    const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]);
    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]);
    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]);
  }
  {
    const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]);
    const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]);
    highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]);
    highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]);
  }

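  // Final iadst16 output permutation, with sign flips on out[1], out[3],
  // out[13] and out[15].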
  out[0] = x[0];
  out[1] = vnegq_s32_dual(x[8]);
  out[2] = x[12];
  out[3] = vnegq_s32_dual(x[4]);
  out[4] = x[6];
  out[5] = x[14];
  out[6] = x[10];
  out[7] = x[2];
  out[8] = x[3];
  out[9] = x[11];
  out[10] = x[15];
  out[11] = x[7];
  out[12] = x[5];
  out[13] = vnegq_s32_dual(x[13]);
  out[14] = x[9];
  out[15] = vnegq_s32_dual(x[1]);

  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}

typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output,
                              uint16_t *dest, const int stride, const int bd);

typedef struct {
  highbd_iht_1d cols, rows;  // vertical and horizontal
} highbd_iht_2d;

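// 2D 16x16 inverse hybrid transform and add. Each 1D kernel handles half of
// the block, so both the row pass and the column pass are split into two
// calls. For bd == 8 the 16-bit (non-highbd) half1d kernels are reused.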
void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int tx_type, int bd) {
  if (bd == 8) {
    static const iht_2d IHT_16[] = {
      { vpx_idct16x16_256_add_half1d,
        vpx_idct16x16_256_add_half1d },  // DCT_DCT  = 0
      { vpx_iadst16x16_256_add_half1d,
        vpx_idct16x16_256_add_half1d },  // ADST_DCT = 1
      { vpx_idct16x16_256_add_half1d,
        vpx_iadst16x16_256_add_half1d },  // DCT_ADST = 2
      { vpx_iadst16x16_256_add_half1d,
        vpx_iadst16x16_256_add_half1d }  // ADST_ADST = 3
    };
    const iht_2d ht = IHT_16[tx_type];
    int16_t row_output[16 * 16];

    // pass 1
    ht.rows(input, row_output, dest, stride, 1);               // upper 8 rows
    ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1);  // lower 8 rows

    // pass 2
    ht.cols(row_output, NULL, dest, stride, 1);               // left 8 columns
    ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 1);  // right 8 columns
  } else {
    static const highbd_iht_2d IHT_16[] = {
      { vpx_highbd_idct16x16_256_add_half1d,
        vpx_highbd_idct16x16_256_add_half1d },  // DCT_DCT  = 0
      { highbd_iadst16_neon,
        vpx_highbd_idct16x16_256_add_half1d },  // ADST_DCT = 1
      { vpx_highbd_idct16x16_256_add_half1d,
        highbd_iadst16_neon },                      // DCT_ADST = 2
      { highbd_iadst16_neon, highbd_iadst16_neon }  // ADST_ADST = 3
    };
    const highbd_iht_2d ht = IHT_16[tx_type];
    int32_t row_output[16 * 16];

    // pass 1
    ht.rows(input, row_output, dest, stride, bd);               // upper 8 rows
    ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd);  // lower 8 rows

    // pass 2
    ht.cols(row_output, NULL, dest, stride, bd);  // left 8 columns
    ht.cols(row_output + 8 * 16, NULL, dest + 8, stride,
            bd);  // right 8 columns
  }
}