/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

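// Saturating-narrow two int32x4 vectors to int16 and pack them into a single
// int16x8: lanes 0-3 come from w0, lanes 4-7 from w1.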
#define custom_packs_s32(w0, w1) vcombine_s16(vqmovn_s32(w0), vqmovn_s32(w1))

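// Transpose the 4x4 block held in the low halves of in[0..3]. Only the low
// four lanes of each out[i] are meaningful; the high halves of out[1] and
// out[3] keep whatever data the caller left there.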
static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
                                       int16x8_t *const out) {
#if defined(__aarch64__)
  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
#else
  int16x4x2_t temp;
  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  int32x4x2_t a01 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  out[0] = vreinterpretq_s16_s32(a01.val[0]);
  out[1] = vextq_s16(vreinterpretq_s16_s32(a01.val[0]), out[1], 4);
  out[2] = vreinterpretq_s16_s32(a01.val[1]);
  out[3] = vextq_s16(vreinterpretq_s16_s32(a01.val[1]), out[3], 4);
}

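// Transpose a 4-wide, 8-high block: the low halves of in[0..7] become the
// full 8-lane rows out[0..3]. On AArch32, where vzip1q_s64/vzip2q_s64 are
// unavailable, the 64-bit interleaves are emulated with pairs of vextq_s32.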
static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
                                       int16x8_t *const out) {
#if defined(__aarch64__)
  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
  const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
  const int16x8_t a3 = vzip1q_s16(in[6], in[7]);
#else
  int16x4x2_t temp;
  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[4]), vget_low_s16(in[5]));
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[6]), vget_low_s16(in[7]));
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if defined(__aarch64__)
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

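// Transpose an 8-wide, 4-high block: in[0..3] become eight outputs whose low
// halves hold the transposed columns and whose high halves are zero-filled by
// interleaving with a vector of zeros.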
static INLINE void transpose_16bit_8x4(const int16x8_t *const in,
                                       int16x8_t *const out) {
  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);

  const int32x4x2_t b01 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
                                    vreinterpretq_s32_s16(a15.val[0]));
  const int32x4x2_t b45 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
                                    vreinterpretq_s32_s16(a15.val[1]));

  const int32x4_t zeros = vdupq_n_s32(0);

#if defined(__aarch64__)
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b01.val[0], b01.val[0], 2), zeros, 2));
  out[1] = vreinterpretq_s16_s32(vextq_s32(b01.val[0], zeros, 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b01.val[1], b01.val[1], 2), zeros, 2));
  out[3] = vreinterpretq_s16_s32(vextq_s32(b01.val[1], zeros, 2));
  out[4] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b45.val[0], b45.val[0], 2), zeros, 2));
  out[5] = vreinterpretq_s16_s32(vextq_s32(b45.val[0], zeros, 2));
  out[6] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b45.val[1], b45.val[1], 2), zeros, 2));
  out[7] = vreinterpretq_s16_s32(vextq_s32(b45.val[1], zeros, 2));
#endif
}

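// Full 8x8 transpose built from three interleave levels: 16-bit, 32-bit and
// 64-bit zips (the last emulated with vextq_s32 on AArch32).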
static INLINE void transpose_16bit_8x8(const int16x8_t *const in,
                                       int16x8_t *const out) {
  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
  const int16x8x2_t a26 = vzipq_s16(in[4], in[5]);
  const int16x8x2_t a37 = vzipq_s16(in[6], in[7]);

  const int32x4x2_t b04 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
                                    vreinterpretq_s32_s16(a15.val[0]));
  const int32x4x2_t b15 = vzipq_s32(vreinterpretq_s32_s16(a26.val[0]),
                                    vreinterpretq_s32_s16(a37.val[0]));
  const int32x4x2_t b26 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
                                    vreinterpretq_s32_s16(a15.val[1]));
  const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
                                    vreinterpretq_s32_s16(a37.val[1]));

#if defined(__aarch64__)
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
                                            vreinterpretq_s64_s32(b15.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
                                            vreinterpretq_s64_s32(b15.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[1]),
                                            vreinterpretq_s64_s32(b15.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[1]),
                                            vreinterpretq_s64_s32(b15.val[1])));
  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[0]),
                                            vreinterpretq_s64_s32(b37.val[0])));
  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[0]),
                                            vreinterpretq_s64_s32(b37.val[0])));
  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[1]),
                                            vreinterpretq_s64_s32(b37.val[1])));
  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[1]),
                                            vreinterpretq_s64_s32(b37.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b04.val[0], b04.val[0], 2), b15.val[0], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b04.val[0], vextq_s32(b15.val[0], b15.val[0], 2), 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b04.val[1], b04.val[1], 2), b15.val[1], 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b04.val[1], vextq_s32(b15.val[1], b15.val[1], 2), 2));
  out[4] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b26.val[0], b26.val[0], 2), b37.val[0], 2));
  out[5] = vreinterpretq_s16_s32(
      vextq_s32(b26.val[0], vextq_s32(b37.val[0], b37.val[0], 2), 2));
  out[6] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b26.val[1], b26.val[1], 2), b37.val[1], 2));
  out[7] = vreinterpretq_s16_s32(
      vextq_s32(b26.val[1], vextq_s32(b37.val[1], b37.val[1], 2), 2));
#endif
}

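// Rectangular-transform scaling:
//   out[i] = round(round(in[i] / 4) * NewSqrt2 / 2^NewSqrt2Bits)
// i.e. a 2-bit rounding shift followed by multiplication by sqrt(2) in Q12
// (NewSqrt2 == 5793, NewSqrt2Bits == 12).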
static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
                                                      int32x4_t *output,
                                                      const int size) {
  for (int i = 0; i < size; i++) {
    output[i] = vrshrq_n_s32(vmulq_n_s32(vrshrq_n_s32(input[i], 2), NewSqrt2),
                             NewSqrt2Bits);
  }
}

static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
                                                 int32x4_t *output,
                                                 const int size) {
  for (int i = 0; i < size; i++) output[i] = vrshrq_n_s32(input[i], 2);
}

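// Butterfly on widened 32-bit data:
//   out0 = (in0 * w0 + in1 * w1 + rnd) >> cos_bit
//   out1 = (in0 * w1 - in1 * w0 + rnd) >> cos_bit
// v_cos_bit must hold -cos_bit in every lane: vrshlq_s32 shifts left by a
// signed per-lane amount, so a negative value gives a rounding right shift.
// The mode* variants below differ only in the sign pattern applied to the
// two products.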
#define btf_32_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do {                                                       \
    out0 = vmulq_n_s32(in0, w0);                             \
    out0 = vmlaq_n_s32(out0, in1, w1);                       \
    out0 = vrshlq_s32(out0, v_cos_bit);                      \
    out1 = vmulq_n_s32(in0, w1);                             \
    out1 = vmlsq_n_s32(out1, in1, w0);                       \
    out1 = vrshlq_s32(out1, v_cos_bit);                      \
  } while (0)

#define btf_32_type1_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do {                                                             \
    btf_32_neon(w1, w0, in1, in0, out0, out1, v_cos_bit);          \
  } while (0)

#define btf_32_neon_mode0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do {                                                             \
    out0 = vmulq_n_s32(in1, w1);                                   \
    out0 = vmlsq_n_s32(out0, in0, w0);                             \
    out0 = vrshlq_s32(out0, v_cos_bit);                            \
    out1 = vmulq_n_s32(in0, w1);                                   \
    out1 = vmlaq_n_s32(out1, in1, w0);                             \
    out1 = vrshlq_s32(out1, v_cos_bit);                            \
  } while (0)

#define btf_32_neon_mode01(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do {                                                              \
    out0 = vmulq_n_s32(in1, w1);                                    \
    out0 = vmlaq_n_s32(out0, in0, w0);                              \
    out0 = vrshlq_s32(vnegq_s32(out0), v_cos_bit);                  \
    out1 = vmulq_n_s32(in1, w0);                                    \
    out1 = vmlsq_n_s32(out1, in0, w1);                              \
    out1 = vrshlq_s32(out1, v_cos_bit);                             \
  } while (0)

static INLINE void flip_buf_neon(int16x8_t *in, int16x8_t *out, int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

static INLINE void store_16bit_to_32bit_w4(const int16x8_t a,
                                           int32_t *const b) {
  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
}

static INLINE void store_16bit_to_32bit(int16x8_t a, int32_t *b) {
  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
  vst1q_s32((b + 4), vmovl_s16(vget_high_s16(a)));
}

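// Widen to 32 bits, scale by sqrt(2) in Q12 and round-shift before storing;
// this applies the extra sqrt(2) factor that rectangular transform sizes
// require. *v_newsqrt2bits is expected to hold -NewSqrt2Bits in every lane so
// that vrshlq_s32 performs a rounding right shift.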
static INLINE void store_rect_16bit_to_32bit_w4(
    const int16x8_t a, int32_t *const b, const int16x4_t *v_newsqrt2,
    const int32x4_t *v_newsqrt2bits) {
  const int32x4_t b_lo =
      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
  vst1q_s32(b, b_lo);
}

static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
                                             int32_t *const b,
                                             const int16x4_t *v_newsqrt2,
                                             const int32x4_t *v_newsqrt2bits) {
  const int32x4_t b_lo =
      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
  const int32x4_t b_hi =
      vrshlq_s32(vmull_s16(vget_high_s16(a), *v_newsqrt2), *v_newsqrt2bits);
  vst1q_s32(b, b_lo);
  vst1q_s32((b + 4), b_hi);
}

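// Load four 16-bit values per row into the low half of out[i] with a 64-bit
// lane load; the high half of out[i] keeps its previous contents, so the
// destination must be initialized and only the low halves are valid
// afterwards.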
static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int stride,
                                                 int16x8_t *const out,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i)
    out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
        (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
}

static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int stride,
                                                      int16x8_t *const out,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i)
    out[out_size - i - 1] = vreinterpretq_s16_u64(
        vld1q_lane_u64((uint64_t *)(in + i * stride),
                       vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
}

static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
                                              int16x8_t *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = vld1q_s16(in + i * stride);
  }
}

static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                   int stride, int16x8_t *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = vld1q_s16(in + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w4(const int16x8_t *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit(in[i], out + i * stride);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w4(
    const int16x8_t *const in, int32_t *const out, const int stride,
    const int out_size) {
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit_w4(in[i], out + i * stride, &v_newsqrt2,
                                 &v_newsqrt2bits);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w8(
    const int16x8_t *const in, int32_t *const out, const int stride,
    const int out_size) {
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit(in[i], out + i * stride, &v_newsqrt2,
                              &v_newsqrt2bits);
  }
}

static INLINE void round_shift_16bit(int16x8_t *in, int size, int bit) {
  const int16x8_t vbit = vdupq_n_s16(bit);
  for (int i = 0; i < size; ++i) {
    in[i] = vrshlq_s16(in[i], vbit);
  }
}

static INLINE void round_shift_16bit_vector(int16x8_t *in, int size,
                                            const int16x8_t *v_bit) {
  for (int i = 0; i < size; ++i) {
    in[i] = vrshlq_s16(in[i], *v_bit);
  }
}

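// 4-point forward ADST on the low halves of input[0..3], using the sinpi
// basis from sinpi_arr(). Results are packed two rows per vector and then
// rotated so that row i also sits in the low half of output[i].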
void av1_fadst4x4_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *sinpi = sinpi_arr(cos_bit);

  int32x4_t u[6], v[6];

  u[0] = vmovl_s16(vget_low_s16(input[0]));
  u[1] = vmovl_s16(vget_low_s16(input[1]));
  u[2] = vmovl_s16(vget_low_s16(input[2]));
  u[3] = vmovl_s16(vget_low_s16(input[3]));
  u[4] = vaddq_s32(u[0], u[1]);
  v[5] = vmulq_n_s32(u[2], sinpi[3]);
  v[0] = vmulq_n_s32(u[1], sinpi[2]);
  v[0] = vmlaq_n_s32(v[0], u[0], sinpi[1]);
  v[1] = vmlaq_n_s32(v[5], u[3], sinpi[4]);
  v[2] = vmulq_n_s32(u[4], sinpi[3]);
  v[3] = vmulq_n_s32(u[0], sinpi[4]);
  v[3] = vmlsq_n_s32(v[3], u[1], sinpi[1]);
  v[4] = vmlsq_n_s32(v[5], u[3], sinpi[2]);

  u[0] = vaddq_s32(v[0], v[1]);
  u[1] = vmlsq_n_s32(v[2], u[3], sinpi[3]);
  u[2] = vsubq_s32(v[3], v[4]);
  u[3] = vsubq_s32(u[2], u[0]);
  u[5] = vmlaq_n_s32(u[3], v[5], 3);

  int32x4_t vshift = vdupq_n_s32(-cos_bit);
  u[0] = vrshlq_s32(u[0], vshift);
  u[1] = vrshlq_s32(u[1], vshift);
  u[2] = vrshlq_s32(u[2], vshift);
  u[3] = vrshlq_s32(u[5], vshift);

  output[0] = custom_packs_s32(u[0], u[2]);
  output[1] = custom_packs_s32(u[1], u[3]);
  output[2] = vextq_s16(output[0], output[0], 4);
  output[3] = vextq_s16(output[1], output[1], 4);
}

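// Width-4 butterfly variants: only the low halves of in0/in1 are processed.
// Results are packed with vcombine_s16 so that downstream w4 code, which
// reads low halves only, sees valid data; the high halves are don't-care.
// The modes differ only in the sign pattern applied to the two products.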
#define btf_16_w4_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1, \
                       v_cos_bit)                                    \
  {                                                                  \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                  \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                  \
    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                         \
    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                               \
    int32x4_t v0 = vmulq_n_s32(in0_l, w1_l);                         \
    v0 = vmlaq_n_s32(v0, in1_l, w1_h);                               \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                        \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                        \
    const int16x4_t c1 = vqmovn_s32(c0);                             \
    const int16x4_t d1 = vqmovn_s32(d0);                             \
    out0 = vcombine_s16(c1, c1);                                     \
    out1 = vcombine_s16(d1, c1);                                     \
  }

#define btf_16_w4_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                       \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
    int32x4_t u0 = vmulq_n_s32(in1_l, w0_h);                              \
    u0 = vmlsq_n_s32(u0, in0_l, w0_l);                                    \
    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
    v0 = vmlaq_n_s32(v0, in1_l, w0_l);                                    \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
    const int16x4_t c1 = vqmovn_s32(c0);                                  \
    const int16x4_t d1 = vqmovn_s32(d0);                                  \
    out0 = vcombine_s16(c1, c1);                                          \
    out1 = vcombine_s16(d1, c1);                                          \
  }

#define btf_16_w4_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                       \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
    int32x4_t v0 = vmulq_n_s32(in1_l, w0_l);                              \
    v0 = vmlsq_n_s32(v0, in0_l, w0_h);                                    \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
    const int16x4_t c1 = vqmovn_s32(c0);                                  \
    const int16x4_t d1 = vqmovn_s32(d0);                                  \
    out0 = vcombine_s16(c1, c1);                                          \
    out1 = vcombine_s16(d1, c1);                                          \
  }

#define btf_16_w4_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                       \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0));                       \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1));                       \
    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l);                              \
    u0 = vmlaq_n_s32(u0, in1_l, w0_h);                                    \
    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h);                              \
    v0 = vmlsq_n_s32(v0, in1_l, w0_l);                                    \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                             \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                             \
    const int16x4_t c1 = vqmovn_s32(c0);                                  \
    const int16x4_t d1 = vqmovn_s32(d0);                                  \
    out0 = vcombine_s16(c1, c1);                                          \
    out1 = vcombine_s16(d1, c1);                                          \
  }

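// 8-point forward ADST on four columns (w4 butterflies). The stage comments
// follow the scalar av1_fadst8; vqnegq_s16 folds the stage-1 sign flips into
// the butterfly inputs.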
static void fadst4x8_neon(const int16x8_t *input, int16x8_t *output,
                          int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1-2
  int16x8_t x2[8];
  btf_16_w4_neon_mode3(cospi[32], cospi[32], vqnegq_s16(input[3]), input[4],
                       x2[2], x2[3], v_cos_bit);
  btf_16_w4_neon_mode3(cospi[32], cospi[32], input[2], vqnegq_s16(input[5]),
                       x2[6], x2[7], v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  x3[0] = vqaddq_s16(input[0], x2[2]);
  x3[2] = vqsubq_s16(input[0], x2[2]);
  x3[1] = vqsubq_s16(x2[3], input[7]);
  x3[3] = vqsubq_s16(vqnegq_s16(input[7]), x2[3]);
  x3[4] = vqaddq_s16(vqnegq_s16(input[1]), x2[6]);
  x3[6] = vqsubq_s16(vqnegq_s16(input[1]), x2[6]);
  x3[5] = vqaddq_s16(input[6], x2[7]);
  x3[7] = vqsubq_s16(input[6], x2[7]);

  // stage 4
  int16x8_t x4[8];
  btf_16_w4_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x4[4], x4[5],
                       v_cos_bit);
  btf_16_w4_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x4[6], x4[7],
                       v_cos_bit);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x4[4]);
  x5[4] = vqsubq_s16(x3[0], x4[4]);
  x5[1] = vqaddq_s16(x3[1], x4[5]);
  x5[5] = vqsubq_s16(x3[1], x4[5]);
  x5[2] = vqaddq_s16(x3[2], x4[6]);
  x5[6] = vqsubq_s16(x3[2], x4[6]);
  x5[3] = vqaddq_s16(x3[3], x4[7]);
  x5[7] = vqsubq_s16(x3[3], x4[7]);

  // stage 6-7
  btf_16_w4_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
                       v_cos_bit);
  btf_16_w4_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
                       v_cos_bit);
  btf_16_w4_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
                       v_cos_bit);
  btf_16_w4_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
                       v_cos_bit);
}

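// 4-point forward ADST on eight full columns: the same sinpi flow as
// av1_fadst4x4_neon, computed separately on the low and high halves of the
// input vectors.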
static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
                          int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *sinpi = sinpi_arr(cos_bit);

  const int16x8_t in7 = vaddq_s16(input[0], input[1]);
  int32x4_t u_lo[8], u_hi[8], v_hi[8];

  int32x4_t in0_l = vmovl_s16(vget_low_s16(input[0]));
  int32x4_t in0_h = vmovl_s16(vget_high_s16(input[0]));
  int32x4_t in1_l = vmovl_s16(vget_low_s16(input[1]));
  int32x4_t in1_h = vmovl_s16(vget_high_s16(input[1]));
  int32x4_t in2_l = vmovl_s16(vget_low_s16(input[2]));
  int32x4_t in2_h = vmovl_s16(vget_high_s16(input[2]));
  int32x4_t in3_l = vmovl_s16(vget_low_s16(input[3]));
  int32x4_t in3_h = vmovl_s16(vget_high_s16(input[3]));
  int32x4_t in7_l = vmovl_s16(vget_low_s16(in7));
  int32x4_t in7_h = vmovl_s16(vget_high_s16(in7));

  u_lo[0] = vmulq_n_s32(in1_l, sinpi[2]);
  u_lo[0] = vmlaq_n_s32(u_lo[0], in0_l, sinpi[1]);

  u_hi[0] = vmulq_n_s32(in1_h, sinpi[2]);
  u_hi[0] = vmlaq_n_s32(u_hi[0], in0_h, sinpi[1]);

  u_lo[0] = vmlaq_n_s32(u_lo[0], in3_l, sinpi[4]);
  u_lo[0] = vmlaq_n_s32(u_lo[0], in2_l, sinpi[3]);

  u_hi[0] = vmlaq_n_s32(u_hi[0], in3_h, sinpi[4]);
  u_hi[0] = vmlaq_n_s32(u_hi[0], in2_h, sinpi[3]);

  u_lo[1] = vmulq_n_s32(in7_l, sinpi[3]);

  v_hi[2] = vmulq_n_s32(in7_h, sinpi[3]);
  u_lo[2] = vmulq_n_s32(in0_l, sinpi[4]);
  u_lo[2] = vmlsq_n_s32(u_lo[2], in1_l, sinpi[1]);

  u_hi[2] = vmulq_n_s32(in0_h, sinpi[4]);
  u_hi[2] = vmlsq_n_s32(u_hi[2], in1_h, sinpi[1]);

  u_lo[2] = vmlaq_n_s32(u_lo[2], in3_l, sinpi[2]);
  u_lo[2] = vmlsq_n_s32(u_lo[2], in2_l, sinpi[3]);

  u_hi[2] = vmlaq_n_s32(u_hi[2], in3_h, sinpi[2]);
  u_hi[2] = vmlsq_n_s32(u_hi[2], in2_h, sinpi[3]);

  u_lo[1] = vmlsq_n_s32(u_lo[1], in3_l, sinpi[3]);

  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  u_hi[1] = vmlsq_n_s32(v_hi[2], in3_h, sinpi[3]);

  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);

  u_lo[6] = vmlaq_n_s32(u_lo[3], in2_l, sinpi[3] * 3);
  u_hi[6] = vmlaq_n_s32(u_hi[3], in2_h, sinpi[3] * 3);

  u_lo[0] = vrshlq_s32(u_lo[0], v_cos_bit);
  u_hi[0] = vrshlq_s32(u_hi[0], v_cos_bit);
  u_lo[1] = vrshlq_s32(u_lo[1], v_cos_bit);
  u_hi[1] = vrshlq_s32(u_hi[1], v_cos_bit);
  u_lo[2] = vrshlq_s32(u_lo[2], v_cos_bit);
  u_hi[2] = vrshlq_s32(u_hi[2], v_cos_bit);
  u_lo[3] = vrshlq_s32(u_lo[6], v_cos_bit);
  u_hi[3] = vrshlq_s32(u_hi[6], v_cos_bit);

  output[0] = custom_packs_s32(u_lo[0], u_hi[0]);
  output[1] = custom_packs_s32(u_lo[1], u_hi[1]);
  output[2] = custom_packs_s32(u_lo[2], u_hi[2]);
  output[3] = custom_packs_s32(u_lo[3], u_hi[3]);
}

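// 4-point forward DCT on the low halves of input[0..3]. Widening add/sub
// pairs form the stage-1 butterflies; the cospi rotations and rounding shifts
// produce four outputs, packed two rows per vector as in av1_fadst4x4_neon.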
void av1_fdct4x4_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int32x4_t u[4];

  int32x4_t in12a = vaddl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
  int32x4_t in12s = vsubl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
  int32x4_t in03a = vaddl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
  int32x4_t in03s = vsubl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));

  int32x4_t u0ad1 = vmulq_n_s32(in12a, cospi[32]);
  int32x4_t u0ad2 = vmulq_n_s32(in03a, cospi[32]);
  u[0] = vaddq_s32(u0ad1, u0ad2);
  u[1] = vsubq_s32(u0ad2, u0ad1);
  u[2] = vmulq_n_s32(in12s, cospi[48]);
  u[2] = vmlaq_n_s32(u[2], in03s, cospi[16]);

  u[3] = vmulq_n_s32(in03s, cospi[48]);
  u[3] = vmlsq_n_s32(u[3], in12s, cospi[16]);

  u[0] = vrshlq_s32(u[0], v_cos_bit);
  u[1] = vrshlq_s32(u[1], v_cos_bit);
  u[2] = vrshlq_s32(u[2], v_cos_bit);
  u[3] = vrshlq_s32(u[3], v_cos_bit);

  output[0] = custom_packs_s32(u[0], u[1]);
  output[1] = custom_packs_s32(u[2], u[3]);
  output[2] = vextq_s16(output[0], output[0], 4);
  output[3] = vextq_s16(output[1], output[1], 4);
}

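// Full-width (8-lane) butterflies: both halves of in0/in1 are widened to
// 32 bits, multiplied, round-shifted and saturating-packed back to int16.
// Note that btf_16_neon itself picks up v_cos_bit from the enclosing scope,
// while the mode* variants take it as an explicit parameter.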
#define btf_16_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1) \
  {                                                               \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));             \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));           \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));             \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));           \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                    \
    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                          \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                   \
    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                         \
    int32x4_t v0 = vmulq_n_s32(in_low1, w1_h);                    \
    v0 = vmlaq_n_s32(v0, in_low0, w1_l);                          \
    int32x4_t v1 = vmulq_n_s32(in_high1, w1_h);                   \
    v1 = vmlaq_n_s32(v1, in_high0, w1_l);                         \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                     \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                     \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                     \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                     \
    out0 = custom_packs_s32(c0, c1);                              \
    out1 = custom_packs_s32(d0, d1);                              \
  }

#define btf_16_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                    \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                               \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                              \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
    out0 = custom_packs_s32(c0, c1);                                   \
    out1 = custom_packs_s32(d0, d1);                                   \
  }

#define btf_16_neon_mode1(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                    \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
    int32x4_t u0 = vmulq_n_s32(in_low0, w0_l);                         \
    u0 = vmlsq_n_s32(u0, in_low1, w0_h);                               \
    int32x4_t u1 = vmulq_n_s32(in_high0, w0_l);                        \
    u1 = vmlsq_n_s32(u1, in_high1, w0_h);                              \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
    v0 = vmlaq_n_s32(v0, in_low0, w0_h);                               \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
    v1 = vmlaq_n_s32(v1, in_high0, w0_h);                              \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
    out0 = custom_packs_s32(c0, c1);                                   \
    out1 = custom_packs_s32(d0, d1);                                   \
  }

#define btf_16_neon_mode02(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                     \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                   \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                 \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                   \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                 \
    int32x4_t u0 = vmulq_n_s32(in_low1, -w0_h);                         \
    u0 = vmlsq_n_s32(u0, in_low0, w0_l);                                \
    int32x4_t u1 = vmulq_n_s32(in_high1, -w0_h);                        \
    u1 = vmlsq_n_s32(u1, in_high0, w0_l);                               \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                          \
    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                                \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                         \
    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                               \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                           \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                           \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                           \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                           \
    out0 = custom_packs_s32(c0, c1);                                    \
    out1 = custom_packs_s32(d0, d1);                                    \
  }

#define btf_16_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                    \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l);                         \
    v0 = vmlsq_n_s32(v0, in_low0, w0_h);                               \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l);                        \
    v1 = vmlsq_n_s32(v1, in_high0, w0_h);                              \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
    out0 = custom_packs_s32(c0, c1);                                   \
    out1 = custom_packs_s32(d0, d1);                                   \
  }

#define btf_16_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  {                                                                    \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0));                  \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0));                \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1));                  \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1));                \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h);                         \
    u0 = vmlaq_n_s32(u0, in_low0, w0_l);                               \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h);                        \
    u1 = vmlaq_n_s32(u1, in_high0, w0_l);                              \
    int32x4_t v0 = vmulq_n_s32(in_low0, w0_h);                         \
    v0 = vmlsq_n_s32(v0, in_low1, w0_l);                               \
    int32x4_t v1 = vmulq_n_s32(in_high0, w0_h);                        \
    v1 = vmlsq_n_s32(v1, in_high1, w0_l);                              \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit);                          \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit);                          \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit);                          \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit);                          \
    out0 = custom_packs_s32(c0, c1);                                   \
    out1 = custom_packs_s32(d0, d1);                                   \
  }

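// 4-point forward DCT on eight full columns; cf. fdct4x8_neon below, which is
// the transposed shape (8-point DCT on four columns).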
static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
                         int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[4];
  x1[0] = vqaddq_s16(input[0], input[3]);
  x1[3] = vqsubq_s16(input[0], input[3]);
  x1[1] = vqaddq_s16(input[1], input[2]);
  x1[2] = vqsubq_s16(input[1], input[2]);

  // stage 2
  int16x8_t x2[4];
  btf_16_neon_mode3(cospi[32], cospi[32], x1[0], x1[1], x2[0], x2[1],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x1[2], x1[3], x2[2], x2[3],
                    v_cos_bit);

  // stage 3
  output[0] = x2[0];
  output[1] = x2[2];
  output[2] = x2[1];
  output[3] = x2[3];
}

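// 8-point forward DCT on four columns, using the w4 butterfly variants;
// the stage structure matches fdct8x8_neon below.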
static void fdct4x8_neon(const int16x8_t *input, int16x8_t *output,
                         int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[8];
  x1[0] = vqaddq_s16(input[0], input[7]);
  x1[7] = vqsubq_s16(input[0], input[7]);
  x1[1] = vqaddq_s16(input[1], input[6]);
  x1[6] = vqsubq_s16(input[1], input[6]);
  x1[2] = vqaddq_s16(input[2], input[5]);
  x1[5] = vqsubq_s16(input[2], input[5]);
  x1[3] = vqaddq_s16(input[3], input[4]);
  x1[4] = vqsubq_s16(input[3], input[4]);

  // stage 2
  int16x8_t x2[8];
  x2[0] = vqaddq_s16(x1[0], x1[3]);
  x2[3] = vqsubq_s16(x1[0], x1[3]);
  x2[1] = vqaddq_s16(x1[1], x1[2]);
  x2[2] = vqsubq_s16(x1[1], x1[2]);

  btf_16_w4_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
                       v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  btf_16_w4_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
                       v_cos_bit);
  btf_16_w4_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
                       v_cos_bit);
  x3[4] = vqaddq_s16(x1[4], x2[5]);
  x3[5] = vqsubq_s16(x1[4], x2[5]);
  x3[6] = vqsubq_s16(x1[7], x2[6]);
  x3[7] = vqaddq_s16(x1[7], x2[6]);

  // stage 4-5
  btf_16_w4_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
                       v_cos_bit);
  btf_16_w4_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
                       v_cos_bit);
}

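// 8-point forward DCT on eight full columns; the same stage structure as
// fdct4x8_neon above, but with full-width btf_16_neon_mode* butterflies.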
void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
                  const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[8];
  x1[0] = vqaddq_s16(input[0], input[7]);
  x1[7] = vqsubq_s16(input[0], input[7]);
  x1[1] = vqaddq_s16(input[1], input[6]);
  x1[6] = vqsubq_s16(input[1], input[6]);
  x1[2] = vqaddq_s16(input[2], input[5]);
  x1[5] = vqsubq_s16(input[2], input[5]);
  x1[3] = vqaddq_s16(input[3], input[4]);
  x1[4] = vqsubq_s16(input[3], input[4]);

  // stage 2
  int16x8_t x2[8];
  x2[0] = vqaddq_s16(x1[0], x1[3]);
  x2[3] = vqsubq_s16(x1[0], x1[3]);
  x2[1] = vqaddq_s16(x1[1], x1[2]);
  x2[2] = vqsubq_s16(x1[1], x1[2]);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  btf_16_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
                    v_cos_bit);
  x3[4] = vqaddq_s16(x1[4], x2[5]);
  x3[5] = vqsubq_s16(x1[4], x2[5]);
  x3[6] = vqsubq_s16(x1[7], x2[6]);
  x3[7] = vqaddq_s16(x1[7], x2[6]);

  // stage 4-5
  btf_16_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
                    v_cos_bit);
}

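// 16-point forward DCT on eight columns: add/sub butterflies interleaved with
// cospi rotations. Outputs are written directly in the permuted (bit-reversed)
// order that the scalar av1_fdct16 applies in its final stage.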
static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
                          int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[16];
  x1[0] = vqaddq_s16(input[0], input[15]);
  x1[15] = vqsubq_s16(input[0], input[15]);
  x1[1] = vqaddq_s16(input[1], input[14]);
  x1[14] = vqsubq_s16(input[1], input[14]);
  x1[2] = vqaddq_s16(input[2], input[13]);
  x1[13] = vqsubq_s16(input[2], input[13]);
  x1[3] = vqaddq_s16(input[3], input[12]);
  x1[12] = vqsubq_s16(input[3], input[12]);
  x1[4] = vqaddq_s16(input[4], input[11]);
  x1[11] = vqsubq_s16(input[4], input[11]);
  x1[5] = vqaddq_s16(input[5], input[10]);
  x1[10] = vqsubq_s16(input[5], input[10]);
  x1[6] = vqaddq_s16(input[6], input[9]);
  x1[9] = vqsubq_s16(input[6], input[9]);
  x1[7] = vqaddq_s16(input[7], input[8]);
  x1[8] = vqsubq_s16(input[7], input[8]);

  // stage 2
  int16x8_t x2[16];
  x2[0] = vqaddq_s16(x1[0], x1[7]);
  x2[7] = vqsubq_s16(x1[0], x1[7]);
  x2[1] = vqaddq_s16(x1[1], x1[6]);
  x2[6] = vqsubq_s16(x1[1], x1[6]);
  x2[2] = vqaddq_s16(x1[2], x1[5]);
  x2[5] = vqsubq_s16(x1[2], x1[5]);
  x2[3] = vqaddq_s16(x1[3], x1[4]);
  x2[4] = vqsubq_s16(x1[3], x1[4]);

  btf_16_neon_mode0(cospi[32], cospi[32], x1[10], x1[13], x2[10], x2[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[11], x1[12], x2[11], x2[12],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(x2[0], x2[3]);
  x3[3] = vqsubq_s16(x2[0], x2[3]);
  x3[1] = vqaddq_s16(x2[1], x2[2]);
  x3[2] = vqsubq_s16(x2[1], x2[2]);

  btf_16_neon_mode0(cospi[32], cospi[32], x2[5], x2[6], x3[5], x3[6],
                    v_cos_bit);

  x3[8] = vqaddq_s16(x1[8], x2[11]);
  x3[11] = vqsubq_s16(x1[8], x2[11]);
  x3[9] = vqaddq_s16(x1[9], x2[10]);
  x3[10] = vqsubq_s16(x1[9], x2[10]);
  x3[12] = vqsubq_s16(x1[15], x2[12]);
  x3[15] = vqaddq_s16(x1[15], x2[12]);
  x3[13] = vqsubq_s16(x1[14], x2[13]);
  x3[14] = vqaddq_s16(x1[14], x2[13]);

  // stage 4
  int16x8_t x4[16];
  btf_16_neon(cospi[32], cospi[32], cospi[32], -cospi[32], x3[0], x3[1],
              output[0], output[8]);
  btf_16_neon(cospi[48], cospi[16], -cospi[16], cospi[48], x3[2], x3[3],
              output[4], output[12]);
  x4[4] = vqaddq_s16(x2[4], x3[5]);
  x4[5] = vqsubq_s16(x2[4], x3[5]);
  x4[6] = vqsubq_s16(x2[7], x3[6]);
  x4[7] = vqaddq_s16(x2[7], x3[6]);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[9], x3[14], x4[9], x4[14],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[10], x3[13], x4[10], x4[13],
                     v_cos_bit);

  // stage 5
  int16x8_t x5[16];
  btf_16_neon_mode2(cospi[56], cospi[8], x4[4], x4[7], output[2], output[14],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x4[5], x4[6], output[10], output[6],
                    v_cos_bit);
  x5[8] = vqaddq_s16(x3[8], x4[9]);
  x5[9] = vqsubq_s16(x3[8], x4[9]);
  x5[10] = vqsubq_s16(x3[11], x4[10]);
  x5[11] = vqaddq_s16(x3[11], x4[10]);
  x5[12] = vqaddq_s16(x3[12], x4[13]);
  x5[13] = vqsubq_s16(x3[12], x4[13]);
  x5[14] = vqsubq_s16(x3[15], x4[14]);
  x5[15] = vqaddq_s16(x3[15], x4[14]);

  // stage 6-7
  btf_16_neon_mode2(cospi[60], cospi[4], x5[8], x5[15], output[1], output[15],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[28], cospi[36], x5[9], x5[14], output[9], output[7],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[44], cospi[20], x5[10], x5[13], output[5], output[11],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[12], cospi[52], x5[11], x5[12], output[13], output[3],
                    v_cos_bit);
}

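// 32-point forward DCT on eight columns, following the same pattern as
// fdct8x16_neon above with additional butterfly stages.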
void av1_fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[32];
  x1[0] = vqaddq_s16(input[0], input[31]);
  x1[31] = vqsubq_s16(input[0], input[31]);
  x1[1] = vqaddq_s16(input[1], input[30]);
  x1[30] = vqsubq_s16(input[1], input[30]);
  x1[2] = vqaddq_s16(input[2], input[29]);
  x1[29] = vqsubq_s16(input[2], input[29]);
  x1[3] = vqaddq_s16(input[3], input[28]);
  x1[28] = vqsubq_s16(input[3], input[28]);
  x1[4] = vqaddq_s16(input[4], input[27]);
  x1[27] = vqsubq_s16(input[4], input[27]);
  x1[5] = vqaddq_s16(input[5], input[26]);
  x1[26] = vqsubq_s16(input[5], input[26]);
  x1[6] = vqaddq_s16(input[6], input[25]);
  x1[25] = vqsubq_s16(input[6], input[25]);
  x1[7] = vqaddq_s16(input[7], input[24]);
  x1[24] = vqsubq_s16(input[7], input[24]);
  x1[8] = vqaddq_s16(input[8], input[23]);
  x1[23] = vqsubq_s16(input[8], input[23]);
  x1[9] = vqaddq_s16(input[9], input[22]);
  x1[22] = vqsubq_s16(input[9], input[22]);
  x1[10] = vqaddq_s16(input[10], input[21]);
  x1[21] = vqsubq_s16(input[10], input[21]);
  x1[11] = vqaddq_s16(input[11], input[20]);
  x1[20] = vqsubq_s16(input[11], input[20]);
  x1[12] = vqaddq_s16(input[12], input[19]);
  x1[19] = vqsubq_s16(input[12], input[19]);
  x1[13] = vqaddq_s16(input[13], input[18]);
  x1[18] = vqsubq_s16(input[13], input[18]);
  x1[14] = vqaddq_s16(input[14], input[17]);
  x1[17] = vqsubq_s16(input[14], input[17]);
  x1[15] = vqaddq_s16(input[15], input[16]);
  x1[16] = vqsubq_s16(input[15], input[16]);

  // stage 2
  int16x8_t x2[32];
  x2[0] = vqaddq_s16(x1[0], x1[15]);
  x2[15] = vqsubq_s16(x1[0], x1[15]);
  x2[1] = vqaddq_s16(x1[1], x1[14]);
  x2[14] = vqsubq_s16(x1[1], x1[14]);
  x2[2] = vqaddq_s16(x1[2], x1[13]);
  x2[13] = vqsubq_s16(x1[2], x1[13]);
  x2[3] = vqaddq_s16(x1[3], x1[12]);
  x2[12] = vqsubq_s16(x1[3], x1[12]);
  x2[4] = vqaddq_s16(x1[4], x1[11]);
  x2[11] = vqsubq_s16(x1[4], x1[11]);
  x2[5] = vqaddq_s16(x1[5], x1[10]);
  x2[10] = vqsubq_s16(x1[5], x1[10]);
  x2[6] = vqaddq_s16(x1[6], x1[9]);
  x2[9] = vqsubq_s16(x1[6], x1[9]);
  x2[7] = vqaddq_s16(x1[7], x1[8]);
  x2[8] = vqsubq_s16(x1[7], x1[8]);

  btf_16_neon_mode0(cospi[32], cospi[32], x1[20], x1[27], x2[20], x2[27],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[21], x1[26], x2[21], x2[26],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[22], x1[25], x2[22], x2[25],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[23], x1[24], x2[23], x2[24],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[32];
  x3[0] = vqaddq_s16(x2[0], x2[7]);
  x3[7] = vqsubq_s16(x2[0], x2[7]);
  x3[1] = vqaddq_s16(x2[1], x2[6]);
  x3[6] = vqsubq_s16(x2[1], x2[6]);
  x3[2] = vqaddq_s16(x2[2], x2[5]);
  x3[5] = vqsubq_s16(x2[2], x2[5]);
  x3[3] = vqaddq_s16(x2[3], x2[4]);
  x3[4] = vqsubq_s16(x2[3], x2[4]);

  btf_16_neon_mode0(cospi[32], cospi[32], x2[10], x2[13], x3[10], x3[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x2[11], x2[12], x3[11], x3[12],
                    v_cos_bit);

  x3[16] = vqaddq_s16(x1[16], x2[23]);
  x3[23] = vqsubq_s16(x1[16], x2[23]);
  x3[17] = vqaddq_s16(x1[17], x2[22]);
  x3[22] = vqsubq_s16(x1[17], x2[22]);
  x3[18] = vqaddq_s16(x1[18], x2[21]);
  x3[21] = vqsubq_s16(x1[18], x2[21]);
  x3[19] = vqaddq_s16(x1[19], x2[20]);
  x3[20] = vqsubq_s16(x1[19], x2[20]);
  x3[24] = vqsubq_s16(x1[31], x2[24]);
  x3[31] = vqaddq_s16(x1[31], x2[24]);
  x3[25] = vqsubq_s16(x1[30], x2[25]);
  x3[30] = vqaddq_s16(x1[30], x2[25]);
  x3[26] = vqsubq_s16(x1[29], x2[26]);
  x3[29] = vqaddq_s16(x1[29], x2[26]);
  x3[27] = vqsubq_s16(x1[28], x2[27]);
  x3[28] = vqaddq_s16(x1[28], x2[27]);

  // stage 4
  int16x8_t x4[32];
  x4[0] = vqaddq_s16(x3[0], x3[3]);
  x4[3] = vqsubq_s16(x3[0], x3[3]);
  x4[1] = vqaddq_s16(x3[1], x3[2]);
  x4[2] = vqsubq_s16(x3[1], x3[2]);
  btf_16_neon_mode0(cospi[32], cospi[32], x3[5], x3[6], x4[5], x4[6],
                    v_cos_bit);
  x4[8] = vqaddq_s16(x2[8], x3[11]);
  x4[11] = vqsubq_s16(x2[8], x3[11]);
  x4[9] = vqaddq_s16(x2[9], x3[10]);
  x4[10] = vqsubq_s16(x2[9], x3[10]);
  x4[12] = vqsubq_s16(x2[15], x3[12]);
  x4[15] = vqaddq_s16(x2[15], x3[12]);
  x4[13] = vqsubq_s16(x2[14], x3[13]);
  x4[14] = vqaddq_s16(x2[14], x3[13]);

  btf_16_neon_mode0(cospi[16], cospi[48], x3[18], x3[29], x4[18], x4[29],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[19], x3[28], x4[19], x4[28],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[20], x3[27], x4[20], x4[27],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[21], x3[26], x4[21], x4[26],
                     v_cos_bit);

  // stage 5
  int16x8_t x5[32];
  btf_16_neon_mode3(cospi[32], cospi[32], x4[0], x4[1], output[0], output[16],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x4[2], x4[3], output[8], output[24],
                    v_cos_bit);
  x5[4] = vqaddq_s16(x3[4], x4[5]);
  x5[5] = vqsubq_s16(x3[4], x4[5]);
  x5[6] = vqsubq_s16(x3[7], x4[6]);
  x5[7] = vqaddq_s16(x3[7], x4[6]);

  btf_16_neon_mode0(cospi[16], cospi[48], x4[9], x4[14], x5[9], x5[14],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x4[10], x4[13], x5[10], x5[13],
                     v_cos_bit);

  x5[16] = vqaddq_s16(x3[16], x4[19]);
  x5[19] = vqsubq_s16(x3[16], x4[19]);
  x5[17] = vqaddq_s16(x3[17], x4[18]);
  x5[18] = vqsubq_s16(x3[17], x4[18]);
  x5[20] = vqsubq_s16(x3[23], x4[20]);
  x5[23] = vqaddq_s16(x3[23], x4[20]);
  x5[21] = vqsubq_s16(x3[22], x4[21]);
  x5[22] = vqaddq_s16(x3[22], x4[21]);
  x5[24] = vqaddq_s16(x3[24], x4[27]);
  x5[27] = vqsubq_s16(x3[24], x4[27]);
  x5[25] = vqaddq_s16(x3[25], x4[26]);
  x5[26] = vqsubq_s16(x3[25], x4[26]);
  x5[28] = vqsubq_s16(x3[31], x4[28]);
  x5[31] = vqaddq_s16(x3[31], x4[28]);
  x5[29] = vqsubq_s16(x3[30], x4[29]);
  x5[30] = vqaddq_s16(x3[30], x4[29]);

  // stage 6
  int16x8_t x6[32];
  btf_16_neon_mode2(cospi[56], cospi[8], x5[4], x5[7], output[4], output[28],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x5[5], x5[6], output[20], output[12],
                    v_cos_bit);
  x6[8] = vqaddq_s16(x4[8], x5[9]);
  x6[9] = vqsubq_s16(x4[8], x5[9]);
  x6[10] = vqsubq_s16(x4[11], x5[10]);
  x6[11] = vqaddq_s16(x4[11], x5[10]);
  x6[12] = vqaddq_s16(x4[12], x5[13]);
  x6[13] = vqsubq_s16(x4[12], x5[13]);
  x6[14] = vqsubq_s16(x4[15], x5[14]);
  x6[15] = vqaddq_s16(x4[15], x5[14]);
  btf_16_neon_mode0(cospi[8], cospi[56], x5[17], x5[30], x6[17], x6[30],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x5[18], x5[29], x6[18], x6[29],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[40], cospi[24], x5[21], x5[26], x6[21], x6[26],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x5[22], x5[25], x6[22], x6[25],
                     v_cos_bit);

  // stage 7
  int16x8_t x7[32];
  btf_16_neon_mode2(cospi[60], cospi[4], x6[8], x6[15], output[2], output[30],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[28], cospi[36], x6[9], x6[14], output[18], output[14],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[44], cospi[20], x6[10], x6[13], output[10],
                    output[22], v_cos_bit);
  btf_16_neon_mode2(cospi[12], cospi[52], x6[11], x6[12], output[26], output[6],
                    v_cos_bit);
  x7[16] = vqaddq_s16(x5[16], x6[17]);
  x7[17] = vqsubq_s16(x5[16], x6[17]);
  x7[18] = vqsubq_s16(x5[19], x6[18]);
  x7[19] = vqaddq_s16(x5[19], x6[18]);
  x7[20] = vqaddq_s16(x5[20], x6[21]);
  x7[21] = vqsubq_s16(x5[20], x6[21]);
  x7[22] = vqsubq_s16(x5[23], x6[22]);
  x7[23] = vqaddq_s16(x5[23], x6[22]);
  x7[24] = vqaddq_s16(x5[24], x6[25]);
  x7[25] = vqsubq_s16(x5[24], x6[25]);
  x7[26] = vqsubq_s16(x5[27], x6[26]);
  x7[27] = vqaddq_s16(x5[27], x6[26]);
  x7[28] = vqaddq_s16(x5[28], x6[29]);
1189   x7[29] = vqsubq_s16(x5[28], x6[29]);
1190   x7[30] = vqsubq_s16(x5[31], x6[30]);
1191   x7[31] = vqaddq_s16(x5[31], x6[30]);
1192 
1193   btf_16_neon_mode2(cospi[62], cospi[2], x7[16], x7[31], output[1], output[31],
1194                     v_cos_bit);
1195   btf_16_neon_mode2(cospi[30], cospi[34], x7[17], x7[30], output[17],
1196                     output[15], v_cos_bit);
1197   btf_16_neon_mode2(cospi[46], cospi[18], x7[18], x7[29], output[9], output[23],
1198                     v_cos_bit);
1199   btf_16_neon_mode2(cospi[14], cospi[50], x7[19], x7[28], output[25], output[7],
1200                     v_cos_bit);
1201   btf_16_neon_mode2(cospi[54], cospi[10], x7[20], x7[27], output[5], output[27],
1202                     v_cos_bit);
1203   btf_16_neon_mode2(cospi[22], cospi[42], x7[21], x7[26], output[21],
1204                     output[11], v_cos_bit);
1205   btf_16_neon_mode2(cospi[38], cospi[26], x7[22], x7[25], output[13],
1206                     output[19], v_cos_bit);
1207   btf_16_neon_mode2(cospi[6], cospi[58], x7[23], x7[24], output[29], output[3],
1208                     v_cos_bit);
1209 }
1210 
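// Stages 1-4 of the 64-point forward DCT. Stage-1 and stage-2 butterflies
// live in local arrays; the stage-3 and stage-4 results are returned through
// x3/x4 so the caller can finish the remaining stage-4 butterflies
// (x4[36..43] and x4[52..59]) and continue from stage 5.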
void av1_fdct8x64_stage_1234_neon(const int16x8_t *input, int16x8_t *x3,
                                  int16x8_t *x4, const int32_t *cospi32,
                                  const int32x4_t *v_cos_bit) {
  int16x8_t x1[64];
  int16x8_t x2[64];
  x1[0] = vqaddq_s16(input[0], input[63]);
  x1[63] = vqsubq_s16(input[0], input[63]);
  x1[1] = vqaddq_s16(input[1], input[62]);
  x1[62] = vqsubq_s16(input[1], input[62]);
  x1[2] = vqaddq_s16(input[2], input[61]);
  x1[61] = vqsubq_s16(input[2], input[61]);
  x1[3] = vqaddq_s16(input[3], input[60]);
  x1[60] = vqsubq_s16(input[3], input[60]);
  x1[4] = vqaddq_s16(input[4], input[59]);
  x1[59] = vqsubq_s16(input[4], input[59]);
  x1[5] = vqaddq_s16(input[5], input[58]);
  x1[58] = vqsubq_s16(input[5], input[58]);
  x1[6] = vqaddq_s16(input[6], input[57]);
  x1[57] = vqsubq_s16(input[6], input[57]);
  x1[7] = vqaddq_s16(input[7], input[56]);
  x1[56] = vqsubq_s16(input[7], input[56]);
  x1[8] = vqaddq_s16(input[8], input[55]);
  x1[55] = vqsubq_s16(input[8], input[55]);
  x1[9] = vqaddq_s16(input[9], input[54]);
  x1[54] = vqsubq_s16(input[9], input[54]);
  x1[10] = vqaddq_s16(input[10], input[53]);
  x1[53] = vqsubq_s16(input[10], input[53]);
  x1[11] = vqaddq_s16(input[11], input[52]);
  x1[52] = vqsubq_s16(input[11], input[52]);
  x1[12] = vqaddq_s16(input[12], input[51]);
  x1[51] = vqsubq_s16(input[12], input[51]);
  x1[13] = vqaddq_s16(input[13], input[50]);
  x1[50] = vqsubq_s16(input[13], input[50]);
  x1[14] = vqaddq_s16(input[14], input[49]);
  x1[49] = vqsubq_s16(input[14], input[49]);
  x1[15] = vqaddq_s16(input[15], input[48]);
  x1[48] = vqsubq_s16(input[15], input[48]);
  x1[16] = vqaddq_s16(input[16], input[47]);
  x1[47] = vqsubq_s16(input[16], input[47]);
  x1[17] = vqaddq_s16(input[17], input[46]);
  x1[46] = vqsubq_s16(input[17], input[46]);
  x1[18] = vqaddq_s16(input[18], input[45]);
  x1[45] = vqsubq_s16(input[18], input[45]);
  x1[19] = vqaddq_s16(input[19], input[44]);
  x1[44] = vqsubq_s16(input[19], input[44]);
  x1[20] = vqaddq_s16(input[20], input[43]);
  x1[43] = vqsubq_s16(input[20], input[43]);
  x1[21] = vqaddq_s16(input[21], input[42]);
  x1[42] = vqsubq_s16(input[21], input[42]);
  x1[22] = vqaddq_s16(input[22], input[41]);
  x1[41] = vqsubq_s16(input[22], input[41]);
  x1[23] = vqaddq_s16(input[23], input[40]);
  x1[40] = vqsubq_s16(input[23], input[40]);
  x1[24] = vqaddq_s16(input[24], input[39]);
  x1[39] = vqsubq_s16(input[24], input[39]);
  x1[25] = vqaddq_s16(input[25], input[38]);
  x1[38] = vqsubq_s16(input[25], input[38]);
  x1[26] = vqaddq_s16(input[26], input[37]);
  x1[37] = vqsubq_s16(input[26], input[37]);
  x1[27] = vqaddq_s16(input[27], input[36]);
  x1[36] = vqsubq_s16(input[27], input[36]);
  x1[28] = vqaddq_s16(input[28], input[35]);
  x1[35] = vqsubq_s16(input[28], input[35]);
  x1[29] = vqaddq_s16(input[29], input[34]);
  x1[34] = vqsubq_s16(input[29], input[34]);
  x1[30] = vqaddq_s16(input[30], input[33]);
  x1[33] = vqsubq_s16(input[30], input[33]);
  x1[31] = vqaddq_s16(input[31], input[32]);
  x1[32] = vqsubq_s16(input[31], input[32]);

  x2[0] = vqaddq_s16(x1[0], x1[31]);
  x2[31] = vqsubq_s16(x1[0], x1[31]);
  x2[1] = vqaddq_s16(x1[1], x1[30]);
  x2[30] = vqsubq_s16(x1[1], x1[30]);
  x2[2] = vqaddq_s16(x1[2], x1[29]);
  x2[29] = vqsubq_s16(x1[2], x1[29]);
  x2[3] = vqaddq_s16(x1[3], x1[28]);
  x2[28] = vqsubq_s16(x1[3], x1[28]);
  x2[4] = vqaddq_s16(x1[4], x1[27]);
  x2[27] = vqsubq_s16(x1[4], x1[27]);
  x2[5] = vqaddq_s16(x1[5], x1[26]);
  x2[26] = vqsubq_s16(x1[5], x1[26]);
  x2[6] = vqaddq_s16(x1[6], x1[25]);
  x2[25] = vqsubq_s16(x1[6], x1[25]);
  x2[7] = vqaddq_s16(x1[7], x1[24]);
  x2[24] = vqsubq_s16(x1[7], x1[24]);
  x2[8] = vqaddq_s16(x1[8], x1[23]);
  x2[23] = vqsubq_s16(x1[8], x1[23]);
  x2[9] = vqaddq_s16(x1[9], x1[22]);
  x2[22] = vqsubq_s16(x1[9], x1[22]);
  x2[10] = vqaddq_s16(x1[10], x1[21]);
  x2[21] = vqsubq_s16(x1[10], x1[21]);
  x2[11] = vqaddq_s16(x1[11], x1[20]);
  x2[20] = vqsubq_s16(x1[11], x1[20]);
  x2[12] = vqaddq_s16(x1[12], x1[19]);
  x2[19] = vqsubq_s16(x1[12], x1[19]);
  x2[13] = vqaddq_s16(x1[13], x1[18]);
  x2[18] = vqsubq_s16(x1[13], x1[18]);
  x2[14] = vqaddq_s16(x1[14], x1[17]);
  x2[17] = vqsubq_s16(x1[14], x1[17]);
  x2[15] = vqaddq_s16(x1[15], x1[16]);
  x2[16] = vqsubq_s16(x1[15], x1[16]);

  btf_16_neon_mode0(*cospi32, *cospi32, x1[40], x1[55], x2[40], x2[55],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[41], x1[54], x2[41], x2[54],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[42], x1[53], x2[42], x2[53],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[43], x1[52], x2[43], x2[52],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[44], x1[51], x2[44], x2[51],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[45], x1[50], x2[45], x2[50],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[46], x1[49], x2[46], x2[49],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[47], x1[48], x2[47], x2[48],
                    *v_cos_bit);

  // stage 3
  x3[0] = vqaddq_s16(x2[0], x2[15]);
  x3[15] = vqsubq_s16(x2[0], x2[15]);
  x3[1] = vqaddq_s16(x2[1], x2[14]);
  x3[14] = vqsubq_s16(x2[1], x2[14]);
  x3[2] = vqaddq_s16(x2[2], x2[13]);
  x3[13] = vqsubq_s16(x2[2], x2[13]);
  x3[3] = vqaddq_s16(x2[3], x2[12]);
  x3[12] = vqsubq_s16(x2[3], x2[12]);
  x3[4] = vqaddq_s16(x2[4], x2[11]);
  x3[11] = vqsubq_s16(x2[4], x2[11]);
  x3[5] = vqaddq_s16(x2[5], x2[10]);
  x3[10] = vqsubq_s16(x2[5], x2[10]);
  x3[6] = vqaddq_s16(x2[6], x2[9]);
  x3[9] = vqsubq_s16(x2[6], x2[9]);
  x3[7] = vqaddq_s16(x2[7], x2[8]);
  x3[8] = vqsubq_s16(x2[7], x2[8]);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  btf_16_neon_mode0(*cospi32, *cospi32, x2[20], x2[27], x3[20], x3[27],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x2[21], x2[26], x3[21], x3[26],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x2[22], x2[25], x3[22], x3[25],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x2[23], x2[24], x3[23], x3[24],
                    *v_cos_bit);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  x3[32] = vqaddq_s16(x1[32], x2[47]);
  x3[47] = vqsubq_s16(x1[32], x2[47]);
  x3[33] = vqaddq_s16(x1[33], x2[46]);
  x3[46] = vqsubq_s16(x1[33], x2[46]);
  x3[34] = vqaddq_s16(x1[34], x2[45]);
  x3[45] = vqsubq_s16(x1[34], x2[45]);
  x3[35] = vqaddq_s16(x1[35], x2[44]);
  x3[44] = vqsubq_s16(x1[35], x2[44]);
  x3[36] = vqaddq_s16(x1[36], x2[43]);
  x3[43] = vqsubq_s16(x1[36], x2[43]);
  x3[37] = vqaddq_s16(x1[37], x2[42]);
  x3[42] = vqsubq_s16(x1[37], x2[42]);
  x3[38] = vqaddq_s16(x1[38], x2[41]);
  x3[41] = vqsubq_s16(x1[38], x2[41]);
  x3[39] = vqaddq_s16(x1[39], x2[40]);
  x3[40] = vqsubq_s16(x1[39], x2[40]);
  x3[48] = vqsubq_s16(x1[63], x2[48]);
  x3[63] = vqaddq_s16(x1[63], x2[48]);
  x3[49] = vqsubq_s16(x1[62], x2[49]);
  x3[62] = vqaddq_s16(x1[62], x2[49]);
  x3[50] = vqsubq_s16(x1[61], x2[50]);
  x3[61] = vqaddq_s16(x1[61], x2[50]);
  x3[51] = vqsubq_s16(x1[60], x2[51]);
  x3[60] = vqaddq_s16(x1[60], x2[51]);
  x3[52] = vqsubq_s16(x1[59], x2[52]);
  x3[59] = vqaddq_s16(x1[59], x2[52]);
  x3[53] = vqsubq_s16(x1[58], x2[53]);
  x3[58] = vqaddq_s16(x1[58], x2[53]);
  x3[54] = vqsubq_s16(x1[57], x2[54]);
  x3[57] = vqaddq_s16(x1[57], x2[54]);
  x3[55] = vqsubq_s16(x1[56], x2[55]);
  x3[56] = vqaddq_s16(x1[56], x2[55]);

  // stage 4
  x4[0] = vqaddq_s16(x3[0], x3[7]);
  x4[7] = vqsubq_s16(x3[0], x3[7]);
  x4[1] = vqaddq_s16(x3[1], x3[6]);
  x4[6] = vqsubq_s16(x3[1], x3[6]);
  x4[2] = vqaddq_s16(x3[2], x3[5]);
  x4[5] = vqsubq_s16(x3[2], x3[5]);
  x4[3] = vqaddq_s16(x3[3], x3[4]);
  x4[4] = vqsubq_s16(x3[3], x3[4]);

  btf_16_neon_mode0(*cospi32, *cospi32, x3[10], x3[13], x4[10], x4[13],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x3[11], x3[12], x4[11], x4[12],
                    *v_cos_bit);

  x4[16] = vqaddq_s16(x3[16], x3[23]);
  x4[23] = vqsubq_s16(x3[16], x3[23]);
  x4[17] = vqaddq_s16(x3[17], x3[22]);
  x4[22] = vqsubq_s16(x3[17], x3[22]);
  x4[18] = vqaddq_s16(x3[18], x3[21]);
  x4[21] = vqsubq_s16(x3[18], x3[21]);
  x4[19] = vqaddq_s16(x3[19], x3[20]);
  x4[20] = vqsubq_s16(x3[19], x3[20]);
  x4[24] = vqsubq_s16(x3[31], x3[24]);
  x4[31] = vqaddq_s16(x3[31], x3[24]);
  x4[25] = vqsubq_s16(x3[30], x3[25]);
  x4[30] = vqaddq_s16(x3[30], x3[25]);
  x4[26] = vqsubq_s16(x3[29], x3[26]);
  x4[29] = vqaddq_s16(x3[29], x3[26]);
  x4[27] = vqsubq_s16(x3[28], x3[27]);
  x4[28] = vqaddq_s16(x3[28], x3[27]);
}

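// 8-column, 64-point forward DCT. Stages 1-4 come from the helper above;
// stages 5-10 follow here. The stage-10 butterflies write the odd-indexed
// outputs directly, and stage 11 gathers the even-indexed outputs from
// earlier stages in bit-reversed order.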
void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int16x8_t x3[64];
  int16x8_t x4[64];

  av1_fdct8x64_stage_1234_neon(input, x3, x4, &cospi[32], &v_cos_bit);

  btf_16_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
                     v_cos_bit);

  // stage 5
  int16x8_t x5[64];
  x5[0] = vqaddq_s16(x4[0], x4[3]);
  x5[3] = vqsubq_s16(x4[0], x4[3]);
  x5[1] = vqaddq_s16(x4[1], x4[2]);
  x5[2] = vqsubq_s16(x4[1], x4[2]);

  btf_16_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
                    v_cos_bit);

  x5[8] = vqaddq_s16(x3[8], x4[11]);
  x5[11] = vqsubq_s16(x3[8], x4[11]);
  x5[9] = vqaddq_s16(x3[9], x4[10]);
  x5[10] = vqsubq_s16(x3[9], x4[10]);
  x5[12] = vqsubq_s16(x3[15], x4[12]);
  x5[15] = vqaddq_s16(x3[15], x4[12]);
  x5[13] = vqsubq_s16(x3[14], x4[13]);
  x5[14] = vqaddq_s16(x3[14], x4[13]);

  btf_16_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
                     v_cos_bit);

  x5[32] = vqaddq_s16(x3[32], x4[39]);
  x5[39] = vqsubq_s16(x3[32], x4[39]);
  x5[33] = vqaddq_s16(x3[33], x4[38]);
  x5[38] = vqsubq_s16(x3[33], x4[38]);
  x5[34] = vqaddq_s16(x3[34], x4[37]);
  x5[37] = vqsubq_s16(x3[34], x4[37]);
  x5[35] = vqaddq_s16(x3[35], x4[36]);
  x5[36] = vqsubq_s16(x3[35], x4[36]);
  x5[40] = vqsubq_s16(x3[47], x4[40]);
  x5[47] = vqaddq_s16(x3[47], x4[40]);
  x5[41] = vqsubq_s16(x3[46], x4[41]);
  x5[46] = vqaddq_s16(x3[46], x4[41]);
  x5[42] = vqsubq_s16(x3[45], x4[42]);
  x5[45] = vqaddq_s16(x3[45], x4[42]);
  x5[43] = vqsubq_s16(x3[44], x4[43]);
  x5[44] = vqaddq_s16(x3[44], x4[43]);
  x5[48] = vqaddq_s16(x3[48], x4[55]);
  x5[55] = vqsubq_s16(x3[48], x4[55]);
  x5[49] = vqaddq_s16(x3[49], x4[54]);
  x5[54] = vqsubq_s16(x3[49], x4[54]);
  x5[50] = vqaddq_s16(x3[50], x4[53]);
  x5[53] = vqsubq_s16(x3[50], x4[53]);
  x5[51] = vqaddq_s16(x3[51], x4[52]);
  x5[52] = vqsubq_s16(x3[51], x4[52]);
  x5[56] = vqsubq_s16(x3[63], x4[56]);
  x5[63] = vqaddq_s16(x3[63], x4[56]);
  x5[57] = vqsubq_s16(x3[62], x4[57]);
  x5[62] = vqaddq_s16(x3[62], x4[57]);
  x5[58] = vqsubq_s16(x3[61], x4[58]);
  x5[61] = vqaddq_s16(x3[61], x4[58]);
  x5[59] = vqsubq_s16(x3[60], x4[59]);
  x5[60] = vqaddq_s16(x3[60], x4[59]);

  // stage 6
  int16x8_t x6[64];
  btf_16_neon_mode2(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
                    v_cos_bit);
  x6[4] = vqaddq_s16(x4[4], x5[5]);
  x6[5] = vqsubq_s16(x4[4], x5[5]);
  x6[6] = vqsubq_s16(x4[7], x5[6]);
  x6[7] = vqaddq_s16(x4[7], x5[6]);

  btf_16_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
                     v_cos_bit);

  x6[16] = vqaddq_s16(x4[16], x5[19]);
  x6[19] = vqsubq_s16(x4[16], x5[19]);
  x6[17] = vqaddq_s16(x4[17], x5[18]);
  x6[18] = vqsubq_s16(x4[17], x5[18]);
  x6[20] = vqsubq_s16(x4[23], x5[20]);
  x6[23] = vqaddq_s16(x4[23], x5[20]);
  x6[21] = vqsubq_s16(x4[22], x5[21]);
  x6[22] = vqaddq_s16(x4[22], x5[21]);
  x6[24] = vqaddq_s16(x4[24], x5[27]);
  x6[27] = vqsubq_s16(x4[24], x5[27]);
  x6[25] = vqaddq_s16(x4[25], x5[26]);
  x6[26] = vqsubq_s16(x4[25], x5[26]);
  x6[28] = vqsubq_s16(x4[31], x5[28]);
  x6[31] = vqaddq_s16(x4[31], x5[28]);
  x6[29] = vqsubq_s16(x4[30], x5[29]);
  x6[30] = vqaddq_s16(x4[30], x5[29]);

  btf_16_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
                     v_cos_bit);

  // stage 7
  int16x8_t x7[64];

  btf_16_neon_mode2(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
                    v_cos_bit);
  x7[8] = vqaddq_s16(x5[8], x6[9]);
  x7[9] = vqsubq_s16(x5[8], x6[9]);
  x7[10] = vqsubq_s16(x5[11], x6[10]);
  x7[11] = vqaddq_s16(x5[11], x6[10]);
  x7[12] = vqaddq_s16(x5[12], x6[13]);
  x7[13] = vqsubq_s16(x5[12], x6[13]);
  x7[14] = vqsubq_s16(x5[15], x6[14]);
  x7[15] = vqaddq_s16(x5[15], x6[14]);

  btf_16_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
                     v_cos_bit);

  btf_16_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
                     v_cos_bit);

  x7[32] = vqaddq_s16(x5[32], x6[35]);
  x7[35] = vqsubq_s16(x5[32], x6[35]);
  x7[33] = vqaddq_s16(x5[33], x6[34]);
  x7[34] = vqsubq_s16(x5[33], x6[34]);
  x7[36] = vqsubq_s16(x5[39], x6[36]);
  x7[39] = vqaddq_s16(x5[39], x6[36]);
  x7[37] = vqsubq_s16(x5[38], x6[37]);
  x7[38] = vqaddq_s16(x5[38], x6[37]);
  x7[40] = vqaddq_s16(x5[40], x6[43]);
  x7[43] = vqsubq_s16(x5[40], x6[43]);
  x7[41] = vqaddq_s16(x5[41], x6[42]);
  x7[42] = vqsubq_s16(x5[41], x6[42]);
  x7[44] = vqsubq_s16(x5[47], x6[44]);
  x7[47] = vqaddq_s16(x5[47], x6[44]);
  x7[45] = vqsubq_s16(x5[46], x6[45]);
  x7[46] = vqaddq_s16(x5[46], x6[45]);
  x7[48] = vqaddq_s16(x5[48], x6[51]);
  x7[51] = vqsubq_s16(x5[48], x6[51]);
  x7[49] = vqaddq_s16(x5[49], x6[50]);
  x7[50] = vqsubq_s16(x5[49], x6[50]);
  x7[52] = vqsubq_s16(x5[55], x6[52]);
  x7[55] = vqaddq_s16(x5[55], x6[52]);
  x7[53] = vqsubq_s16(x5[54], x6[53]);
  x7[54] = vqaddq_s16(x5[54], x6[53]);
  x7[56] = vqaddq_s16(x5[56], x6[59]);
  x7[59] = vqsubq_s16(x5[56], x6[59]);
  x7[57] = vqaddq_s16(x5[57], x6[58]);
  x7[58] = vqsubq_s16(x5[57], x6[58]);
  x7[60] = vqsubq_s16(x5[63], x6[60]);
  x7[63] = vqaddq_s16(x5[63], x6[60]);
  x7[61] = vqsubq_s16(x5[62], x6[61]);
  x7[62] = vqaddq_s16(x5[62], x6[61]);

  // stage 8
  int16x8_t x8[64];

  btf_16_neon_mode2(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
                    v_cos_bit);
  x8[16] = vqaddq_s16(x6[16], x7[17]);
  x8[17] = vqsubq_s16(x6[16], x7[17]);
  x8[18] = vqsubq_s16(x6[19], x7[18]);
  x8[19] = vqaddq_s16(x6[19], x7[18]);
  x8[20] = vqaddq_s16(x6[20], x7[21]);
  x8[21] = vqsubq_s16(x6[20], x7[21]);
  x8[22] = vqsubq_s16(x6[23], x7[22]);
  x8[23] = vqaddq_s16(x6[23], x7[22]);
  x8[24] = vqaddq_s16(x6[24], x7[25]);
  x8[25] = vqsubq_s16(x6[24], x7[25]);
  x8[26] = vqsubq_s16(x6[27], x7[26]);
  x8[27] = vqaddq_s16(x6[27], x7[26]);
  x8[28] = vqaddq_s16(x6[28], x7[29]);
  x8[29] = vqsubq_s16(x6[28], x7[29]);
  x8[30] = vqsubq_s16(x6[31], x7[30]);
  x8[31] = vqaddq_s16(x6[31], x7[30]);

  btf_16_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
                     v_cos_bit);

  // stage 9
  int16x8_t x9[64];

  btf_16_neon_mode2(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
                    v_cos_bit);
  x9[32] = vqaddq_s16(x7[32], x8[33]);
  x9[33] = vqsubq_s16(x7[32], x8[33]);
  x9[34] = vqsubq_s16(x7[35], x8[34]);
  x9[35] = vqaddq_s16(x7[35], x8[34]);
  x9[36] = vqaddq_s16(x7[36], x8[37]);
  x9[37] = vqsubq_s16(x7[36], x8[37]);
  x9[38] = vqsubq_s16(x7[39], x8[38]);
  x9[39] = vqaddq_s16(x7[39], x8[38]);
  x9[40] = vqaddq_s16(x7[40], x8[41]);
  x9[41] = vqsubq_s16(x7[40], x8[41]);
  x9[42] = vqsubq_s16(x7[43], x8[42]);
  x9[43] = vqaddq_s16(x7[43], x8[42]);
  x9[44] = vqaddq_s16(x7[44], x8[45]);
  x9[45] = vqsubq_s16(x7[44], x8[45]);
  x9[46] = vqsubq_s16(x7[47], x8[46]);
  x9[47] = vqaddq_s16(x7[47], x8[46]);
  x9[48] = vqaddq_s16(x7[48], x8[49]);
  x9[49] = vqsubq_s16(x7[48], x8[49]);
  x9[50] = vqsubq_s16(x7[51], x8[50]);
  x9[51] = vqaddq_s16(x7[51], x8[50]);
  x9[52] = vqaddq_s16(x7[52], x8[53]);
  x9[53] = vqsubq_s16(x7[52], x8[53]);
  x9[54] = vqsubq_s16(x7[55], x8[54]);
  x9[55] = vqaddq_s16(x7[55], x8[54]);
  x9[56] = vqaddq_s16(x7[56], x8[57]);
  x9[57] = vqsubq_s16(x7[56], x8[57]);
  x9[58] = vqsubq_s16(x7[59], x8[58]);
  x9[59] = vqaddq_s16(x7[59], x8[58]);
  x9[60] = vqaddq_s16(x7[60], x8[61]);
  x9[61] = vqsubq_s16(x7[60], x8[61]);
  x9[62] = vqsubq_s16(x7[63], x8[62]);
  x9[63] = vqaddq_s16(x7[63], x8[62]);

  // stage 10
  btf_16_neon_mode2(cospi[63], cospi[1], x9[32], x9[63], output[1], output[63],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[31], cospi[33], x9[33], x9[62], output[33],
                    output[31], v_cos_bit);
  btf_16_neon_mode2(cospi[47], cospi[17], x9[34], x9[61], output[17],
                    output[47], v_cos_bit);
  btf_16_neon_mode2(cospi[15], cospi[49], x9[35], x9[60], output[49],
                    output[15], v_cos_bit);
  btf_16_neon_mode2(cospi[55], cospi[9], x9[36], x9[59], output[9], output[55],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[23], cospi[41], x9[37], x9[58], output[41],
                    output[23], v_cos_bit);
  btf_16_neon_mode2(cospi[39], cospi[25], x9[38], x9[57], output[25],
                    output[39], v_cos_bit);
  btf_16_neon_mode2(cospi[7], cospi[57], x9[39], x9[56], output[57], output[7],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[59], cospi[5], x9[40], x9[55], output[5], output[59],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[27], cospi[37], x9[41], x9[54], output[37],
                    output[27], v_cos_bit);
  btf_16_neon_mode2(cospi[43], cospi[21], x9[42], x9[53], output[21],
                    output[43], v_cos_bit);
  btf_16_neon_mode2(cospi[11], cospi[53], x9[43], x9[52], output[53],
                    output[11], v_cos_bit);
  btf_16_neon_mode2(cospi[51], cospi[13], x9[44], x9[51], output[13],
                    output[51], v_cos_bit);
  btf_16_neon_mode2(cospi[19], cospi[45], x9[45], x9[50], output[45],
                    output[19], v_cos_bit);
  btf_16_neon_mode2(cospi[35], cospi[29], x9[46], x9[49], output[29],
                    output[35], v_cos_bit);
  btf_16_neon_mode2(cospi[3], cospi[61], x9[47], x9[48], output[61], output[3],
                    v_cos_bit);

  // stage 11
  output[0] = x6[0];
  output[2] = x9[16];
  output[4] = x8[8];
  output[6] = x9[24];
  output[8] = x7[4];
  output[10] = x9[20];
  output[12] = x8[12];
  output[14] = x9[28];
  output[16] = x6[2];
  output[18] = x9[18];
  output[20] = x8[10];
  output[22] = x9[26];
  output[24] = x7[6];
  output[26] = x9[22];
  output[28] = x8[14];
  output[30] = x9[30];
  output[32] = x6[1];
  output[34] = x9[17];
  output[36] = x8[9];
  output[38] = x9[25];
  output[40] = x7[5];
  output[42] = x9[21];
  output[44] = x8[13];
  output[46] = x9[29];
  output[48] = x6[3];
  output[50] = x9[19];
  output[52] = x8[11];
  output[54] = x9[27];
  output[56] = x7[7];
  output[58] = x9[23];
  output[60] = x8[15];
  output[62] = x9[31];
}

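// 8-column, 8-point forward ADST. The stage-1 negations plus the reordered
// input reads implement the sign/permutation pattern of the ADST flow graph
// ahead of the butterfly stages.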
void fadst_8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[4];
  x1[0] = vqnegq_s16(input[7]);
  x1[1] = vqnegq_s16(input[3]);
  x1[2] = vqnegq_s16(input[1]);
  x1[3] = vqnegq_s16(input[5]);

  // stage 2
  int16x8_t x2[8];
  btf_16_neon_mode3(cospi[32], cospi[32], x1[1], input[4], x2[2], x2[3],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[32], cospi[32], input[2], x1[3], x2[6], x2[7],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  x3[0] = vqaddq_s16(input[0], x2[2]);
  x3[2] = vqsubq_s16(input[0], x2[2]);
  x3[1] = vqaddq_s16(x1[0], x2[3]);
  x3[3] = vqsubq_s16(x1[0], x2[3]);
  x3[4] = vqaddq_s16(x1[2], x2[6]);
  x3[6] = vqsubq_s16(x1[2], x2[6]);
  x3[5] = vqaddq_s16(input[6], x2[7]);
  x3[7] = vqsubq_s16(input[6], x2[7]);

  // stage 4
  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
                    v_cos_bit);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[3] = vqaddq_s16(x3[3], x3[7]);
  x5[7] = vqsubq_s16(x3[3], x3[7]);

  // stage 6
  btf_16_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
                    v_cos_bit);
}

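// 8-column, 16-point forward ADST, following the same negate-then-butterfly
// structure as the 8-point version with additional stages.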
static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
                           int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[12];
  x1[0] = vqnegq_s16(input[15]);
  x1[1] = vqnegq_s16(input[3]);
  x1[2] = vqnegq_s16(input[1]);
  x1[3] = vqnegq_s16(input[13]);

  // stage 2
  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[7], input[8],
              x1[4], x1[5]);
  btf_16_neon_mode1(cospi[32], cospi[32], input[4], input[11], x1[6], x1[7],
                    v_cos_bit);
  btf_16_neon_mode1(cospi[32], cospi[32], input[6], input[9], x1[8], x1[9],
                    v_cos_bit);
  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[5],
              input[10], x1[10], x1[11]);

  // stage 3
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(input[0], x1[4]);
  x3[2] = vqsubq_s16(input[0], x1[4]);
  x3[1] = vqaddq_s16(x1[0], x1[5]);
  x3[3] = vqsubq_s16(x1[0], x1[5]);
  x3[4] = vqaddq_s16(x1[1], x1[6]);
  x3[6] = vqsubq_s16(x1[1], x1[6]);
  x3[5] = vqaddq_s16(input[12], x1[7]);
  x3[7] = vqsubq_s16(input[12], x1[7]);
  x3[8] = vqaddq_s16(x1[2], x1[8]);
  x3[10] = vqsubq_s16(x1[2], x1[8]);
  x3[9] = vqaddq_s16(input[14], x1[9]);
  x3[11] = vqsubq_s16(input[14], x1[9]);
  x3[12] = vqaddq_s16(input[2], x1[10]);
  x3[14] = vqsubq_s16(input[2], x1[10]);
  x3[13] = vqaddq_s16(x1[3], x1[11]);
  x3[15] = vqsubq_s16(x1[3], x1[11]);

  // stage 4
  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[16], cospi[48], x3[12], x3[13], x3[12], x3[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[48], cospi[16], x3[14], x3[15], x3[14], x3[15],
                    v_cos_bit);

  // stage 5
  int16x8_t x5[16];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[3] = vqaddq_s16(x3[3], x3[7]);
  x5[7] = vqsubq_s16(x3[3], x3[7]);
  x5[8] = vqaddq_s16(x3[8], x3[12]);
  x5[12] = vqsubq_s16(x3[8], x3[12]);
  x5[9] = vqaddq_s16(x3[9], x3[13]);
  x5[13] = vqsubq_s16(x3[9], x3[13]);
  x5[10] = vqaddq_s16(x3[10], x3[14]);
  x5[14] = vqsubq_s16(x3[10], x3[14]);
  x5[11] = vqaddq_s16(x3[11], x3[15]);
  x5[15] = vqsubq_s16(x3[11], x3[15]);

  // stage 6
  btf_16_neon_mode3(cospi[8], cospi[56], x5[8], x5[9], x5[8], x5[9], v_cos_bit);
  btf_16_neon_mode3(cospi[40], cospi[24], x5[10], x5[11], x5[10], x5[11],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[56], cospi[8], x5[12], x5[13], x5[12], x5[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[24], cospi[40], x5[14], x5[15], x5[14], x5[15],
                    v_cos_bit);

  // stage 7
  int16x8_t x7[16];
  x7[0] = vqaddq_s16(x5[0], x5[8]);
  x7[8] = vqsubq_s16(x5[0], x5[8]);
  x7[1] = vqaddq_s16(x5[1], x5[9]);
  x7[9] = vqsubq_s16(x5[1], x5[9]);
  x7[2] = vqaddq_s16(x5[2], x5[10]);
  x7[10] = vqsubq_s16(x5[2], x5[10]);
  x7[3] = vqaddq_s16(x5[3], x5[11]);
  x7[11] = vqsubq_s16(x5[3], x5[11]);
  x7[4] = vqaddq_s16(x5[4], x5[12]);
  x7[12] = vqsubq_s16(x5[4], x5[12]);
  x7[5] = vqaddq_s16(x5[5], x5[13]);
  x7[13] = vqsubq_s16(x5[5], x5[13]);
  x7[6] = vqaddq_s16(x5[6], x5[14]);
  x7[14] = vqsubq_s16(x5[6], x5[14]);
  x7[7] = vqaddq_s16(x5[7], x5[15]);
  x7[15] = vqsubq_s16(x5[7], x5[15]);

  // stage 8
  btf_16_neon_mode3(cospi[2], cospi[62], x7[0], x7[1], output[15], output[0],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[10], cospi[54], x7[2], x7[3], output[13], output[2],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[18], cospi[46], x7[4], x7[5], output[11], output[4],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[26], cospi[38], x7[6], x7[7], output[9], output[6],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[34], cospi[30], x7[8], x7[9], output[7], output[8],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[42], cospi[22], x7[10], x7[11], output[5], output[10],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[50], cospi[14], x7[12], x7[13], output[3], output[12],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[58], cospi[6], x7[14], x7[15], output[1], output[14],
                    v_cos_bit);
}

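// 4-point identity transform: scales each coefficient by sqrt(2), using the
// fixed-point constant NewSqrt2 / 2^NewSqrt2Bits with rounding. Only four
// lanes are live, so the narrowed result is duplicated into both halves.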
void av1_fidentity4x4_neon(const int16x8_t *const input,
                           int16x8_t *const output, const int8_t cos_bit,
                           const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  for (int i = 0; i < 4; ++i) {
    const int16x4_t b = vqrshrn_n_s32(
        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    output[i] = vcombine_s16(b, b);
  }
}

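// 8-wide variant of the 4-point identity transform: scales both halves of
// each vector by sqrt(2) in fixed point.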
static INLINE void fidentity8x4_neon(const int16x8_t *const input,
                                     int16x8_t *const output,
                                     const int8_t cos_bit,
                                     const int8_t *stage_range) {
  (void)stage_range;
  (void)cos_bit;
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  for (int i = 0; i < 4; ++i) {
    const int16x4_t b_lo = vqrshrn_n_s32(
        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    const int16x4_t b_hi = vqrshrn_n_s32(
        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    output[i] = vcombine_s16(b_lo, b_hi);
  }
}

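// 8-point identity transform: a doubling, implemented as a saturating shift
// left by one.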
void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  int16x8_t one = vdupq_n_s16(1);
  output[0] = vqrshlq_s16(input[0], one);
  output[1] = vqrshlq_s16(input[1], one);
  output[2] = vqrshlq_s16(input[2], one);
  output[3] = vqrshlq_s16(input[3], one);
  output[4] = vqrshlq_s16(input[4], one);
  output[5] = vqrshlq_s16(input[5], one);
  output[6] = vqrshlq_s16(input[6], one);
  output[7] = vqrshlq_s16(input[7], one);
}

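// 16-point identity transform: scales by 2 * sqrt(2) in fixed point.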
static INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output,
                                      int8_t cos_bit,
                                      const int8_t *stage_range) {
  (void)stage_range;
  (void)cos_bit;
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2 * 2);
  for (int i = 0; i < 16; ++i) {
    const int16x4_t b_lo = vqrshrn_n_s32(
        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    const int16x4_t b_hi = vqrshrn_n_s32(
        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    output[i] = vcombine_s16(b_lo, b_hi);
  }
}

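// 32-point identity transform: scales by 4 with a plain shift left by two.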
static INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output,
                                      int8_t cos_bit,
                                      const int8_t *stage_range) {
  (void)stage_range;
  (void)cos_bit;
  for (int i = 0; i < 32; ++i) {
    output[i] = vshlq_n_s16(input[i], 2);
  }
}

typedef void (*transform_1d_lbd_neon)(const int16x8_t *input, int16x8_t *output,
                                      int8_t cos_bit,
                                      const int8_t *stage_range);

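// 1-D transform kernels indexed by TX_TYPE: the col_* tables select the
// vertical pass and the row_* tables the horizontal pass. FLIPADST entries
// reuse the ADST kernels; the flipping itself is handled by the 2-D wrappers
// below via ud_flip/lr_flip. The NULL entries in the 8x32 tables correspond
// to ADST combinations that AV1 does not define for 32-point transforms.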
static const transform_1d_lbd_neon col_txfm4x4_arr[TX_TYPES] = {
  av1_fdct4x4_neon,       // DCT_DCT
  av1_fadst4x4_neon,      // ADST_DCT
  av1_fdct4x4_neon,       // DCT_ADST
  av1_fadst4x4_neon,      // ADST_ADST
  av1_fadst4x4_neon,      // FLIPADST_DCT
  av1_fdct4x4_neon,       // DCT_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
  av1_fadst4x4_neon,      // ADST_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_ADST
  av1_fidentity4x4_neon,  // IDTX
  av1_fdct4x4_neon,       // V_DCT
  av1_fidentity4x4_neon,  // H_DCT
  av1_fadst4x4_neon,      // V_ADST
  av1_fidentity4x4_neon,  // H_ADST
  av1_fadst4x4_neon,      // V_FLIPADST
  av1_fidentity4x4_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm4x4_arr[TX_TYPES] = {
  av1_fdct4x4_neon,       // DCT_DCT
  av1_fdct4x4_neon,       // ADST_DCT
  av1_fadst4x4_neon,      // DCT_ADST
  av1_fadst4x4_neon,      // ADST_ADST
  av1_fdct4x4_neon,       // FLIPADST_DCT
  av1_fadst4x4_neon,      // DCT_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
  av1_fadst4x4_neon,      // ADST_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_ADST
  av1_fidentity4x4_neon,  // IDTX
  av1_fidentity4x4_neon,  // V_DCT
  av1_fdct4x4_neon,       // H_DCT
  av1_fidentity4x4_neon,  // V_ADST
  av1_fadst4x4_neon,      // H_ADST
  av1_fidentity4x4_neon,  // V_FLIPADST
  av1_fadst4x4_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_neon,       // DCT_DCT
  fadst4x8_neon,      // ADST_DCT
  fdct4x8_neon,       // DCT_ADST
  fadst4x8_neon,      // ADST_ADST
  fadst4x8_neon,      // FLIPADST_DCT
  fdct4x8_neon,       // DCT_FLIPADST
  fadst4x8_neon,      // FLIPADST_FLIPADST
  fadst4x8_neon,      // ADST_FLIPADST
  fadst4x8_neon,      // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fdct4x8_neon,       // V_DCT
  fidentity8x8_neon,  // H_DCT
  fadst4x8_neon,      // V_ADST
  fidentity8x8_neon,  // H_ADST
  fadst4x8_neon,      // V_FLIPADST
  fidentity8x8_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_neon,       // DCT_DCT
  fdct8x4_neon,       // ADST_DCT
  fadst8x4_neon,      // DCT_ADST
  fadst8x4_neon,      // ADST_ADST
  fdct8x4_neon,       // FLIPADST_DCT
  fadst8x4_neon,      // DCT_FLIPADST
  fadst8x4_neon,      // FLIPADST_FLIPADST
  fadst8x4_neon,      // ADST_FLIPADST
  fadst8x4_neon,      // FLIPADST_ADST
  fidentity8x4_neon,  // IDTX
  fidentity8x4_neon,  // V_DCT
  fdct8x4_neon,       // H_DCT
  fidentity8x4_neon,  // V_ADST
  fadst8x4_neon,      // H_ADST
  fidentity8x4_neon,  // V_FLIPADST
  fadst8x4_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_neon,       // DCT_DCT
  fadst8x4_neon,      // ADST_DCT
  fdct8x4_neon,       // DCT_ADST
  fadst8x4_neon,      // ADST_ADST
  fadst8x4_neon,      // FLIPADST_DCT
  fdct8x4_neon,       // DCT_FLIPADST
  fadst8x4_neon,      // FLIPADST_FLIPADST
  fadst8x4_neon,      // ADST_FLIPADST
  fadst8x4_neon,      // FLIPADST_ADST
  fidentity8x4_neon,  // IDTX
  fdct8x4_neon,       // V_DCT
  fidentity8x4_neon,  // H_DCT
  fadst8x4_neon,      // V_ADST
  fidentity8x4_neon,  // H_ADST
  fadst8x4_neon,      // V_FLIPADST
  fidentity8x4_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_neon,       // DCT_DCT
  fdct4x8_neon,       // ADST_DCT
  fadst4x8_neon,      // DCT_ADST
  fadst4x8_neon,      // ADST_ADST
  fdct4x8_neon,       // FLIPADST_DCT
  fadst4x8_neon,      // DCT_FLIPADST
  fadst4x8_neon,      // FLIPADST_FLIPADST
  fadst4x8_neon,      // ADST_FLIPADST
  fadst4x8_neon,      // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fidentity8x8_neon,  // V_DCT
  fdct4x8_neon,       // H_DCT
  fidentity8x8_neon,  // V_ADST
  fadst4x8_neon,      // H_ADST
  fidentity8x8_neon,  // V_FLIPADST
  fadst4x8_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_neon,       // DCT_DCT
  fadst_8x8_neon,     // ADST_DCT
  fdct8x8_neon,       // DCT_ADST
  fadst_8x8_neon,     // ADST_ADST
  fadst_8x8_neon,     // FLIPADST_DCT
  fdct8x8_neon,       // DCT_FLIPADST
  fadst_8x8_neon,     // FLIPADST_FLIPADST
  fadst_8x8_neon,     // ADST_FLIPADST
  fadst_8x8_neon,     // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fdct8x8_neon,       // V_DCT
  fidentity8x8_neon,  // H_DCT
  fadst_8x8_neon,     // V_ADST
  fidentity8x8_neon,  // H_ADST
  fadst_8x8_neon,     // V_FLIPADST
  fidentity8x8_neon,  // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_neon,       // DCT_DCT
  fdct8x8_neon,       // ADST_DCT
  fadst_8x8_neon,     // DCT_ADST
  fadst_8x8_neon,     // ADST_ADST
  fdct8x8_neon,       // FLIPADST_DCT
  fadst_8x8_neon,     // DCT_FLIPADST
  fadst_8x8_neon,     // FLIPADST_FLIPADST
  fadst_8x8_neon,     // ADST_FLIPADST
  fadst_8x8_neon,     // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fidentity8x8_neon,  // V_DCT
  fdct8x8_neon,       // H_DCT
  fidentity8x8_neon,  // V_ADST
  fadst_8x8_neon,     // H_ADST
  fidentity8x8_neon,  // V_FLIPADST
  fadst_8x8_neon      // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_neon,       // DCT_DCT
  fadst8x16_neon,      // ADST_DCT
  fdct8x16_neon,       // DCT_ADST
  fadst8x16_neon,      // ADST_ADST
  fadst8x16_neon,      // FLIPADST_DCT
  fdct8x16_neon,       // DCT_FLIPADST
  fadst8x16_neon,      // FLIPADST_FLIPADST
  fadst8x16_neon,      // ADST_FLIPADST
  fadst8x16_neon,      // FLIPADST_ADST
  fidentity8x16_neon,  // IDTX
  fdct8x16_neon,       // V_DCT
  fidentity8x16_neon,  // H_DCT
  fadst8x16_neon,      // V_ADST
  fidentity8x16_neon,  // H_ADST
  fadst8x16_neon,      // V_FLIPADST
  fidentity8x16_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_neon,       // DCT_DCT
  fdct8x16_neon,       // ADST_DCT
  fadst8x16_neon,      // DCT_ADST
  fadst8x16_neon,      // ADST_ADST
  fdct8x16_neon,       // FLIPADST_DCT
  fadst8x16_neon,      // DCT_FLIPADST
  fadst8x16_neon,      // FLIPADST_FLIPADST
  fadst8x16_neon,      // ADST_FLIPADST
  fadst8x16_neon,      // FLIPADST_ADST
  fidentity8x16_neon,  // IDTX
  fidentity8x16_neon,  // V_DCT
  fdct8x16_neon,       // H_DCT
  fidentity8x16_neon,  // V_ADST
  fadst8x16_neon,      // H_ADST
  fidentity8x16_neon,  // V_FLIPADST
  fadst8x16_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x32_arr[TX_TYPES] = {
  av1_fdct8x32_neon,   // DCT_DCT
  NULL,                // ADST_DCT
  NULL,                // DCT_ADST
  NULL,                // ADST_ADST
  NULL,                // FLIPADST_DCT
  NULL,                // DCT_FLIPADST
  NULL,                // FLIPADST_FLIPADST
  NULL,                // ADST_FLIPADST
  NULL,                // FLIPADST_ADST
  fidentity8x32_neon,  // IDTX
  fidentity8x32_neon,  // V_DCT
  av1_fdct8x32_neon,   // H_DCT
  NULL,                // V_ADST
  NULL,                // H_ADST
  NULL,                // V_FLIPADST
  NULL                 // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x32_arr[TX_TYPES] = {
  av1_fdct8x32_neon,   // DCT_DCT
  NULL,                // ADST_DCT
  NULL,                // DCT_ADST
  NULL,                // ADST_ADST
  NULL,                // FLIPADST_DCT
  NULL,                // DCT_FLIPADST
  NULL,                // FLIPADST_FLIPADST
  NULL,                // ADST_FLIPADST
  NULL,                // FLIPADST_ADST
  fidentity8x32_neon,  // IDTX
  av1_fdct8x32_neon,   // V_DCT
  fidentity8x32_neon,  // H_DCT
  NULL,                // V_ADST
  NULL,                // H_ADST
  NULL,                // V_FLIPADST
  NULL                 // H_FLIPADST
};

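// 2-D low-bitdepth forward transform wrappers. Each follows the same
// pattern: load the input (vertically flipped for FLIPADST column types),
// pre-shift, run the column transform, shift, transpose, run the row
// transform (on a horizontally flipped buffer for FLIPADST row types),
// shift again, transpose back, and store the coefficients as 32-bit values.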
void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[4], buf1[4], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 4;
  const int height = 4;
  const transform_1d_lbd_neon col_txfm = col_txfm4x4_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm4x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_4x4(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift2);

  transpose_16bit_4x4(buf, buf);
  store_buffer_16bit_to_32bit_w4(buf, output, width, height);
}

void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
2331   (void)bd;
2332   int16x8_t buf0[8], buf1[8], *buf;
2333   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
2334   const int txw_idx = get_txw_idx(TX_4X8);
2335   const int txh_idx = get_txh_idx(TX_4X8);
2336   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2337   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2338   const int width = 4;
2339   const int height = 8;
2340   const transform_1d_lbd_neon col_txfm = col_txfm4x8_arr[tx_type];
2341   const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
2342   int ud_flip, lr_flip;
2343 
2344   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2345   const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
2346   const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
2347   const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
2348   const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
2349   if (ud_flip) {
2350     load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
2351   } else {
2352     load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
2353   }
2354   round_shift_16bit_vector(buf0, height, &v_shift0);
2355   col_txfm(buf0, buf0, cos_bit_col, NULL);
2356   round_shift_16bit_vector(buf0, height, &v_shift1);
2357   transpose_16bit_4x8(buf0, buf1);
2358 
2359   if (lr_flip) {
2360     buf = buf0;
2361     flip_buf_neon(buf1, buf, width);
2362   } else {
2363     buf = buf1;
2364   }
2365   row_txfm(buf, buf, cos_bit_row, NULL);
2366   round_shift_16bit_vector(buf0, height, &v_shift2);
2367   transpose_16bit_8x4(buf, buf);
2368   store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
2369 }
2370 
void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
  const int txw_idx = get_txw_idx(TX_4X16);
  const int txh_idx = get_txh_idx(TX_4X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 4;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_4x8(buf0, buf1);
  transpose_16bit_4x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + 8 * i, buf, width);
    } else {
      buf = buf1 + 8 * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, width, &v_shift2);
    transpose_16bit_8x4(buf, buf);
    store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
  }
}

void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[8], buf1[8], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
  const int txw_idx = get_txw_idx(TX_8X4);
  const int txh_idx = get_txh_idx(TX_8X4);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 4;
  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm4x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip)
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  else
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, width, &v_shift2);
  transpose_16bit_8x8(buf, buf);
  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
}

void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[8], buf1[8], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip)
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  else
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, width, &v_shift2);
  transpose_16bit_8x8(buf, buf);
  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
}

void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
  const int txw_idx = get_txw_idx(TX_8X16);
  const int txh_idx = get_txh_idx(TX_8X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);
  transpose_16bit_8x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, width, &v_shift2);
    transpose_16bit_8x8(buf, buf);
    store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
  }
}

void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
  const int txw_idx = get_txw_idx(TX_8X32);
  const int txh_idx = get_txh_idx(TX_8X32);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 32;
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);
  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
  transpose_16bit_8x8(buf0 + 24, buf1 + 24);

  for (int i = 0; i < 4; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, width, &v_shift2);
    transpose_16bit_8x8(buf, buf);
    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
  }
}

void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
  const int txw_idx = get_txw_idx(TX_16X4);
  const int txh_idx = get_txh_idx(TX_16X4);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 4;
  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
  int16x8_t *buf;
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    }
    round_shift_16bit_vector(buf0, height, &v_shift0);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit_vector(buf0, height, &v_shift1);
    transpose_16bit_8x4(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, width, &v_shift2);
  transpose_16bit_4x8(buf, buf);
  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
  transpose_16bit_4x8(buf + 8, buf + 8);
  store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
}

void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
  const int txw_idx = get_txw_idx(TX_16X8);
  const int txh_idx = get_txh_idx(TX_16X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
  int16x8_t *buf;
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    }
    round_shift_16bit_vector(buf0, height, &v_shift0);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit_vector(buf0, height, &v_shift1);
    transpose_16bit_8x8(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, width, &v_shift2);
  transpose_16bit_8x8(buf, buf);
  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
  transpose_16bit_8x8(buf + 8, buf + 8);
  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
}

void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
  const int txw_idx = get_txw_idx(TX_16X16);
  const int txh_idx = get_txh_idx(TX_16X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);

  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    }
    round_shift_16bit_vector(buf0, height, &v_shift0);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit_vector(buf0, height, &v_shift1);
    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
  }

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, width, &v_shift2);
    transpose_16bit_8x8(buf, buf);
    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
    transpose_16bit_8x8(buf + 8, buf + 8);
    store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
                                   8);
  }
}

void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
  const int txw_idx = get_txw_idx(TX_16X32);
  const int txh_idx = get_txh_idx(TX_16X32);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 32;
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];

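  // tx types without a 32-point Neon kernel (NULL entries in the lookup
  // tables) fall back to the generic C implementation.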
  if (col_txfm != NULL && row_txfm != NULL) {
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);

    for (int i = 0; i < 2; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit_vector(buf0, height, &v_shift0);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit_vector(buf0, height, &v_shift1);
      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
    }

    for (int i = 0; i < 4; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit_vector(buf, width, &v_shift2);
      transpose_16bit_8x8(buf, buf);
      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
                                          8);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
                                          width, 8);
    }
  } else {
    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
  }
}

void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
  const int txw_idx = get_txw_idx(TX_32X8);
  const int txh_idx = get_txh_idx(TX_32X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);

    for (int i = 0; i < 4; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit_vector(buf0, height, &v_shift0);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit_vector(buf0, height, &v_shift1);
      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
    }

    for (int i = 0; i < 1; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit_vector(buf, width, &v_shift2);
      transpose_16bit_8x8(buf, buf);
      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
                                     height);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
                                     height);
      transpose_16bit_8x8(buf + 16, buf + 16);
      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
                                     width, height);
      transpose_16bit_8x8(buf + 24, buf + 24);
      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
                                     width, height);
    }
  } else {
    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
  }
}

void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
  const int txw_idx = get_txw_idx(TX_32X16);
  const int txh_idx = get_txh_idx(TX_32X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    for (int i = 0; i < 4; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit_vector(buf0, height, &v_shift0);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit_vector(buf0, height, &v_shift1);
      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
    }

    for (int i = 0; i < 2; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit_vector(buf, width, &v_shift2);
      transpose_16bit_8x8(buf, buf);
      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
                                          8);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
                                          width, 8);
      transpose_16bit_8x8(buf + 16, buf + 16);
      store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
                                          width, 8);
      transpose_16bit_8x8(buf + 24, buf + 24);
      store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
                                          width, 8);
    }
  } else {
    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
  }
}

void av1_lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
  const int txw_idx = get_txw_idx(TX_32X32);
  const int txh_idx = get_txh_idx(TX_32X32);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 32;
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    for (int i = 0; i < 4; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
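      // round_shift_16bit shifts left for a positive stage shift and applies
      // a rounding shift right for a negative one.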
      round_shift_16bit(buf0, height, shift[0]);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit(buf0, height, shift[1]);
      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
    }

    for (int i = 0; i < 4; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit(buf, width, shift[2]);
      transpose_16bit_8x8(buf, buf);
      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
                                     8);
      transpose_16bit_8x8(buf + 16, buf + 16);
      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
                                     width, 8);
      transpose_16bit_8x8(buf + 24, buf + 24);
      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
                                     width, 8);
    }
  } else {
    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
  }
}

void av1_lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X16;
  int16x8_t buf0[64], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = fdct8x16_neon;
  const transform_1d_lbd_neon row_txfm = av1_fdct8x64_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < height_div8; ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

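  // Only the lowest 32 of the 64 row-transform outputs are kept (AV1 zeroes
  // the upper-half coefficients of 64-point transforms), hence the output
  // stride of 32 and the four-tile store below.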
  for (int i = 0; i < height_div8; i++) {
    int16x8_t *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit(buf, width, shift[2]);
    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < 4; ++j) {
      int16x8_t *buf8 = buf + 8 * j;
      transpose_16bit_8x8(buf8, buf8);
      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
    }
  }
}

void av1_lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_16X64;
  int16x8_t buf0[64], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
  const transform_1d_lbd_neon row_txfm = fdct8x16_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < height_div8; ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

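  // Only the top 16x32 half of the result is computed (at most four 8-row
  // blocks); the bottom half is zeroed after the loop.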
  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int16x8_t *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit(buf, width, shift[2]);
    int32_t *output8 = output + 8 * width * i;
    for (int j = 0; j < width_div8; ++j) {
      int16x8_t *buf8 = buf + 8 * j;
      transpose_16bit_8x8(buf8, buf8);
      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
    }
  }
  // Zero out the bottom 16x32 area.
  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}

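// Interleave a 4x4 tile of 32-bit lanes with two rounds of vzipq_s32. Note
// the two middle lanes of each output vector come out swapped relative to a
// true transpose, so callers pass the input rows in 0, 2, 1, 3 order to
// compensate.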
#define TRANSPOSE_4X4_L32(x0, x1, x2, x3, y0, y1, y2, y3)      \
  do {                                                         \
    int32x4x2_t temp01 = vzipq_s32(x0, x1);                    \
    int32x4x2_t temp23 = vzipq_s32(x2, x3);                    \
    int32x4x2_t y01 = vzipq_s32(temp01.val[0], temp23.val[0]); \
    int32x4x2_t y23 = vzipq_s32(temp01.val[1], temp23.val[1]); \
    y0 = y01.val[0];                                           \
    y1 = y01.val[1];                                           \
    y2 = y23.val[0];                                           \
    y3 = y23.val[1];                                           \
  } while (0)

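// Transposes two stacked 4x4 tiles (inputA above inputB) into eight strided
// output rows; the 0, 2, 1, 3 argument order undoes the lane swap in
// TRANSPOSE_4X4_L32 so the result is a true transpose.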
static INLINE void transpose_32_4x4x2(int stride, const int32x4_t *inputA,
                                      const int32x4_t *inputB,
                                      int32x4_t *output) {
  TRANSPOSE_4X4_L32(inputA[0], inputA[2], inputA[1], inputA[3],
                    output[0 * stride], output[1 * stride], output[2 * stride],
                    output[3 * stride]);
  TRANSPOSE_4X4_L32(inputB[0], inputB[2], inputB[1], inputB[3],
                    output[4 * stride], output[5 * stride], output[6 * stride],
                    output[7 * stride]);
}

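// 32-point forward DCT over int32x4 lanes, mirroring the stage structure of
// the scalar reference transform; butterflies use the btf_32_* helpers, and
// the strided start/end indices let the same routine serve both column and
// row passes.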
static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
                                int cos_bit, const int stride,
                                const int8_t *stage_range) {
  (void)stage_range;
  int32x4_t buf0[32];
  int32x4_t buf1[32];
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int startidx = 0 * stride;
  int endidx = 31 * stride;
  // stage 0
  // stage 1
  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
  buf1[16] = vsubq_s32(input[startidx], input[endidx]);

  // stage 2
  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
                    buf0[27], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
                    buf0[26], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
                    buf0[25], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
                    buf0[24], v_cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  btf_32_neon_mode0(cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
                    buf1[13], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
                    buf1[12], v_cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
  buf1[28] = vaddq_s32(buf0[28], buf0[27]);

  // stage 4
  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
  buf0[4] = buf1[4];
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
                    v_cos_bit);
  buf0[7] = buf1[7];
  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  btf_32_neon_mode0(cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
                    buf0[29], v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
                    buf0[28], v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], buf1[20], buf1[27], buf0[20],
                     buf0[27], v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], buf1[21], buf1[26], buf0[21],
                     buf0[26], v_cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  btf_32_neon(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
              v_cos_bit);
  btf_32_type1_neon(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
                    v_cos_bit);
  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
  buf1[8] = buf0[8];
  btf_32_neon_mode0(cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], buf0[10], buf0[13], buf1[10],
                     buf1[13], v_cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
  buf1[30] = vaddq_s32(buf0[30], buf0[29]);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  btf_32_type1_neon(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
                    v_cos_bit);
  btf_32_type1_neon(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
                    v_cos_bit);
  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
  buf0[16] = buf1[16];
  btf_32_neon_mode0(cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], buf0[30],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], buf1[18], buf1[29], buf0[18],
                     buf0[29], v_cos_bit);
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];
  btf_32_neon_mode0(cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
                    buf0[26], v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], buf1[22], buf1[25], buf0[22],
                     buf0[25], v_cos_bit);
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];

  btf_32_type1_neon(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
                    v_cos_bit);
  btf_32_type1_neon(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
                    v_cos_bit);
  btf_32_type1_neon(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
                    buf1[13], v_cos_bit);
  btf_32_type1_neon(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
                    buf1[12], v_cos_bit);
  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
  buf1[31] = vaddq_s32(buf0[31], buf0[30]);

  // stage 8
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];

  btf_32_type1_neon(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], buf0[31],
                    v_cos_bit);
  btf_32_type1_neon(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
                    buf0[30], v_cos_bit);
  btf_32_type1_neon(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
                    buf0[29], v_cos_bit);
  btf_32_type1_neon(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
                    buf0[28], v_cos_bit);
  btf_32_type1_neon(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
                    buf0[27], v_cos_bit);
  btf_32_type1_neon(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
                    buf0[26], v_cos_bit);
  btf_32_type1_neon(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
                    buf0[25], v_cos_bit);
  btf_32_type1_neon(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], buf0[24],
                    v_cos_bit);

  startidx = 0 * stride;
  endidx = 31 * stride;
  // stage 9
  output[startidx] = buf0[0];
  output[endidx] = buf0[31];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[16];
  output[endidx] = buf0[15];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[8];
  output[endidx] = buf0[23];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[24];
  output[endidx] = buf0[7];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[4];
  output[endidx] = buf0[27];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[20];
  output[endidx] = buf0[11];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[12];
  output[endidx] = buf0[19];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[28];
  output[endidx] = buf0[3];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[2];
  output[endidx] = buf0[29];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[18];
  output[endidx] = buf0[13];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[10];
  output[endidx] = buf0[21];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[26];
  output[endidx] = buf0[5];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[6];
  output[endidx] = buf0[25];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[22];
  output[endidx] = buf0[9];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[14];
  output[endidx] = buf0[17];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[30];
  output[endidx] = buf0[1];
}

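// Stages 1-4 of the 64-point forward DCT, split out from the main routine.
// The stage-3 and stage-4 results are returned through x3 and x4 for the
// remaining stages; startidx/endidx walk the strided input from both ends.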
av1_fdct64_new_stage1234_neon(int32x4_t * input,const int instride,int32x4_t * x3,int32x4_t * x4,const int32_t * cospi,const int32x4_t * v_cos_bit,int * startidx,int * endidx)3482 static void av1_fdct64_new_stage1234_neon(int32x4_t *input, const int instride,
3483                                           int32x4_t *x3, int32x4_t *x4,
3484                                           const int32_t *cospi,
3485                                           const int32x4_t *v_cos_bit,
3486                                           int *startidx, int *endidx) {
3487   // stage 1
3488   int32x4_t x1[64];
3489   x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
3490   x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
3491   *startidx += instride;
3492   *endidx -= instride;
3493   x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
3494   x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
3495   *startidx += instride;
3496   *endidx -= instride;
3497   x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
3498   x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
3499   *startidx += instride;
3500   *endidx -= instride;
3501   x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
3502   x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
3503   *startidx += instride;
3504   *endidx -= instride;
3505   x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
3506   x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
3507   *startidx += instride;
3508   *endidx -= instride;
3509   x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
3510   x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
3511   *startidx += instride;
3512   *endidx -= instride;
3513   x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
3514   x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
3515   *startidx += instride;
3516   *endidx -= instride;
3517   x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
3518   x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
3519   *startidx += instride;
3520   *endidx -= instride;
3521   x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
3522   x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
3523   *startidx += instride;
3524   *endidx -= instride;
3525   x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
3526   x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
3527   *startidx += instride;
3528   *endidx -= instride;
3529   x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
3530   x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
3531   *startidx += instride;
3532   *endidx -= instride;
3533   x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
3534   x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
3535   *startidx += instride;
3536   *endidx -= instride;
3537   x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
3538   x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
3539   *startidx += instride;
3540   *endidx -= instride;
3541   x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
3542   x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
3543   *startidx += instride;
3544   *endidx -= instride;
3545   x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
3546   x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
3547   *startidx += instride;
3548   *endidx -= instride;
3549   x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
3550   x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
3551   *startidx += instride;
3552   *endidx -= instride;
3553   x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
3554   x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
3555   *startidx += instride;
3556   *endidx -= instride;
3557   x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
3558   x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
3559   *startidx += instride;
3560   *endidx -= instride;
3561   x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
3562   x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
3563   *startidx += instride;
3564   *endidx -= instride;
3565   x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
3566   x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
3567   *startidx += instride;
3568   *endidx -= instride;
3569   x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
3570   x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
3571   *startidx += instride;
3572   *endidx -= instride;
3573   x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
3574   x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
3575   *startidx += instride;
3576   *endidx -= instride;
3577   x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
3578   x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
3579   *startidx += instride;
3580   *endidx -= instride;
3581   x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
3582   x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
3583   *startidx += instride;
3584   *endidx -= instride;
3585   x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
3586   x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
3587   *startidx += instride;
3588   *endidx -= instride;
3589   x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
3590   x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
3591   *startidx += instride;
3592   *endidx -= instride;
3593   x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
3594   x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
3595   *startidx += instride;
3596   *endidx -= instride;
3597   x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
3598   x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
3599   *startidx += instride;
3600   *endidx -= instride;
3601   x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
3602   x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
3603   *startidx += instride;
3604   *endidx -= instride;
3605   x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
3606   x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
3607   *startidx += instride;
3608   *endidx -= instride;
3609   x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
3610   x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
3611   *startidx += instride;
3612   *endidx -= instride;
3613   x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
3614   x1[32] = vsubq_s32(input[*startidx], input[*endidx]);
3615 
  // stage 2
  int32x4_t x2[64];
  x2[0] = vaddq_s32(x1[0], x1[31]);
  x2[31] = vsubq_s32(x1[0], x1[31]);
  x2[1] = vaddq_s32(x1[1], x1[30]);
  x2[30] = vsubq_s32(x1[1], x1[30]);
  x2[2] = vaddq_s32(x1[2], x1[29]);
  x2[29] = vsubq_s32(x1[2], x1[29]);
  x2[3] = vaddq_s32(x1[3], x1[28]);
  x2[28] = vsubq_s32(x1[3], x1[28]);
  x2[4] = vaddq_s32(x1[4], x1[27]);
  x2[27] = vsubq_s32(x1[4], x1[27]);
  x2[5] = vaddq_s32(x1[5], x1[26]);
  x2[26] = vsubq_s32(x1[5], x1[26]);
  x2[6] = vaddq_s32(x1[6], x1[25]);
  x2[25] = vsubq_s32(x1[6], x1[25]);
  x2[7] = vaddq_s32(x1[7], x1[24]);
  x2[24] = vsubq_s32(x1[7], x1[24]);
  x2[8] = vaddq_s32(x1[8], x1[23]);
  x2[23] = vsubq_s32(x1[8], x1[23]);
  x2[9] = vaddq_s32(x1[9], x1[22]);
  x2[22] = vsubq_s32(x1[9], x1[22]);
  x2[10] = vaddq_s32(x1[10], x1[21]);
  x2[21] = vsubq_s32(x1[10], x1[21]);
  x2[11] = vaddq_s32(x1[11], x1[20]);
  x2[20] = vsubq_s32(x1[11], x1[20]);
  x2[12] = vaddq_s32(x1[12], x1[19]);
  x2[19] = vsubq_s32(x1[12], x1[19]);
  x2[13] = vaddq_s32(x1[13], x1[18]);
  x2[18] = vsubq_s32(x1[13], x1[18]);
  x2[14] = vaddq_s32(x1[14], x1[17]);
  x2[17] = vsubq_s32(x1[14], x1[17]);
  x2[15] = vaddq_s32(x1[15], x1[16]);
  x2[16] = vsubq_s32(x1[15], x1[16]);

  btf_32_neon_mode0(cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
                    *v_cos_bit);

  // stage 3
  x3[0] = vaddq_s32(x2[0], x2[15]);
  x3[15] = vsubq_s32(x2[0], x2[15]);
  x3[1] = vaddq_s32(x2[1], x2[14]);
  x3[14] = vsubq_s32(x2[1], x2[14]);
  x3[2] = vaddq_s32(x2[2], x2[13]);
  x3[13] = vsubq_s32(x2[2], x2[13]);
  x3[3] = vaddq_s32(x2[3], x2[12]);
  x3[12] = vsubq_s32(x2[3], x2[12]);
  x3[4] = vaddq_s32(x2[4], x2[11]);
  x3[11] = vsubq_s32(x2[4], x2[11]);
  x3[5] = vaddq_s32(x2[5], x2[10]);
  x3[10] = vsubq_s32(x2[5], x2[10]);
  x3[6] = vaddq_s32(x2[6], x2[9]);
  x3[9] = vsubq_s32(x2[6], x2[9]);
  x3[7] = vaddq_s32(x2[7], x2[8]);
  x3[8] = vsubq_s32(x2[7], x2[8]);

  btf_32_neon_mode0(cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
                    *v_cos_bit);

  x3[32] = vaddq_s32(x1[32], x2[47]);
  x3[47] = vsubq_s32(x1[32], x2[47]);
  x3[33] = vaddq_s32(x1[33], x2[46]);
  x3[46] = vsubq_s32(x1[33], x2[46]);
  x3[34] = vaddq_s32(x1[34], x2[45]);
  x3[45] = vsubq_s32(x1[34], x2[45]);
  x3[35] = vaddq_s32(x1[35], x2[44]);
  x3[44] = vsubq_s32(x1[35], x2[44]);
  x3[36] = vaddq_s32(x1[36], x2[43]);
  x3[43] = vsubq_s32(x1[36], x2[43]);
  x3[37] = vaddq_s32(x1[37], x2[42]);
  x3[42] = vsubq_s32(x1[37], x2[42]);
  x3[38] = vaddq_s32(x1[38], x2[41]);
  x3[41] = vsubq_s32(x1[38], x2[41]);
  x3[39] = vaddq_s32(x1[39], x2[40]);
  x3[40] = vsubq_s32(x1[39], x2[40]);
  x3[48] = vsubq_s32(x1[63], x2[48]);
  x3[63] = vaddq_s32(x1[63], x2[48]);
  x3[49] = vsubq_s32(x1[62], x2[49]);
  x3[62] = vaddq_s32(x1[62], x2[49]);
  x3[50] = vsubq_s32(x1[61], x2[50]);
  x3[61] = vaddq_s32(x1[61], x2[50]);
  x3[51] = vsubq_s32(x1[60], x2[51]);
  x3[60] = vaddq_s32(x1[60], x2[51]);
  x3[52] = vsubq_s32(x1[59], x2[52]);
  x3[59] = vaddq_s32(x1[59], x2[52]);
  x3[53] = vsubq_s32(x1[58], x2[53]);
  x3[58] = vaddq_s32(x1[58], x2[53]);
  x3[54] = vsubq_s32(x1[57], x2[54]);
  x3[57] = vaddq_s32(x1[57], x2[54]);
  x3[55] = vsubq_s32(x1[56], x2[55]);
  x3[56] = vaddq_s32(x1[56], x2[55]);

  // stage 4
  x4[0] = vaddq_s32(x3[0], x3[7]);
  x4[7] = vsubq_s32(x3[0], x3[7]);
  x4[1] = vaddq_s32(x3[1], x3[6]);
  x4[6] = vsubq_s32(x3[1], x3[6]);
  x4[2] = vaddq_s32(x3[2], x3[5]);
  x4[5] = vsubq_s32(x3[2], x3[5]);
  x4[3] = vaddq_s32(x3[3], x3[4]);
  x4[4] = vsubq_s32(x3[3], x3[4]);

  btf_32_neon_mode0(cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
                    *v_cos_bit);

  x4[16] = vaddq_s32(x2[16], x3[23]);
  x4[23] = vsubq_s32(x2[16], x3[23]);
  x4[17] = vaddq_s32(x2[17], x3[22]);
  x4[22] = vsubq_s32(x2[17], x3[22]);
  x4[18] = vaddq_s32(x2[18], x3[21]);
  x4[21] = vsubq_s32(x2[18], x3[21]);
  x4[19] = vaddq_s32(x2[19], x3[20]);
  x4[20] = vsubq_s32(x2[19], x3[20]);
  x4[24] = vsubq_s32(x2[31], x3[24]);
  x4[31] = vaddq_s32(x2[31], x3[24]);
  x4[25] = vsubq_s32(x2[30], x3[25]);
  x4[30] = vaddq_s32(x2[30], x3[25]);
  x4[26] = vsubq_s32(x2[29], x3[26]);
  x4[29] = vaddq_s32(x2[29], x3[26]);
  x4[27] = vsubq_s32(x2[28], x3[27]);
  x4[28] = vaddq_s32(x2[28], x3[27]);

  btf_32_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
                    *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
                     *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
                     *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
                     *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
                     *v_cos_bit);
}

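/*
 * 64-point forward DCT. Each int32x4_t lane carries an independent column,
 * so one call transforms four columns at once. Stages 1-4 are handled by
 * av1_fdct64_new_stage1234_neon() above; stages 5-11 follow inline. As a
 * rough reading of the shared butterfly helpers (not a normative statement
 * of the AV1 spec), the btf_32_* variants implement the planar rotation
 *   out0 = (w0 * in0 + w1 * in1) >> cos_bit
 *   out1 = (w0 * in1 - w1 * in0) >> cos_bit
 * and differ only in which operands are negated.
 */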
static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
                                int8_t cos_bit, const int instride,
                                const int outstride,
                                const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int startidx = 0 * instride;
  int endidx = 63 * instride;

  // stage 1-2-3-4
  int32x4_t x3[64], x4[64];
  av1_fdct64_new_stage1234_neon(input, instride, x3, x4, cospi, &v_cos_bit,
                                &startidx, &endidx);

  // stage 5
  int32x4_t x5[64];
  x5[0] = vaddq_s32(x4[0], x4[3]);
  x5[3] = vsubq_s32(x4[0], x4[3]);
  x5[1] = vaddq_s32(x4[1], x4[2]);
  x5[2] = vsubq_s32(x4[1], x4[2]);

  btf_32_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
                    v_cos_bit);

  x5[8] = vaddq_s32(x3[8], x4[11]);
  x5[11] = vsubq_s32(x3[8], x4[11]);
  x5[9] = vaddq_s32(x3[9], x4[10]);
  x5[10] = vsubq_s32(x3[9], x4[10]);
  x5[12] = vsubq_s32(x3[15], x4[12]);
  x5[15] = vaddq_s32(x3[15], x4[12]);
  x5[13] = vsubq_s32(x3[14], x4[13]);
  x5[14] = vaddq_s32(x3[14], x4[13]);

  btf_32_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
                    v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
                     v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
                     v_cos_bit);

  x5[32] = vaddq_s32(x3[32], x4[39]);
  x5[39] = vsubq_s32(x3[32], x4[39]);
  x5[33] = vaddq_s32(x3[33], x4[38]);
  x5[38] = vsubq_s32(x3[33], x4[38]);
  x5[34] = vaddq_s32(x3[34], x4[37]);
  x5[37] = vsubq_s32(x3[34], x4[37]);
  x5[35] = vaddq_s32(x3[35], x4[36]);
  x5[36] = vsubq_s32(x3[35], x4[36]);
  x5[40] = vsubq_s32(x3[47], x4[40]);
  x5[47] = vaddq_s32(x3[47], x4[40]);
  x5[41] = vsubq_s32(x3[46], x4[41]);
  x5[46] = vaddq_s32(x3[46], x4[41]);
  x5[42] = vsubq_s32(x3[45], x4[42]);
  x5[45] = vaddq_s32(x3[45], x4[42]);
  x5[43] = vsubq_s32(x3[44], x4[43]);
  x5[44] = vaddq_s32(x3[44], x4[43]);
  x5[48] = vaddq_s32(x3[48], x4[55]);
  x5[55] = vsubq_s32(x3[48], x4[55]);
  x5[49] = vaddq_s32(x3[49], x4[54]);
  x5[54] = vsubq_s32(x3[49], x4[54]);
  x5[50] = vaddq_s32(x3[50], x4[53]);
  x5[53] = vsubq_s32(x3[50], x4[53]);
  x5[51] = vaddq_s32(x3[51], x4[52]);
  x5[52] = vsubq_s32(x3[51], x4[52]);
  x5[56] = vsubq_s32(x3[63], x4[56]);
  x5[63] = vaddq_s32(x3[63], x4[56]);
  x5[57] = vsubq_s32(x3[62], x4[57]);
  x5[62] = vaddq_s32(x3[62], x4[57]);
  x5[58] = vsubq_s32(x3[61], x4[58]);
  x5[61] = vaddq_s32(x3[61], x4[58]);
  x5[59] = vsubq_s32(x3[60], x4[59]);
  x5[60] = vaddq_s32(x3[60], x4[59]);

  // stage 6
  int32x4_t x6[64];
  btf_32_neon(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1], v_cos_bit);
  btf_32_type1_neon(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
                    v_cos_bit);
  x6[4] = vaddq_s32(x4[4], x5[5]);
  x6[5] = vsubq_s32(x4[4], x5[5]);
  x6[6] = vsubq_s32(x4[7], x5[6]);
  x6[7] = vaddq_s32(x4[7], x5[6]);
  btf_32_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
                     v_cos_bit);

  x6[16] = vaddq_s32(x4[16], x5[19]);
  x6[19] = vsubq_s32(x4[16], x5[19]);
  x6[17] = vaddq_s32(x4[17], x5[18]);
  x6[18] = vsubq_s32(x4[17], x5[18]);
  x6[20] = vsubq_s32(x4[23], x5[20]);
  x6[23] = vaddq_s32(x4[23], x5[20]);
  x6[21] = vsubq_s32(x4[22], x5[21]);
  x6[22] = vaddq_s32(x4[22], x5[21]);
  x6[24] = vaddq_s32(x4[24], x5[27]);
  x6[27] = vsubq_s32(x4[24], x5[27]);
  x6[25] = vaddq_s32(x4[25], x5[26]);
  x6[26] = vsubq_s32(x4[25], x5[26]);
  x6[28] = vsubq_s32(x4[31], x5[28]);
  x6[31] = vaddq_s32(x4[31], x5[28]);
  x6[29] = vsubq_s32(x4[30], x5[29]);
  x6[30] = vaddq_s32(x4[30], x5[29]);

  btf_32_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
                    v_cos_bit);
  btf_32_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
                     v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
                    v_cos_bit);
  btf_32_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
                     v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
                     v_cos_bit);

  // stage 7
  int32x4_t x7[64];

  btf_32_type1_neon(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
  btf_32_type1_neon(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
                    v_cos_bit);
  x7[8] = vaddq_s32(x5[8], x6[9]);
  x7[9] = vsubq_s32(x5[8], x6[9]);
  x7[10] = vsubq_s32(x5[11], x6[10]);
  x7[11] = vaddq_s32(x5[11], x6[10]);
  x7[12] = vaddq_s32(x5[12], x6[13]);
  x7[13] = vsubq_s32(x5[12], x6[13]);
  x7[14] = vsubq_s32(x5[15], x6[14]);
  x7[15] = vaddq_s32(x5[15], x6[14]);

  btf_32_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
                     v_cos_bit);

  btf_32_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
                     v_cos_bit);

  x7[32] = vaddq_s32(x5[32], x6[35]);
  x7[35] = vsubq_s32(x5[32], x6[35]);
  x7[33] = vaddq_s32(x5[33], x6[34]);
  x7[34] = vsubq_s32(x5[33], x6[34]);
  x7[36] = vsubq_s32(x5[39], x6[36]);
  x7[39] = vaddq_s32(x5[39], x6[36]);
  x7[37] = vsubq_s32(x5[38], x6[37]);
  x7[38] = vaddq_s32(x5[38], x6[37]);
  x7[40] = vaddq_s32(x5[40], x6[43]);
  x7[43] = vsubq_s32(x5[40], x6[43]);
  x7[41] = vaddq_s32(x5[41], x6[42]);
  x7[42] = vsubq_s32(x5[41], x6[42]);
  x7[44] = vsubq_s32(x5[47], x6[44]);
  x7[47] = vaddq_s32(x5[47], x6[44]);
  x7[45] = vsubq_s32(x5[46], x6[45]);
  x7[46] = vaddq_s32(x5[46], x6[45]);
  x7[48] = vaddq_s32(x5[48], x6[51]);
  x7[51] = vsubq_s32(x5[48], x6[51]);
  x7[49] = vaddq_s32(x5[49], x6[50]);
  x7[50] = vsubq_s32(x5[49], x6[50]);
  x7[52] = vsubq_s32(x5[55], x6[52]);
  x7[55] = vaddq_s32(x5[55], x6[52]);
  x7[53] = vsubq_s32(x5[54], x6[53]);
  x7[54] = vaddq_s32(x5[54], x6[53]);
  x7[56] = vaddq_s32(x5[56], x6[59]);
  x7[59] = vsubq_s32(x5[56], x6[59]);
  x7[57] = vaddq_s32(x5[57], x6[58]);
  x7[58] = vsubq_s32(x5[57], x6[58]);
  x7[60] = vsubq_s32(x5[63], x6[60]);
  x7[63] = vaddq_s32(x5[63], x6[60]);
  x7[61] = vsubq_s32(x5[62], x6[61]);
  x7[62] = vaddq_s32(x5[62], x6[61]);

  // stage 8
  int32x4_t x8[64];

  btf_32_type1_neon(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
                    v_cos_bit);
  btf_32_type1_neon(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
                    v_cos_bit);
  btf_32_type1_neon(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
                    v_cos_bit);
  btf_32_type1_neon(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
                    v_cos_bit);
  x8[16] = vaddq_s32(x6[16], x7[17]);
  x8[17] = vsubq_s32(x6[16], x7[17]);
  x8[18] = vsubq_s32(x6[19], x7[18]);
  x8[19] = vaddq_s32(x6[19], x7[18]);
  x8[20] = vaddq_s32(x6[20], x7[21]);
  x8[21] = vsubq_s32(x6[20], x7[21]);
  x8[22] = vsubq_s32(x6[23], x7[22]);
  x8[23] = vaddq_s32(x6[23], x7[22]);
  x8[24] = vaddq_s32(x6[24], x7[25]);
  x8[25] = vsubq_s32(x6[24], x7[25]);
  x8[26] = vsubq_s32(x6[27], x7[26]);
  x8[27] = vaddq_s32(x6[27], x7[26]);
  x8[28] = vaddq_s32(x6[28], x7[29]);
  x8[29] = vsubq_s32(x6[28], x7[29]);
  x8[30] = vsubq_s32(x6[31], x7[30]);
  x8[31] = vaddq_s32(x6[31], x7[30]);

  btf_32_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
                     v_cos_bit);

  // stage 9
  int32x4_t x9[64];

  btf_32_type1_neon(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
                    v_cos_bit);
  btf_32_type1_neon(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
                    v_cos_bit);
  btf_32_type1_neon(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
                    v_cos_bit);
  btf_32_type1_neon(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
                    v_cos_bit);
  btf_32_type1_neon(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
                    v_cos_bit);
  btf_32_type1_neon(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
                    v_cos_bit);
  btf_32_type1_neon(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
                    v_cos_bit);
  btf_32_type1_neon(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
                    v_cos_bit);
  x9[32] = vaddq_s32(x7[32], x8[33]);
  x9[33] = vsubq_s32(x7[32], x8[33]);
  x9[34] = vsubq_s32(x7[35], x8[34]);
  x9[35] = vaddq_s32(x7[35], x8[34]);
  x9[36] = vaddq_s32(x7[36], x8[37]);
  x9[37] = vsubq_s32(x7[36], x8[37]);
  x9[38] = vsubq_s32(x7[39], x8[38]);
  x9[39] = vaddq_s32(x7[39], x8[38]);
  x9[40] = vaddq_s32(x7[40], x8[41]);
  x9[41] = vsubq_s32(x7[40], x8[41]);
  x9[42] = vsubq_s32(x7[43], x8[42]);
  x9[43] = vaddq_s32(x7[43], x8[42]);
  x9[44] = vaddq_s32(x7[44], x8[45]);
  x9[45] = vsubq_s32(x7[44], x8[45]);
  x9[46] = vsubq_s32(x7[47], x8[46]);
  x9[47] = vaddq_s32(x7[47], x8[46]);
  x9[48] = vaddq_s32(x7[48], x8[49]);
  x9[49] = vsubq_s32(x7[48], x8[49]);
  x9[50] = vsubq_s32(x7[51], x8[50]);
  x9[51] = vaddq_s32(x7[51], x8[50]);
  x9[52] = vaddq_s32(x7[52], x8[53]);
  x9[53] = vsubq_s32(x7[52], x8[53]);
  x9[54] = vsubq_s32(x7[55], x8[54]);
  x9[55] = vaddq_s32(x7[55], x8[54]);
  x9[56] = vaddq_s32(x7[56], x8[57]);
  x9[57] = vsubq_s32(x7[56], x8[57]);
  x9[58] = vsubq_s32(x7[59], x8[58]);
  x9[59] = vaddq_s32(x7[59], x8[58]);
  x9[60] = vaddq_s32(x7[60], x8[61]);
  x9[61] = vsubq_s32(x7[60], x8[61]);
  x9[62] = vsubq_s32(x7[63], x8[62]);
  x9[63] = vaddq_s32(x7[63], x8[62]);

  // stage 10
  int32x4_t x10[64];

  btf_32_type1_neon(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
                    v_cos_bit);
  btf_32_type1_neon(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
                    v_cos_bit);
  btf_32_type1_neon(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
                    v_cos_bit);
  btf_32_type1_neon(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
                    v_cos_bit);
  btf_32_type1_neon(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
                    v_cos_bit);
  btf_32_type1_neon(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
                    v_cos_bit);
  btf_32_type1_neon(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
                    v_cos_bit);
  btf_32_type1_neon(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
                    v_cos_bit);
  btf_32_type1_neon(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
                    v_cos_bit);
  btf_32_type1_neon(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
                    v_cos_bit);
  btf_32_type1_neon(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
                    v_cos_bit);
  btf_32_type1_neon(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
                    v_cos_bit);
  btf_32_type1_neon(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
                    v_cos_bit);
  btf_32_type1_neon(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
                    v_cos_bit);
  btf_32_type1_neon(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
                    v_cos_bit);
  btf_32_type1_neon(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
                    v_cos_bit);

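  // stage 11: gather the per-stage results into natural frequency order,
  // writing from both ends of the output at once (startidx walks forward,
  // endidx walks backward, each by outstride).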
  startidx = 0 * outstride;
  endidx = 63 * outstride;
  output[startidx] = x6[0];
  output[endidx] = x10[63];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[32];
  output[endidx] = x9[31];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[16];
  output[endidx] = x10[47];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[48];
  output[endidx] = x8[15];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[8];
  output[endidx] = x10[55];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[40];
  output[endidx] = x9[23];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[24];
  output[endidx] = x10[39];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[56];
  output[endidx] = x7[7];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x7[4];
  output[endidx] = x10[59];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[36];
  output[endidx] = x9[27];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[20];
  output[endidx] = x10[43];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[52];
  output[endidx] = x8[11];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[12];
  output[endidx] = x10[51];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[44];
  output[endidx] = x9[19];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[28];
  output[endidx] = x10[35];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[60];
  output[endidx] = x6[3];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x6[2];
  output[endidx] = x10[61];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[34];
  output[endidx] = x9[29];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[18];
  output[endidx] = x10[45];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[50];
  output[endidx] = x8[13];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[10];
  output[endidx] = x10[53];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[42];
  output[endidx] = x9[21];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[26];
  output[endidx] = x10[37];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[58];
  output[endidx] = x7[5];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x7[6];
  output[endidx] = x10[57];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[38];
  output[endidx] = x9[25];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[22];
  output[endidx] = x10[41];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[54];
  output[endidx] = x8[9];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[14];
  output[endidx] = x10[49];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[46];
  output[endidx] = x9[17];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[30];
  output[endidx] = x10[33];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[62];
  output[endidx] = x6[1];
}

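/*
 * 2D 64x64 lowbd forward transform (DCT_DCT only). Columns are transformed
 * eight at a time in 16-bit precision and transposed into buf1; rows are
 * then transformed in 32-bit precision. Only the low-frequency 32x32 corner
 * of the result is emitted, matching AV1's zeroing of the upper coefficients
 * for 64-point transforms.
 */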
static void av1_lowbd_fwd_txfm2d_64x64_neon(const int16_t *input,
                                            int32_t *output, int stride,
                                            TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X64;
  int16x8_t buf0[64], buf1[512];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }
  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + width * i;
    for (int j = 0; j < width; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
    av1_round_shift_array_32_neon(bufA, bufA, 32);
    av1_round_shift_array_32_neon(bufB, bufB, 32);

    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < width_div8; ++j) {
      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
    }
  }
}
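
/*
 * 2D 64x32 lowbd forward transform: 32-point columns (16-bit), 64-point rows
 * (32-bit). Rectangular 2:1 blocks take the extra sqrt(2)-based rescaling
 * step via av1_round_shift_rect_array_32_neon().
 */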
static void av1_lowbd_fwd_txfm2d_64x32_neon(const int16_t *input,
                                            int32_t *output, int stride,
                                            TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_64X32;
  int16x8_t buf0[64], buf1[256];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }
  assert(tx_type == DCT_DCT);
  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + width * i;
    for (int j = 0; j < width; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);

    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < width_div8; ++j) {
      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
    }
  }
}

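/*
 * 2D 32x64 lowbd forward transform: the mirror of the 64x32 case above, with
 * 64-point columns and 32-point rows, and the same rectangular rescaling.
 */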
static void av1_lowbd_fwd_txfm2d_32x64_neon(const int16_t *input,
                                            int32_t *output, int stride,
                                            TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_32X64;
  int16x8_t buf0[64], buf1[256];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int32x4_t bufA[32];
    int32x4_t bufB[32];
    int16x8_t *buf = buf1 + width * i;
    for (int j = 0; j < width; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    av1_fdct32_new_neon(bufA, bufA, cos_bit_row, 1, NULL);
    av1_fdct32_new_neon(bufB, bufB, cos_bit_row, 1, NULL);
    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);

    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < (32 / 4); ++j) {
      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
    }
  }
}

static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
  av1_lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
  av1_lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
  av1_lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
  av1_lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
  av1_lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
  av1_lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
  av1_lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
  av1_lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
  av1_lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
  av1_lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
  av1_lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
  av1_lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
  av1_lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
  av1_lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
  av1_lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
  av1_lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
  av1_lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
  av1_lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
  av1_lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
};

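/*
 * Dispatch entry point for the lowbd forward transforms. Lossless 4x4 blocks
 * use the Walsh-Hadamard path, which has no NEON implementation here, so they
 * (and any NULL table entry) fall back to the generic C routine. Illustrative
 * call, with hypothetical surrounding values:
 *
 *   TxfmParam param;
 *   param.tx_type = DCT_DCT;
 *   param.tx_size = TX_8X8;
 *   param.lossless = 0;
 *   param.bd = 8;
 *   av1_lowbd_fwd_txfm_neon(src_diff, coeff, diff_stride, &param);
 */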
void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
  if ((fwd_txfm2d_func == NULL) ||
      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
  } else {
    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
                    txfm_param->bd);
  }
}