/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom_dsp/txfm_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "config/aom_config.h"
#include "config/av1_rtcd.h"

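// Packs a pair of int32x4_t vectors into one int16x8_t with saturating
// narrowing: lanes of w0 land in the low half, lanes of w1 in the high half.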
#define custom_packs_s32(w0, w1) vcombine_s16(vqmovn_s32(w0), vqmovn_s32(w1))

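// In-register transposes. transpose_16bit_MxN() treats in[] as an MxN tile
// of 16-bit values, one row per vector, and writes the transposed tile to
// out[]; for the 4-wide variants only the low four lanes of each vector are
// meaningful. Lane view of the 4x4 case:
//
//   in[0]: 00 01 02 03 ....        out[0]: 00 10 20 30 ....
//   in[1]: 10 11 12 13 ....   ->   out[1]: 01 11 21 31 ....
//   in[2]: 20 21 22 23 ....        out[2]: 02 12 22 32 ....
//   in[3]: 30 31 32 33 ....        out[3]: 03 13 23 33 ....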
static INLINE void transpose_16bit_4x4(const int16x8_t *const in,
                                       int16x8_t *const out) {
#if defined(__aarch64__)
  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
#else
  int16x4x2_t temp;
  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  int32x4x2_t a01 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  out[0] = vreinterpretq_s16_s32(a01.val[0]);
  out[1] = vextq_s16(vreinterpretq_s16_s32(a01.val[0]), out[1], 4);
  out[2] = vreinterpretq_s16_s32(a01.val[1]);
  out[3] = vextq_s16(vreinterpretq_s16_s32(a01.val[1]), out[3], 4);
}

static INLINE void transpose_16bit_4x8(const int16x8_t *const in,
                                       int16x8_t *const out) {
#if defined(__aarch64__)
  const int16x8_t a0 = vzip1q_s16(in[0], in[1]);
  const int16x8_t a1 = vzip1q_s16(in[2], in[3]);
  const int16x8_t a2 = vzip1q_s16(in[4], in[5]);
  const int16x8_t a3 = vzip1q_s16(in[6], in[7]);
#else
  int16x4x2_t temp;
  temp = vzip_s16(vget_low_s16(in[0]), vget_low_s16(in[1]));
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[2]), vget_low_s16(in[3]));
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[4]), vget_low_s16(in[5]));
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(vget_low_s16(in[6]), vget_low_s16(in[7]));
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if defined(__aarch64__)
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

static INLINE void transpose_16bit_8x4(const int16x8_t *const in,
                                       int16x8_t *const out) {
  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);

  const int32x4x2_t b01 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
                                    vreinterpretq_s32_s16(a15.val[0]));
  const int32x4x2_t b45 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
                                    vreinterpretq_s32_s16(a15.val[1]));

  const int32x4_t zeros = vdupq_n_s32(0);

#if defined(__aarch64__)
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b01.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b01.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[0]),
                                            vreinterpretq_s64_s32(zeros)));
  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b45.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b45.val[1]),
                                            vreinterpretq_s64_s32(zeros)));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b01.val[0], b01.val[0], 2), zeros, 2));
  out[1] = vreinterpretq_s16_s32(vextq_s32(b01.val[0], zeros, 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b01.val[1], b01.val[1], 2), zeros, 2));
  out[3] = vreinterpretq_s16_s32(vextq_s32(b01.val[1], zeros, 2));
  out[4] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b45.val[0], b45.val[0], 2), zeros, 2));
  out[5] = vreinterpretq_s16_s32(vextq_s32(b45.val[0], zeros, 2));
  out[6] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b45.val[1], b45.val[1], 2), zeros, 2));
  out[7] = vreinterpretq_s16_s32(vextq_s32(b45.val[1], zeros, 2));
#endif
}

static INLINE void transpose_16bit_8x8(const int16x8_t *const in,
                                       int16x8_t *const out) {
  const int16x8x2_t a04 = vzipq_s16(in[0], in[1]);
  const int16x8x2_t a15 = vzipq_s16(in[2], in[3]);
  const int16x8x2_t a26 = vzipq_s16(in[4], in[5]);
  const int16x8x2_t a37 = vzipq_s16(in[6], in[7]);

  const int32x4x2_t b04 = vzipq_s32(vreinterpretq_s32_s16(a04.val[0]),
                                    vreinterpretq_s32_s16(a15.val[0]));
  const int32x4x2_t b15 = vzipq_s32(vreinterpretq_s32_s16(a26.val[0]),
                                    vreinterpretq_s32_s16(a37.val[0]));
  const int32x4x2_t b26 = vzipq_s32(vreinterpretq_s32_s16(a04.val[1]),
                                    vreinterpretq_s32_s16(a15.val[1]));
  const int32x4x2_t b37 = vzipq_s32(vreinterpretq_s32_s16(a26.val[1]),
                                    vreinterpretq_s32_s16(a37.val[1]));

#if defined(__aarch64__)
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[0]),
                                            vreinterpretq_s64_s32(b15.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[0]),
                                            vreinterpretq_s64_s32(b15.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b04.val[1]),
                                            vreinterpretq_s64_s32(b15.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b04.val[1]),
                                            vreinterpretq_s64_s32(b15.val[1])));
  out[4] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[0]),
                                            vreinterpretq_s64_s32(b37.val[0])));
  out[5] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[0]),
                                            vreinterpretq_s64_s32(b37.val[0])));
  out[6] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b26.val[1]),
                                            vreinterpretq_s64_s32(b37.val[1])));
  out[7] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b26.val[1]),
                                            vreinterpretq_s64_s32(b37.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b04.val[0], b04.val[0], 2), b15.val[0], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b04.val[0], vextq_s32(b15.val[0], b15.val[0], 2), 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b04.val[1], b04.val[1], 2), b15.val[1], 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b04.val[1], vextq_s32(b15.val[1], b15.val[1], 2), 2));
  out[4] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b26.val[0], b26.val[0], 2), b37.val[0], 2));
  out[5] = vreinterpretq_s16_s32(
      vextq_s32(b26.val[0], vextq_s32(b37.val[0], b37.val[0], 2), 2));
  out[6] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b26.val[1], b26.val[1], 2), b37.val[1], 2));
  out[7] = vreinterpretq_s16_s32(
      vextq_s32(b26.val[1], vextq_s32(b37.val[1], b37.val[1], 2), 2));
#endif
}

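// Scaling for rectangular transform sizes: divide by 4 and multiply by
// sqrt(2), both with rounding, i.e.
//   output[i] = round(round(input[i] / 4) * NewSqrt2 / 2^NewSqrt2Bits)
// where NewSqrt2 / 2^NewSqrt2Bits ~= sqrt(2) (constants from
// "av1/common/av1_txfm.h").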
static INLINE void av1_round_shift_rect_array_32_neon(int32x4_t *input,
                                                      int32x4_t *output,
                                                      const int size) {
  int i;
  for (i = 0; i < size; i++) {
    output[i] = vrshrq_n_s32(vmulq_n_s32(vrshrq_n_s32(input[i], 2), NewSqrt2),
                             NewSqrt2Bits);
  }
}

static INLINE void av1_round_shift_array_32_neon(int32x4_t *input,
                                                 int32x4_t *output,
                                                 const int size) {
  int i;
  for (i = 0; i < size; i++) output[i] = vrshrq_n_s32(input[i], 2);
}

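// 32-bit butterfly (2x2 rotation) macros. v_cos_bit must hold -cos_bit so
// that vrshlq_s32() performs a rounding right shift by cos_bit:
//   round_shift(x) = (x + (1 << (cos_bit - 1))) >> cos_bit
// btf_32_neon computes
//   out0 = round_shift(w0 * in0 + w1 * in1)
//   out1 = round_shift(w1 * in0 - w0 * in1)
// and the mode0/mode01 variants flip the signs of selected products to
// match butterflies whose weights are negated in the scalar 1D transforms.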
#define btf_32_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do { \
    out0 = vmulq_n_s32(in0, w0); \
    out0 = vmlaq_n_s32(out0, in1, w1); \
    out0 = vrshlq_s32(out0, v_cos_bit); \
    out1 = vmulq_n_s32(in0, w1); \
    out1 = vmlsq_n_s32(out1, in1, w0); \
    out1 = vrshlq_s32(out1, v_cos_bit); \
  } while (0)

#define btf_32_type1_neon(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do { \
    btf_32_neon(w1, w0, in1, in0, out0, out1, v_cos_bit); \
  } while (0)

#define btf_32_neon_mode0(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do { \
    out0 = vmulq_n_s32(in1, w1); \
    out0 = vmlsq_n_s32(out0, in0, w0); \
    out0 = vrshlq_s32(out0, v_cos_bit); \
    out1 = vmulq_n_s32(in0, w1); \
    out1 = vmlaq_n_s32(out1, in1, w0); \
    out1 = vrshlq_s32(out1, v_cos_bit); \
  } while (0)

#define btf_32_neon_mode01(w0, w1, in0, in1, out0, out1, v_cos_bit) \
  do { \
    out0 = vmulq_n_s32(in1, w1); \
    out0 = vmlaq_n_s32(out0, in0, w0); \
    out0 = vrshlq_s32(vnegq_s32(out0), v_cos_bit); \
    out1 = vmulq_n_s32(in1, w0); \
    out1 = vmlsq_n_s32(out1, in0, w1); \
    out1 = vrshlq_s32(out1, v_cos_bit); \
  } while (0)

static INLINE void flip_buf_neon(int16x8_t *in, int16x8_t *out, int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

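// Widening stores: sign-extend 16-bit results into the int32_t output
// buffer. The _rect variants scale by sqrt(2) on the fly,
//   b[i] = round_shift(a[i] * NewSqrt2, NewSqrt2Bits),
// with *v_newsqrt2bits expected to hold -NewSqrt2Bits (see the rounding
// shift note above).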
static INLINE void store_16bit_to_32bit_w4(const int16x8_t a,
                                           int32_t *const b) {
  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
}

static INLINE void store_16bit_to_32bit(int16x8_t a, int32_t *b) {
  vst1q_s32(b, vmovl_s16(vget_low_s16(a)));
  vst1q_s32((b + 4), vmovl_s16(vget_high_s16(a)));
}

static INLINE void store_rect_16bit_to_32bit_w4(
    const int16x8_t a, int32_t *const b, const int16x4_t *v_newsqrt2,
    const int32x4_t *v_newsqrt2bits) {
  const int32x4_t b_lo =
      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
  vst1q_s32(b, b_lo);
}

static INLINE void store_rect_16bit_to_32bit(const int16x8_t a,
                                             int32_t *const b,
                                             const int16x4_t *v_newsqrt2,
                                             const int32x4_t *v_newsqrt2bits) {
  const int32x4_t b_lo =
      vrshlq_s32(vmull_s16(vget_low_s16(a), *v_newsqrt2), *v_newsqrt2bits);
  const int32x4_t b_hi =
      vrshlq_s32(vmull_s16(vget_high_s16(a), *v_newsqrt2), *v_newsqrt2bits);
  vst1q_s32(b, b_lo);
  vst1q_s32((b + 4), b_hi);
}

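// Loads out_size rows of coefficients, one vector per row. The _w4 variants
// load only four values into the low half of each vector and leave the high
// lanes untouched; the _flip variants reverse the row order, which is how
// the vertically flipped (FLIPADST) transform types are realized.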
static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int stride,
                                                 int16x8_t *const out,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i)
    out[i] = vreinterpretq_s16_u64(vld1q_lane_u64(
        (uint64_t *)(in + i * stride), vreinterpretq_u64_s16(out[i]), 0));
}

static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int stride,
                                                      int16x8_t *const out,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i)
    out[out_size - i - 1] = vreinterpretq_s16_u64(
        vld1q_lane_u64((uint64_t *)(in + i * stride),
                       vreinterpretq_u64_s16(out[out_size - i - 1]), 0));
}

static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
                                              int16x8_t *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = vld1q_s16(in + i * stride);
  }
}

static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                   int stride, int16x8_t *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = vld1q_s16(in + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w4(const int16x8_t *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit(in[i], out + i * stride);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w4(
    const int16x8_t *const in, int32_t *const out, const int stride,
    const int out_size) {
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit_w4(in[i], out + i * stride, &v_newsqrt2,
                                 &v_newsqrt2bits);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w8(
    const int16x8_t *const in, int32_t *const out, const int stride,
    const int out_size) {
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  const int32x4_t v_newsqrt2bits = vdupq_n_s32(-NewSqrt2Bits);
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit(in[i], out + i * stride, &v_newsqrt2,
                              &v_newsqrt2bits);
  }
}

static INLINE void round_shift_16bit(int16x8_t *in, int size, int bit) {
  const int16x8_t vbit = vdupq_n_s16(bit);
  for (int i = 0; i < size; ++i) {
    in[i] = vrshlq_s16(in[i], vbit);
  }
}

static INLINE void round_shift_16bit_vector(int16x8_t *in, int size,
                                            const int16x8_t *v_bit) {
  for (int i = 0; i < size; ++i) {
    in[i] = vrshlq_s16(in[i], *v_bit);
  }
}

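// 4-point forward ADST on the low halves of input[0..3], using the sinpi
// constants from sinpi_arr(). The results are packed two rows per vector
// and then half-rotated so that row i ends up in the low half of output[i].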
void av1_fadst4x4_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *sinpi = sinpi_arr(cos_bit);

  int32x4_t u[6], v[6];

  u[0] = vmovl_s16(vget_low_s16(input[0]));
  u[1] = vmovl_s16(vget_low_s16(input[1]));
  u[2] = vmovl_s16(vget_low_s16(input[2]));
  u[3] = vmovl_s16(vget_low_s16(input[3]));
  u[4] = vaddq_s32(u[0], u[1]);
  v[5] = vmulq_n_s32(u[2], sinpi[3]);
  v[0] = vmulq_n_s32(u[1], sinpi[2]);
  v[0] = vmlaq_n_s32(v[0], u[0], sinpi[1]);
  v[1] = vmlaq_n_s32(v[5], u[3], sinpi[4]);
  v[2] = vmulq_n_s32(u[4], sinpi[3]);
  v[3] = vmulq_n_s32(u[0], sinpi[4]);
  v[3] = vmlsq_n_s32(v[3], u[1], sinpi[1]);
  v[4] = vmlsq_n_s32(v[5], u[3], sinpi[2]);

  u[0] = vaddq_s32(v[0], v[1]);
  u[1] = vmlsq_n_s32(v[2], u[3], sinpi[3]);
  u[2] = vsubq_s32(v[3], v[4]);
  u[3] = vsubq_s32(u[2], u[0]);
  u[5] = vmlaq_n_s32(u[3], v[5], 3);

  int32x4_t vshift = vdupq_n_s32(-cos_bit);
  u[0] = vrshlq_s32(u[0], vshift);
  u[1] = vrshlq_s32(u[1], vshift);
  u[2] = vrshlq_s32(u[2], vshift);
  u[3] = vrshlq_s32(u[5], vshift);

  output[0] = custom_packs_s32(u[0], u[2]);
  output[1] = custom_packs_s32(u[1], u[3]);
  output[2] = vextq_s16(output[0], output[0], 4);
  output[3] = vextq_s16(output[1], output[1], 4);
}

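// 4-lane (w4) 16-bit butterflies: widen the low halves, rotate, round by
// cos_bit and saturating-narrow back to 16 bits. The high halves of
// out0/out1 are padding that callers never read. The mode suffix selects
// the sign pattern of the rotation; e.g. mode3 computes
//   out0 = round_shift(w0_l * in0 + w0_h * in1)
//   out1 = round_shift(w0_h * in0 - w0_l * in1)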
#define btf_16_w4_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1, \
                       v_cos_bit) \
  { \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l); \
    u0 = vmlaq_n_s32(u0, in1_l, w0_h); \
    int32x4_t v0 = vmulq_n_s32(in0_l, w1_l); \
    v0 = vmlaq_n_s32(v0, in1_l, w1_h); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    const int16x4_t c1 = vqmovn_s32(c0); \
    const int16x4_t d1 = vqmovn_s32(d0); \
    out0 = vcombine_s16(c1, c1); \
    out1 = vcombine_s16(d1, c1); \
  }

#define btf_16_w4_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in1_l, w0_h); \
    u0 = vmlsq_n_s32(u0, in0_l, w0_l); \
    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h); \
    v0 = vmlaq_n_s32(v0, in1_l, w0_l); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    const int16x4_t c1 = vqmovn_s32(c0); \
    const int16x4_t d1 = vqmovn_s32(d0); \
    out0 = vcombine_s16(c1, c1); \
    out1 = vcombine_s16(d1, c1); \
  }

#define btf_16_w4_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l); \
    u0 = vmlaq_n_s32(u0, in1_l, w0_h); \
    int32x4_t v0 = vmulq_n_s32(in1_l, w0_l); \
    v0 = vmlsq_n_s32(v0, in0_l, w0_h); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    const int16x4_t c1 = vqmovn_s32(c0); \
    const int16x4_t d1 = vqmovn_s32(d0); \
    out0 = vcombine_s16(c1, c1); \
    out1 = vcombine_s16(d1, c1); \
  }

#define btf_16_w4_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in0_l = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in1_l = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in0_l, w0_l); \
    u0 = vmlaq_n_s32(u0, in1_l, w0_h); \
    int32x4_t v0 = vmulq_n_s32(in0_l, w0_h); \
    v0 = vmlsq_n_s32(v0, in1_l, w0_l); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    const int16x4_t c1 = vqmovn_s32(c0); \
    const int16x4_t d1 = vqmovn_s32(d0); \
    out0 = vcombine_s16(c1, c1); \
    out1 = vcombine_s16(d1, c1); \
  }

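// 8-point forward ADST on 4-wide rows. The stage numbering in the comments
// follows the scalar 1D ADST in av1/encoder/av1_fwd_txfm1d.c; sign flips
// are folded into the butterfly inputs with vqnegq_s16().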
static void fadst4x8_neon(const int16x8_t *input, int16x8_t *output,
                          int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1-2
  int16x8_t x2[8];
  btf_16_w4_neon_mode3(cospi[32], cospi[32], vqnegq_s16(input[3]), input[4],
                       x2[2], x2[3], v_cos_bit);
  btf_16_w4_neon_mode3(cospi[32], cospi[32], input[2], vqnegq_s16(input[5]),
                       x2[6], x2[7], v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  x3[0] = vqaddq_s16(input[0], x2[2]);
  x3[2] = vqsubq_s16(input[0], x2[2]);
  x3[1] = vqsubq_s16(x2[3], input[7]);
  x3[3] = vqsubq_s16(vqnegq_s16(input[7]), x2[3]);
  x3[4] = vqaddq_s16(vqnegq_s16(input[1]), x2[6]);
  x3[6] = vqsubq_s16(vqnegq_s16(input[1]), x2[6]);
  x3[5] = vqaddq_s16(input[6], x2[7]);
  x3[7] = vqsubq_s16(input[6], x2[7]);

  // stage 4
  int16x8_t x4[8];

  btf_16_w4_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x4[4], x4[5],
                       v_cos_bit);
  btf_16_w4_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x4[6], x4[7],
                       v_cos_bit);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x4[4]);
  x5[4] = vqsubq_s16(x3[0], x4[4]);
  x5[1] = vqaddq_s16(x3[1], x4[5]);
  x5[5] = vqsubq_s16(x3[1], x4[5]);
  x5[2] = vqaddq_s16(x3[2], x4[6]);
  x5[6] = vqsubq_s16(x3[2], x4[6]);
  x5[3] = vqaddq_s16(x3[3], x4[7]);
  x5[7] = vqsubq_s16(x3[3], x4[7]);

  // stage 6-7
  btf_16_w4_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
                       v_cos_bit);
  btf_16_w4_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
                       v_cos_bit);
  btf_16_w4_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
                       v_cos_bit);
  btf_16_w4_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
                       v_cos_bit);
}

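// 4-point forward ADST across full 8-lane rows: the same sinpi arithmetic
// as av1_fadst4x4_neon, applied to the low and high halves in parallel.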
static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
                          int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *sinpi = sinpi_arr(cos_bit);

  const int16x8_t in7 = vaddq_s16(input[0], input[1]);
  int32x4_t u_lo[8], u_hi[8], v_hi[8];

  int32x4_t in0_l = vmovl_s16(vget_low_s16(input[0]));
  int32x4_t in0_h = vmovl_s16(vget_high_s16(input[0]));
  int32x4_t in1_l = vmovl_s16(vget_low_s16(input[1]));
  int32x4_t in1_h = vmovl_s16(vget_high_s16(input[1]));
  int32x4_t in2_l = vmovl_s16(vget_low_s16(input[2]));
  int32x4_t in2_h = vmovl_s16(vget_high_s16(input[2]));
  int32x4_t in3_l = vmovl_s16(vget_low_s16(input[3]));
  int32x4_t in3_h = vmovl_s16(vget_high_s16(input[3]));
  int32x4_t in7_l = vmovl_s16(vget_low_s16(in7));
  int32x4_t in7_h = vmovl_s16(vget_high_s16(in7));

  u_lo[0] = vmulq_n_s32(in1_l, sinpi[2]);
  u_lo[0] = vmlaq_n_s32(u_lo[0], in0_l, sinpi[1]);

  u_hi[0] = vmulq_n_s32(in1_h, sinpi[2]);
  u_hi[0] = vmlaq_n_s32(u_hi[0], in0_h, sinpi[1]);

  u_lo[0] = vmlaq_n_s32(u_lo[0], in3_l, sinpi[4]);
  u_lo[0] = vmlaq_n_s32(u_lo[0], in2_l, sinpi[3]);

  u_hi[0] = vmlaq_n_s32(u_hi[0], in3_h, sinpi[4]);
  u_hi[0] = vmlaq_n_s32(u_hi[0], in2_h, sinpi[3]);

  u_lo[1] = vmulq_n_s32(in7_l, sinpi[3]);

  v_hi[2] = vmulq_n_s32(in7_h, sinpi[3]);
  u_lo[2] = vmulq_n_s32(in0_l, sinpi[4]);
  u_lo[2] = vmlsq_n_s32(u_lo[2], in1_l, sinpi[1]);

  u_hi[2] = vmulq_n_s32(in0_h, sinpi[4]);
  u_hi[2] = vmlsq_n_s32(u_hi[2], in1_h, sinpi[1]);

  u_lo[2] = vmlaq_n_s32(u_lo[2], in3_l, sinpi[2]);
  u_lo[2] = vmlsq_n_s32(u_lo[2], in2_l, sinpi[3]);

  u_hi[2] = vmlaq_n_s32(u_hi[2], in3_h, sinpi[2]);
  u_hi[2] = vmlsq_n_s32(u_hi[2], in2_h, sinpi[3]);

  u_lo[1] = vmlsq_n_s32(u_lo[1], in3_l, sinpi[3]);

  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  u_hi[1] = vmlsq_n_s32(v_hi[2], in3_h, sinpi[3]);

  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);

  u_lo[6] = vmlaq_n_s32(u_lo[3], in2_l, sinpi[3] * 3);
  u_hi[6] = vmlaq_n_s32(u_hi[3], in2_h, sinpi[3] * 3);

  u_lo[0] = vrshlq_s32(u_lo[0], v_cos_bit);
  u_hi[0] = vrshlq_s32(u_hi[0], v_cos_bit);
  u_lo[1] = vrshlq_s32(u_lo[1], v_cos_bit);
  u_hi[1] = vrshlq_s32(u_hi[1], v_cos_bit);
  u_lo[2] = vrshlq_s32(u_lo[2], v_cos_bit);
  u_hi[2] = vrshlq_s32(u_hi[2], v_cos_bit);
  u_lo[3] = vrshlq_s32(u_lo[6], v_cos_bit);
  u_hi[3] = vrshlq_s32(u_hi[6], v_cos_bit);

  output[0] = custom_packs_s32(u_lo[0], u_hi[0]);
  output[1] = custom_packs_s32(u_lo[1], u_hi[1]);
  output[2] = custom_packs_s32(u_lo[2], u_hi[2]);
  output[3] = custom_packs_s32(u_lo[3], u_hi[3]);
}

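// 4-point forward DCT on the low halves of input[0..3]. Before the
// rounding shift by cos_bit, the four rows are
//   row0 = cospi[32] * (in0 + in1 + in2 + in3)
//   row1 = cospi[16] * (in0 - in3) + cospi[48] * (in1 - in2)
//   row2 = cospi[32] * (in0 - in1 - in2 + in3)
//   row3 = cospi[48] * (in0 - in3) - cospi[16] * (in1 - in2)
// packed two rows per vector and half-rotated, as in av1_fadst4x4_neon.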
void av1_fdct4x4_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
                      const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int32x4_t u[4];

  int32x4_t in12a = vaddl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
  int32x4_t in12s = vsubl_s16(vget_low_s16(input[1]), vget_low_s16(input[2]));
  int32x4_t in03a = vaddl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));
  int32x4_t in03s = vsubl_s16(vget_low_s16(input[0]), vget_low_s16(input[3]));

  int32x4_t u0ad1 = vmulq_n_s32(in12a, cospi[32]);
  int32x4_t u0ad2 = vmulq_n_s32(in03a, cospi[32]);
  u[0] = vaddq_s32(u0ad1, u0ad2);
  u[1] = vsubq_s32(u0ad2, u0ad1);
  u[2] = vmulq_n_s32(in12s, cospi[48]);
  u[2] = vmlaq_n_s32(u[2], in03s, cospi[16]);

  u[3] = vmulq_n_s32(in03s, cospi[48]);
  u[3] = vmlsq_n_s32(u[3], in12s, cospi[16]);

  u[0] = vrshlq_s32(u[0], v_cos_bit);
  u[1] = vrshlq_s32(u[1], v_cos_bit);
  u[2] = vrshlq_s32(u[2], v_cos_bit);
  u[3] = vrshlq_s32(u[3], v_cos_bit);

  output[0] = custom_packs_s32(u[0], u[1]);
  output[1] = custom_packs_s32(u[2], u[3]);
  output[2] = vextq_s16(output[0], output[0], 4);
  output[3] = vextq_s16(output[1], output[1], 4);
}

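// Full-width 16-bit butterflies: widen both 8-lane inputs to 32 bits,
// rotate, round by cos_bit and saturating-pack back to 16 bits. The mode
// suffix encodes the sign pattern, e.g.
//   mode2: out0 = round_shift(w0_l * in0 + w0_h * in1)
//          out1 = round_shift(w0_l * in1 - w0_h * in0)
//   mode0: out0 = round_shift(w0_h * in1 - w0_l * in0)
//          out1 = round_shift(w0_h * in0 + w0_l * in1)
// btf_16_neon takes both weight pairs explicitly (and picks up v_cos_bit
// from the enclosing scope).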
#define btf_16_neon(w0_l, w0_h, w1_l, w1_h, in0, in1, out0, out1) \
  { \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
    u0 = vmlaq_n_s32(u0, in_low0, w0_l); \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
    u1 = vmlaq_n_s32(u1, in_high0, w0_l); \
    int32x4_t v0 = vmulq_n_s32(in_low1, w1_h); \
    v0 = vmlaq_n_s32(v0, in_low0, w1_l); \
    int32x4_t v1 = vmulq_n_s32(in_high1, w1_h); \
    v1 = vmlaq_n_s32(v1, in_high0, w1_l); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
    out0 = custom_packs_s32(c0, c1); \
    out1 = custom_packs_s32(d0, d1); \
  }

#define btf_16_neon_mode0(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
    u0 = vmlsq_n_s32(u0, in_low0, w0_l); \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
    u1 = vmlsq_n_s32(u1, in_high0, w0_l); \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
    v0 = vmlaq_n_s32(v0, in_low0, w0_h); \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
    v1 = vmlaq_n_s32(v1, in_high0, w0_h); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
    out0 = custom_packs_s32(c0, c1); \
    out1 = custom_packs_s32(d0, d1); \
  }

#define btf_16_neon_mode1(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in_low0, w0_l); \
    u0 = vmlsq_n_s32(u0, in_low1, w0_h); \
    int32x4_t u1 = vmulq_n_s32(in_high0, w0_l); \
    u1 = vmlsq_n_s32(u1, in_high1, w0_h); \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
    v0 = vmlaq_n_s32(v0, in_low0, w0_h); \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
    v1 = vmlaq_n_s32(v1, in_high0, w0_h); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
    out0 = custom_packs_s32(c0, c1); \
    out1 = custom_packs_s32(d0, d1); \
  }

#define btf_16_neon_mode02(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in_low1, -w0_h); \
    u0 = vmlsq_n_s32(u0, in_low0, w0_l); \
    int32x4_t u1 = vmulq_n_s32(in_high1, -w0_h); \
    u1 = vmlsq_n_s32(u1, in_high0, w0_l); \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
    v0 = vmlsq_n_s32(v0, in_low0, w0_h); \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
    v1 = vmlsq_n_s32(v1, in_high0, w0_h); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
    out0 = custom_packs_s32(c0, c1); \
    out1 = custom_packs_s32(d0, d1); \
  }

#define btf_16_neon_mode2(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
    u0 = vmlaq_n_s32(u0, in_low0, w0_l); \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
    u1 = vmlaq_n_s32(u1, in_high0, w0_l); \
    int32x4_t v0 = vmulq_n_s32(in_low1, w0_l); \
    v0 = vmlsq_n_s32(v0, in_low0, w0_h); \
    int32x4_t v1 = vmulq_n_s32(in_high1, w0_l); \
    v1 = vmlsq_n_s32(v1, in_high0, w0_h); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
    out0 = custom_packs_s32(c0, c1); \
    out1 = custom_packs_s32(d0, d1); \
  }

#define btf_16_neon_mode3(w0_l, w0_h, in0, in1, out0, out1, v_cos_bit) \
  { \
    int32x4_t in_low0 = vmovl_s16(vget_low_s16(in0)); \
    int32x4_t in_high0 = vmovl_s16(vget_high_s16(in0)); \
    int32x4_t in_low1 = vmovl_s16(vget_low_s16(in1)); \
    int32x4_t in_high1 = vmovl_s16(vget_high_s16(in1)); \
    int32x4_t u0 = vmulq_n_s32(in_low1, w0_h); \
    u0 = vmlaq_n_s32(u0, in_low0, w0_l); \
    int32x4_t u1 = vmulq_n_s32(in_high1, w0_h); \
    u1 = vmlaq_n_s32(u1, in_high0, w0_l); \
    int32x4_t v0 = vmulq_n_s32(in_low0, w0_h); \
    v0 = vmlsq_n_s32(v0, in_low1, w0_l); \
    int32x4_t v1 = vmulq_n_s32(in_high0, w0_h); \
    v1 = vmlsq_n_s32(v1, in_high1, w0_l); \
    int32x4_t c0 = vrshlq_s32(u0, v_cos_bit); \
    int32x4_t c1 = vrshlq_s32(u1, v_cos_bit); \
    int32x4_t d0 = vrshlq_s32(v0, v_cos_bit); \
    int32x4_t d1 = vrshlq_s32(v1, v_cos_bit); \
    out0 = custom_packs_s32(c0, c1); \
    out1 = custom_packs_s32(d0, d1); \
  }

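// 4- and 8-point forward DCTs at the two widths: fdct8x4_neon works on full
// 8-lane rows, fdct4x8_neon uses the w4 butterflies on 4-wide rows, and
// fdct8x8_neon is the full-width 8-point transform. Stage numbering matches
// the scalar 1D DCTs in av1/encoder/av1_fwd_txfm1d.c, and final butterflies
// write their rows straight to the natural positions in output[].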
static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
                         int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[4];
  x1[0] = vqaddq_s16(input[0], input[3]);
  x1[3] = vqsubq_s16(input[0], input[3]);
  x1[1] = vqaddq_s16(input[1], input[2]);
  x1[2] = vqsubq_s16(input[1], input[2]);

  // stage 2
  int16x8_t x2[4];
  btf_16_neon_mode3(cospi[32], cospi[32], x1[0], x1[1], x2[0], x2[1],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x1[2], x1[3], x2[2], x2[3],
                    v_cos_bit);

  // stage 3
  output[0] = x2[0];
  output[1] = x2[2];
  output[2] = x2[1];
  output[3] = x2[3];
}

static void fdct4x8_neon(const int16x8_t *input, int16x8_t *output,
                         int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[8];
  x1[0] = vqaddq_s16(input[0], input[7]);
  x1[7] = vqsubq_s16(input[0], input[7]);
  x1[1] = vqaddq_s16(input[1], input[6]);
  x1[6] = vqsubq_s16(input[1], input[6]);
  x1[2] = vqaddq_s16(input[2], input[5]);
  x1[5] = vqsubq_s16(input[2], input[5]);
  x1[3] = vqaddq_s16(input[3], input[4]);
  x1[4] = vqsubq_s16(input[3], input[4]);

  // stage 2
  int16x8_t x2[8];
  x2[0] = vqaddq_s16(x1[0], x1[3]);
  x2[3] = vqsubq_s16(x1[0], x1[3]);
  x2[1] = vqaddq_s16(x1[1], x1[2]);
  x2[2] = vqsubq_s16(x1[1], x1[2]);

  btf_16_w4_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
                       v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  btf_16_w4_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
                       v_cos_bit);

  btf_16_w4_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
                       v_cos_bit);
  x3[4] = vqaddq_s16(x1[4], x2[5]);
  x3[5] = vqsubq_s16(x1[4], x2[5]);
  x3[6] = vqsubq_s16(x1[7], x2[6]);
  x3[7] = vqaddq_s16(x1[7], x2[6]);

  // stage 4-5
  btf_16_w4_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
                       v_cos_bit);
  btf_16_w4_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
                       v_cos_bit);
}

void fdct8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
                  const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[8];
  x1[0] = vqaddq_s16(input[0], input[7]);
  x1[7] = vqsubq_s16(input[0], input[7]);
  x1[1] = vqaddq_s16(input[1], input[6]);
  x1[6] = vqsubq_s16(input[1], input[6]);
  x1[2] = vqaddq_s16(input[2], input[5]);
  x1[5] = vqsubq_s16(input[2], input[5]);
  x1[3] = vqaddq_s16(input[3], input[4]);
  x1[4] = vqsubq_s16(input[3], input[4]);

  // stage 2
  int16x8_t x2[8];
  x2[0] = vqaddq_s16(x1[0], x1[3]);
  x2[3] = vqsubq_s16(x1[0], x1[3]);
  x2[1] = vqaddq_s16(x1[1], x1[2]);
  x2[2] = vqsubq_s16(x1[1], x1[2]);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[5], x1[6], x2[5], x2[6],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  btf_16_neon_mode3(cospi[32], cospi[32], x2[0], x2[1], output[0], output[4],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x2[2], x2[3], output[2], output[6],
                    v_cos_bit);
  x3[4] = vqaddq_s16(x1[4], x2[5]);
  x3[5] = vqsubq_s16(x1[4], x2[5]);
  x3[6] = vqsubq_s16(x1[7], x2[6]);
  x3[7] = vqaddq_s16(x1[7], x2[6]);

  // stage 4-5
  btf_16_neon_mode2(cospi[56], cospi[8], x3[4], x3[7], output[1], output[7],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x3[5], x3[6], output[5], output[3],
                    v_cos_bit);
}

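// 16-point forward DCT on 8-wide rows; each xN array holds the stage-N
// intermediate values.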
static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
                          int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[16];
  x1[0] = vqaddq_s16(input[0], input[15]);
  x1[15] = vqsubq_s16(input[0], input[15]);
  x1[1] = vqaddq_s16(input[1], input[14]);
  x1[14] = vqsubq_s16(input[1], input[14]);
  x1[2] = vqaddq_s16(input[2], input[13]);
  x1[13] = vqsubq_s16(input[2], input[13]);
  x1[3] = vqaddq_s16(input[3], input[12]);
  x1[12] = vqsubq_s16(input[3], input[12]);
  x1[4] = vqaddq_s16(input[4], input[11]);
  x1[11] = vqsubq_s16(input[4], input[11]);
  x1[5] = vqaddq_s16(input[5], input[10]);
  x1[10] = vqsubq_s16(input[5], input[10]);
  x1[6] = vqaddq_s16(input[6], input[9]);
  x1[9] = vqsubq_s16(input[6], input[9]);
  x1[7] = vqaddq_s16(input[7], input[8]);
  x1[8] = vqsubq_s16(input[7], input[8]);

  // stage 2
  int16x8_t x2[16];
  x2[0] = vqaddq_s16(x1[0], x1[7]);
  x2[7] = vqsubq_s16(x1[0], x1[7]);
  x2[1] = vqaddq_s16(x1[1], x1[6]);
  x2[6] = vqsubq_s16(x1[1], x1[6]);
  x2[2] = vqaddq_s16(x1[2], x1[5]);
  x2[5] = vqsubq_s16(x1[2], x1[5]);
  x2[3] = vqaddq_s16(x1[3], x1[4]);
  x2[4] = vqsubq_s16(x1[3], x1[4]);

  btf_16_neon_mode0(cospi[32], cospi[32], x1[10], x1[13], x2[10], x2[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[11], x1[12], x2[11], x2[12],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(x2[0], x2[3]);
  x3[3] = vqsubq_s16(x2[0], x2[3]);
  x3[1] = vqaddq_s16(x2[1], x2[2]);
  x3[2] = vqsubq_s16(x2[1], x2[2]);

  btf_16_neon_mode0(cospi[32], cospi[32], x2[5], x2[6], x3[5], x3[6],
                    v_cos_bit);

  x3[8] = vqaddq_s16(x1[8], x2[11]);
  x3[11] = vqsubq_s16(x1[8], x2[11]);
  x3[9] = vqaddq_s16(x1[9], x2[10]);
  x3[10] = vqsubq_s16(x1[9], x2[10]);
  x3[12] = vqsubq_s16(x1[15], x2[12]);
  x3[15] = vqaddq_s16(x1[15], x2[12]);
  x3[13] = vqsubq_s16(x1[14], x2[13]);
  x3[14] = vqaddq_s16(x1[14], x2[13]);

  // stage 4
  int16x8_t x4[16];
  btf_16_neon(cospi[32], cospi[32], cospi[32], -cospi[32], x3[0], x3[1],
              output[0], output[8]);
  btf_16_neon(cospi[48], cospi[16], -cospi[16], cospi[48], x3[2], x3[3],
              output[4], output[12]);
  x4[4] = vqaddq_s16(x2[4], x3[5]);
  x4[5] = vqsubq_s16(x2[4], x3[5]);
  x4[6] = vqsubq_s16(x2[7], x3[6]);
  x4[7] = vqaddq_s16(x2[7], x3[6]);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[9], x3[14], x4[9], x4[14],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[10], x3[13], x4[10], x4[13],
                     v_cos_bit);

  // stage 5
  int16x8_t x5[16];

  btf_16_neon_mode2(cospi[56], cospi[8], x4[4], x4[7], output[2], output[14],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x4[5], x4[6], output[10], output[6],
                    v_cos_bit);
  x5[8] = vqaddq_s16(x3[8], x4[9]);
  x5[9] = vqsubq_s16(x3[8], x4[9]);
  x5[10] = vqsubq_s16(x3[11], x4[10]);
  x5[11] = vqaddq_s16(x3[11], x4[10]);
  x5[12] = vqaddq_s16(x3[12], x4[13]);
  x5[13] = vqsubq_s16(x3[12], x4[13]);
  x5[14] = vqsubq_s16(x3[15], x4[14]);
  x5[15] = vqaddq_s16(x3[15], x4[14]);

  // stage 6-7
  btf_16_neon_mode2(cospi[60], cospi[4], x5[8], x5[15], output[1], output[15],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[28], cospi[36], x5[9], x5[14], output[9], output[7],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[44], cospi[20], x5[10], x5[13], output[5], output[11],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[12], cospi[52], x5[11], x5[12], output[13], output[3],
                    v_cos_bit);
}

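// 32-point forward DCT on 8-wide rows, following the same stage structure
// as the scalar 1D 32-point DCT.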
void av1_fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[32];
  x1[0] = vqaddq_s16(input[0], input[31]);
  x1[31] = vqsubq_s16(input[0], input[31]);
  x1[1] = vqaddq_s16(input[1], input[30]);
  x1[30] = vqsubq_s16(input[1], input[30]);
  x1[2] = vqaddq_s16(input[2], input[29]);
  x1[29] = vqsubq_s16(input[2], input[29]);
  x1[3] = vqaddq_s16(input[3], input[28]);
  x1[28] = vqsubq_s16(input[3], input[28]);
  x1[4] = vqaddq_s16(input[4], input[27]);
  x1[27] = vqsubq_s16(input[4], input[27]);
  x1[5] = vqaddq_s16(input[5], input[26]);
  x1[26] = vqsubq_s16(input[5], input[26]);
  x1[6] = vqaddq_s16(input[6], input[25]);
  x1[25] = vqsubq_s16(input[6], input[25]);
  x1[7] = vqaddq_s16(input[7], input[24]);
  x1[24] = vqsubq_s16(input[7], input[24]);
  x1[8] = vqaddq_s16(input[8], input[23]);
  x1[23] = vqsubq_s16(input[8], input[23]);
  x1[9] = vqaddq_s16(input[9], input[22]);
  x1[22] = vqsubq_s16(input[9], input[22]);
  x1[10] = vqaddq_s16(input[10], input[21]);
  x1[21] = vqsubq_s16(input[10], input[21]);
  x1[11] = vqaddq_s16(input[11], input[20]);
  x1[20] = vqsubq_s16(input[11], input[20]);
  x1[12] = vqaddq_s16(input[12], input[19]);
  x1[19] = vqsubq_s16(input[12], input[19]);
  x1[13] = vqaddq_s16(input[13], input[18]);
  x1[18] = vqsubq_s16(input[13], input[18]);
  x1[14] = vqaddq_s16(input[14], input[17]);
  x1[17] = vqsubq_s16(input[14], input[17]);
  x1[15] = vqaddq_s16(input[15], input[16]);
  x1[16] = vqsubq_s16(input[15], input[16]);

  // stage 2
  int16x8_t x2[32];
  x2[0] = vqaddq_s16(x1[0], x1[15]);
  x2[15] = vqsubq_s16(x1[0], x1[15]);
  x2[1] = vqaddq_s16(x1[1], x1[14]);
  x2[14] = vqsubq_s16(x1[1], x1[14]);
  x2[2] = vqaddq_s16(x1[2], x1[13]);
  x2[13] = vqsubq_s16(x1[2], x1[13]);
  x2[3] = vqaddq_s16(x1[3], x1[12]);
  x2[12] = vqsubq_s16(x1[3], x1[12]);
  x2[4] = vqaddq_s16(x1[4], x1[11]);
  x2[11] = vqsubq_s16(x1[4], x1[11]);
  x2[5] = vqaddq_s16(x1[5], x1[10]);
  x2[10] = vqsubq_s16(x1[5], x1[10]);
  x2[6] = vqaddq_s16(x1[6], x1[9]);
  x2[9] = vqsubq_s16(x1[6], x1[9]);
  x2[7] = vqaddq_s16(x1[7], x1[8]);
  x2[8] = vqsubq_s16(x1[7], x1[8]);

  btf_16_neon_mode0(cospi[32], cospi[32], x1[20], x1[27], x2[20], x2[27],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[21], x1[26], x2[21], x2[26],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[22], x1[25], x2[22], x2[25],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x1[23], x1[24], x2[23], x2[24],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[32];
  x3[0] = vqaddq_s16(x2[0], x2[7]);
  x3[7] = vqsubq_s16(x2[0], x2[7]);
  x3[1] = vqaddq_s16(x2[1], x2[6]);
  x3[6] = vqsubq_s16(x2[1], x2[6]);
  x3[2] = vqaddq_s16(x2[2], x2[5]);
  x3[5] = vqsubq_s16(x2[2], x2[5]);
  x3[3] = vqaddq_s16(x2[3], x2[4]);
  x3[4] = vqsubq_s16(x2[3], x2[4]);

  btf_16_neon_mode0(cospi[32], cospi[32], x2[10], x2[13], x3[10], x3[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[32], cospi[32], x2[11], x2[12], x3[11], x3[12],
                    v_cos_bit);

  x3[16] = vqaddq_s16(x1[16], x2[23]);
  x3[23] = vqsubq_s16(x1[16], x2[23]);
  x3[17] = vqaddq_s16(x1[17], x2[22]);
  x3[22] = vqsubq_s16(x1[17], x2[22]);
  x3[18] = vqaddq_s16(x1[18], x2[21]);
  x3[21] = vqsubq_s16(x1[18], x2[21]);
  x3[19] = vqaddq_s16(x1[19], x2[20]);
  x3[20] = vqsubq_s16(x1[19], x2[20]);
  x3[24] = vqsubq_s16(x1[31], x2[24]);
  x3[31] = vqaddq_s16(x1[31], x2[24]);
  x3[25] = vqsubq_s16(x1[30], x2[25]);
  x3[30] = vqaddq_s16(x1[30], x2[25]);
  x3[26] = vqsubq_s16(x1[29], x2[26]);
  x3[29] = vqaddq_s16(x1[29], x2[26]);
  x3[27] = vqsubq_s16(x1[28], x2[27]);
  x3[28] = vqaddq_s16(x1[28], x2[27]);

  // stage 4
  int16x8_t x4[32];
  x4[0] = vqaddq_s16(x3[0], x3[3]);
  x4[3] = vqsubq_s16(x3[0], x3[3]);
  x4[1] = vqaddq_s16(x3[1], x3[2]);
  x4[2] = vqsubq_s16(x3[1], x3[2]);
  btf_16_neon_mode0(cospi[32], cospi[32], x3[5], x3[6], x4[5], x4[6],
                    v_cos_bit);
  x4[8] = vqaddq_s16(x2[8], x3[11]);
  x4[11] = vqsubq_s16(x2[8], x3[11]);
  x4[9] = vqaddq_s16(x2[9], x3[10]);
  x4[10] = vqsubq_s16(x2[9], x3[10]);
  x4[12] = vqsubq_s16(x2[15], x3[12]);
  x4[15] = vqaddq_s16(x2[15], x3[12]);
  x4[13] = vqsubq_s16(x2[14], x3[13]);
  x4[14] = vqaddq_s16(x2[14], x3[13]);

  btf_16_neon_mode0(cospi[16], cospi[48], x3[18], x3[29], x4[18], x4[29],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[19], x3[28], x4[19], x4[28],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[20], x3[27], x4[20], x4[27],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[21], x3[26], x4[21], x4[26],
                     v_cos_bit);

  // stage 5
  int16x8_t x5[32];
  btf_16_neon_mode3(cospi[32], cospi[32], x4[0], x4[1], output[0], output[16],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x4[2], x4[3], output[8], output[24],
                    v_cos_bit);
  x5[4] = vqaddq_s16(x3[4], x4[5]);
  x5[5] = vqsubq_s16(x3[4], x4[5]);
  x5[6] = vqsubq_s16(x3[7], x4[6]);
  x5[7] = vqaddq_s16(x3[7], x4[6]);

  btf_16_neon_mode0(cospi[16], cospi[48], x4[9], x4[14], x5[9], x5[14],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x4[10], x4[13], x5[10], x5[13],
                     v_cos_bit);

  x5[16] = vqaddq_s16(x3[16], x4[19]);
  x5[19] = vqsubq_s16(x3[16], x4[19]);
  x5[17] = vqaddq_s16(x3[17], x4[18]);
  x5[18] = vqsubq_s16(x3[17], x4[18]);
  x5[20] = vqsubq_s16(x3[23], x4[20]);
  x5[23] = vqaddq_s16(x3[23], x4[20]);
  x5[21] = vqsubq_s16(x3[22], x4[21]);
  x5[22] = vqaddq_s16(x3[22], x4[21]);
  x5[24] = vqaddq_s16(x3[24], x4[27]);
  x5[27] = vqsubq_s16(x3[24], x4[27]);
  x5[25] = vqaddq_s16(x3[25], x4[26]);
  x5[26] = vqsubq_s16(x3[25], x4[26]);
  x5[28] = vqsubq_s16(x3[31], x4[28]);
  x5[31] = vqaddq_s16(x3[31], x4[28]);
  x5[29] = vqsubq_s16(x3[30], x4[29]);
  x5[30] = vqaddq_s16(x3[30], x4[29]);

  // stage 6
  int16x8_t x6[32];
  btf_16_neon_mode2(cospi[56], cospi[8], x5[4], x5[7], output[4], output[28],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x5[5], x5[6], output[20], output[12],
                    v_cos_bit);
  x6[8] = vqaddq_s16(x4[8], x5[9]);
  x6[9] = vqsubq_s16(x4[8], x5[9]);
  x6[10] = vqsubq_s16(x4[11], x5[10]);
  x6[11] = vqaddq_s16(x4[11], x5[10]);
  x6[12] = vqaddq_s16(x4[12], x5[13]);
  x6[13] = vqsubq_s16(x4[12], x5[13]);
  x6[14] = vqsubq_s16(x4[15], x5[14]);
  x6[15] = vqaddq_s16(x4[15], x5[14]);
  btf_16_neon_mode0(cospi[8], cospi[56], x5[17], x5[30], x6[17], x6[30],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x5[18], x5[29], x6[18], x6[29],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[40], cospi[24], x5[21], x5[26], x6[21], x6[26],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x5[22], x5[25], x6[22], x6[25],
                     v_cos_bit);

  // stage 7
  int16x8_t x7[32];
  btf_16_neon_mode2(cospi[60], cospi[4], x6[8], x6[15], output[2], output[30],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[28], cospi[36], x6[9], x6[14], output[18], output[14],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[44], cospi[20], x6[10], x6[13], output[10],
                    output[22], v_cos_bit);
  btf_16_neon_mode2(cospi[12], cospi[52], x6[11], x6[12], output[26], output[6],
                    v_cos_bit);
  x7[16] = vqaddq_s16(x5[16], x6[17]);
  x7[17] = vqsubq_s16(x5[16], x6[17]);
  x7[18] = vqsubq_s16(x5[19], x6[18]);
  x7[19] = vqaddq_s16(x5[19], x6[18]);
  x7[20] = vqaddq_s16(x5[20], x6[21]);
  x7[21] = vqsubq_s16(x5[20], x6[21]);
  x7[22] = vqsubq_s16(x5[23], x6[22]);
  x7[23] = vqaddq_s16(x5[23], x6[22]);
  x7[24] = vqaddq_s16(x5[24], x6[25]);
  x7[25] = vqsubq_s16(x5[24], x6[25]);
  x7[26] = vqsubq_s16(x5[27], x6[26]);
  x7[27] = vqaddq_s16(x5[27], x6[26]);
  x7[28] = vqaddq_s16(x5[28], x6[29]);
  x7[29] = vqsubq_s16(x5[28], x6[29]);
  x7[30] = vqsubq_s16(x5[31], x6[30]);
  x7[31] = vqaddq_s16(x5[31], x6[30]);

  btf_16_neon_mode2(cospi[62], cospi[2], x7[16], x7[31], output[1], output[31],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[30], cospi[34], x7[17], x7[30], output[17],
                    output[15], v_cos_bit);
  btf_16_neon_mode2(cospi[46], cospi[18], x7[18], x7[29], output[9], output[23],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[14], cospi[50], x7[19], x7[28], output[25], output[7],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[54], cospi[10], x7[20], x7[27], output[5], output[27],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[22], cospi[42], x7[21], x7[26], output[21],
                    output[11], v_cos_bit);
  btf_16_neon_mode2(cospi[38], cospi[26], x7[22], x7[25], output[13],
                    output[19], v_cos_bit);
  btf_16_neon_mode2(cospi[6], cospi[58], x7[23], x7[24], output[29], output[3],
                    v_cos_bit);
}

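// Stages 1-4 of the 64-point forward DCT, split into a helper so the body
// of av1_fdct8x64_neon below stays manageable. On return, x3 holds the
// stage-3 values and x4 most of the stage-4 values; the caller finishes the
// remaining stage-4 butterflies. cospi32 points at cospi[32] and v_cos_bit
// holds -cos_bit.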
av1_fdct8x64_stage_1234_neon(const int16x8_t * input,int16x8_t * x3,int16x8_t * x4,const int32_t * cospi32,const int32x4_t * v_cos_bit)1211 void av1_fdct8x64_stage_1234_neon(const int16x8_t *input, int16x8_t *x3,
1212 int16x8_t *x4, const int32_t *cospi32,
1213 const int32x4_t *v_cos_bit) {
1214 int16x8_t x1[64];
1215 int16x8_t x2[64];
1216 x1[0] = vqaddq_s16(input[0], input[63]);
1217 x1[63] = vqsubq_s16(input[0], input[63]);
1218 x1[1] = vqaddq_s16(input[1], input[62]);
1219 x1[62] = vqsubq_s16(input[1], input[62]);
1220 x1[2] = vqaddq_s16(input[2], input[61]);
1221 x1[61] = vqsubq_s16(input[2], input[61]);
1222 x1[3] = vqaddq_s16(input[3], input[60]);
1223 x1[60] = vqsubq_s16(input[3], input[60]);
1224 x1[4] = vqaddq_s16(input[4], input[59]);
1225 x1[59] = vqsubq_s16(input[4], input[59]);
1226 x1[5] = vqaddq_s16(input[5], input[58]);
1227 x1[58] = vqsubq_s16(input[5], input[58]);
1228 x1[6] = vqaddq_s16(input[6], input[57]);
1229 x1[57] = vqsubq_s16(input[6], input[57]);
1230 x1[7] = vqaddq_s16(input[7], input[56]);
1231 x1[56] = vqsubq_s16(input[7], input[56]);
1232 x1[8] = vqaddq_s16(input[8], input[55]);
1233 x1[55] = vqsubq_s16(input[8], input[55]);
1234 x1[9] = vqaddq_s16(input[9], input[54]);
1235 x1[54] = vqsubq_s16(input[9], input[54]);
1236 x1[10] = vqaddq_s16(input[10], input[53]);
1237 x1[53] = vqsubq_s16(input[10], input[53]);
1238 x1[11] = vqaddq_s16(input[11], input[52]);
1239 x1[52] = vqsubq_s16(input[11], input[52]);
1240 x1[12] = vqaddq_s16(input[12], input[51]);
1241 x1[51] = vqsubq_s16(input[12], input[51]);
1242 x1[13] = vqaddq_s16(input[13], input[50]);
1243 x1[50] = vqsubq_s16(input[13], input[50]);
1244 x1[14] = vqaddq_s16(input[14], input[49]);
1245 x1[49] = vqsubq_s16(input[14], input[49]);
1246 x1[15] = vqaddq_s16(input[15], input[48]);
1247 x1[48] = vqsubq_s16(input[15], input[48]);
1248 x1[16] = vqaddq_s16(input[16], input[47]);
1249 x1[47] = vqsubq_s16(input[16], input[47]);
1250 x1[17] = vqaddq_s16(input[17], input[46]);
1251 x1[46] = vqsubq_s16(input[17], input[46]);
1252 x1[18] = vqaddq_s16(input[18], input[45]);
1253 x1[45] = vqsubq_s16(input[18], input[45]);
1254 x1[19] = vqaddq_s16(input[19], input[44]);
1255 x1[44] = vqsubq_s16(input[19], input[44]);
1256 x1[20] = vqaddq_s16(input[20], input[43]);
1257 x1[43] = vqsubq_s16(input[20], input[43]);
1258 x1[21] = vqaddq_s16(input[21], input[42]);
1259 x1[42] = vqsubq_s16(input[21], input[42]);
1260 x1[22] = vqaddq_s16(input[22], input[41]);
1261 x1[41] = vqsubq_s16(input[22], input[41]);
1262 x1[23] = vqaddq_s16(input[23], input[40]);
1263 x1[40] = vqsubq_s16(input[23], input[40]);
1264 x1[24] = vqaddq_s16(input[24], input[39]);
1265 x1[39] = vqsubq_s16(input[24], input[39]);
1266 x1[25] = vqaddq_s16(input[25], input[38]);
1267 x1[38] = vqsubq_s16(input[25], input[38]);
1268 x1[26] = vqaddq_s16(input[26], input[37]);
1269 x1[37] = vqsubq_s16(input[26], input[37]);
1270 x1[27] = vqaddq_s16(input[27], input[36]);
1271 x1[36] = vqsubq_s16(input[27], input[36]);
1272 x1[28] = vqaddq_s16(input[28], input[35]);
1273 x1[35] = vqsubq_s16(input[28], input[35]);
1274 x1[29] = vqaddq_s16(input[29], input[34]);
1275 x1[34] = vqsubq_s16(input[29], input[34]);
1276 x1[30] = vqaddq_s16(input[30], input[33]);
1277 x1[33] = vqsubq_s16(input[30], input[33]);
1278 x1[31] = vqaddq_s16(input[31], input[32]);
1279 x1[32] = vqsubq_s16(input[31], input[32]);
1280
1281 x2[0] = vqaddq_s16(x1[0], x1[31]);
1282 x2[31] = vqsubq_s16(x1[0], x1[31]);
1283 x2[1] = vqaddq_s16(x1[1], x1[30]);
1284 x2[30] = vqsubq_s16(x1[1], x1[30]);
1285 x2[2] = vqaddq_s16(x1[2], x1[29]);
1286 x2[29] = vqsubq_s16(x1[2], x1[29]);
1287 x2[3] = vqaddq_s16(x1[3], x1[28]);
1288 x2[28] = vqsubq_s16(x1[3], x1[28]);
1289 x2[4] = vqaddq_s16(x1[4], x1[27]);
1290 x2[27] = vqsubq_s16(x1[4], x1[27]);
1291 x2[5] = vqaddq_s16(x1[5], x1[26]);
1292 x2[26] = vqsubq_s16(x1[5], x1[26]);
1293 x2[6] = vqaddq_s16(x1[6], x1[25]);
1294 x2[25] = vqsubq_s16(x1[6], x1[25]);
1295 x2[7] = vqaddq_s16(x1[7], x1[24]);
1296 x2[24] = vqsubq_s16(x1[7], x1[24]);
1297 x2[8] = vqaddq_s16(x1[8], x1[23]);
1298 x2[23] = vqsubq_s16(x1[8], x1[23]);
1299 x2[9] = vqaddq_s16(x1[9], x1[22]);
1300 x2[22] = vqsubq_s16(x1[9], x1[22]);
1301 x2[10] = vqaddq_s16(x1[10], x1[21]);
1302 x2[21] = vqsubq_s16(x1[10], x1[21]);
1303 x2[11] = vqaddq_s16(x1[11], x1[20]);
1304 x2[20] = vqsubq_s16(x1[11], x1[20]);
1305 x2[12] = vqaddq_s16(x1[12], x1[19]);
1306 x2[19] = vqsubq_s16(x1[12], x1[19]);
1307 x2[13] = vqaddq_s16(x1[13], x1[18]);
1308 x2[18] = vqsubq_s16(x1[13], x1[18]);
1309 x2[14] = vqaddq_s16(x1[14], x1[17]);
1310 x2[17] = vqsubq_s16(x1[14], x1[17]);
1311 x2[15] = vqaddq_s16(x1[15], x1[16]);
1312 x2[16] = vqsubq_s16(x1[15], x1[16]);
1313
  btf_16_neon_mode0(*cospi32, *cospi32, x1[40], x1[55], x2[40], x2[55],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[41], x1[54], x2[41], x2[54],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[42], x1[53], x2[42], x2[53],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[43], x1[52], x2[43], x2[52],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[44], x1[51], x2[44], x2[51],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[45], x1[50], x2[45], x2[50],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[46], x1[49], x2[46], x2[49],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x1[47], x1[48], x2[47], x2[48],
                    *v_cos_bit);

  // stage 3
  x3[0] = vqaddq_s16(x2[0], x2[15]);
  x3[15] = vqsubq_s16(x2[0], x2[15]);
  x3[1] = vqaddq_s16(x2[1], x2[14]);
  x3[14] = vqsubq_s16(x2[1], x2[14]);
  x3[2] = vqaddq_s16(x2[2], x2[13]);
  x3[13] = vqsubq_s16(x2[2], x2[13]);
  x3[3] = vqaddq_s16(x2[3], x2[12]);
  x3[12] = vqsubq_s16(x2[3], x2[12]);
  x3[4] = vqaddq_s16(x2[4], x2[11]);
  x3[11] = vqsubq_s16(x2[4], x2[11]);
  x3[5] = vqaddq_s16(x2[5], x2[10]);
  x3[10] = vqsubq_s16(x2[5], x2[10]);
  x3[6] = vqaddq_s16(x2[6], x2[9]);
  x3[9] = vqsubq_s16(x2[6], x2[9]);
  x3[7] = vqaddq_s16(x2[7], x2[8]);
  x3[8] = vqsubq_s16(x2[7], x2[8]);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  btf_16_neon_mode0(*cospi32, *cospi32, x2[20], x2[27], x3[20], x3[27],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x2[21], x2[26], x3[21], x3[26],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x2[22], x2[25], x3[22], x3[25],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x2[23], x2[24], x3[23], x3[24],
                    *v_cos_bit);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  x3[32] = vqaddq_s16(x1[32], x2[47]);
  x3[47] = vqsubq_s16(x1[32], x2[47]);
  x3[33] = vqaddq_s16(x1[33], x2[46]);
  x3[46] = vqsubq_s16(x1[33], x2[46]);
  x3[34] = vqaddq_s16(x1[34], x2[45]);
  x3[45] = vqsubq_s16(x1[34], x2[45]);
  x3[35] = vqaddq_s16(x1[35], x2[44]);
  x3[44] = vqsubq_s16(x1[35], x2[44]);
  x3[36] = vqaddq_s16(x1[36], x2[43]);
  x3[43] = vqsubq_s16(x1[36], x2[43]);
  x3[37] = vqaddq_s16(x1[37], x2[42]);
  x3[42] = vqsubq_s16(x1[37], x2[42]);
  x3[38] = vqaddq_s16(x1[38], x2[41]);
  x3[41] = vqsubq_s16(x1[38], x2[41]);
  x3[39] = vqaddq_s16(x1[39], x2[40]);
  x3[40] = vqsubq_s16(x1[39], x2[40]);
  x3[48] = vqsubq_s16(x1[63], x2[48]);
  x3[63] = vqaddq_s16(x1[63], x2[48]);
  x3[49] = vqsubq_s16(x1[62], x2[49]);
  x3[62] = vqaddq_s16(x1[62], x2[49]);
  x3[50] = vqsubq_s16(x1[61], x2[50]);
  x3[61] = vqaddq_s16(x1[61], x2[50]);
  x3[51] = vqsubq_s16(x1[60], x2[51]);
  x3[60] = vqaddq_s16(x1[60], x2[51]);
  x3[52] = vqsubq_s16(x1[59], x2[52]);
  x3[59] = vqaddq_s16(x1[59], x2[52]);
  x3[53] = vqsubq_s16(x1[58], x2[53]);
  x3[58] = vqaddq_s16(x1[58], x2[53]);
  x3[54] = vqsubq_s16(x1[57], x2[54]);
  x3[57] = vqaddq_s16(x1[57], x2[54]);
  x3[55] = vqsubq_s16(x1[56], x2[55]);
  x3[56] = vqaddq_s16(x1[56], x2[55]);

  // stage 4
  x4[0] = vqaddq_s16(x3[0], x3[7]);
  x4[7] = vqsubq_s16(x3[0], x3[7]);
  x4[1] = vqaddq_s16(x3[1], x3[6]);
  x4[6] = vqsubq_s16(x3[1], x3[6]);
  x4[2] = vqaddq_s16(x3[2], x3[5]);
  x4[5] = vqsubq_s16(x3[2], x3[5]);
  x4[3] = vqaddq_s16(x3[3], x3[4]);
  x4[4] = vqsubq_s16(x3[3], x3[4]);

  btf_16_neon_mode0(*cospi32, *cospi32, x3[10], x3[13], x4[10], x4[13],
                    *v_cos_bit);
  btf_16_neon_mode0(*cospi32, *cospi32, x3[11], x3[12], x4[11], x4[12],
                    *v_cos_bit);

  x4[16] = vqaddq_s16(x3[16], x3[23]);
  x4[23] = vqsubq_s16(x3[16], x3[23]);
  x4[17] = vqaddq_s16(x3[17], x3[22]);
  x4[22] = vqsubq_s16(x3[17], x3[22]);
  x4[18] = vqaddq_s16(x3[18], x3[21]);
  x4[21] = vqsubq_s16(x3[18], x3[21]);
  x4[19] = vqaddq_s16(x3[19], x3[20]);
  x4[20] = vqsubq_s16(x3[19], x3[20]);
  x4[24] = vqsubq_s16(x3[31], x3[24]);
  x4[31] = vqaddq_s16(x3[31], x3[24]);
  x4[25] = vqsubq_s16(x3[30], x3[25]);
  x4[30] = vqaddq_s16(x3[30], x3[25]);
  x4[26] = vqsubq_s16(x3[29], x3[26]);
  x4[29] = vqaddq_s16(x3[29], x3[26]);
  x4[27] = vqsubq_s16(x3[28], x3[27]);
  x4[28] = vqaddq_s16(x3[28], x3[27]);
}

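// Forward 64-point DCT over eight columns of 16-bit coefficients (one
// int16x8_t per row). Stages 1-4 are split out into
// av1_fdct8x64_stage_1234_neon() above; stages 5-11 follow here. The
// saturating vqaddq_s16/vqsubq_s16 butterflies keep every intermediate
// within int16 range, which is the working assumption of the lowbd path.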
void av1_fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int16x8_t x3[64];
  int16x8_t x4[64];

  av1_fdct8x64_stage_1234_neon(input, x3, x4, &cospi[32], &v_cos_bit);

  btf_16_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
                     v_cos_bit);

  // stage 5
  int16x8_t x5[64];
  x5[0] = vqaddq_s16(x4[0], x4[3]);
  x5[3] = vqsubq_s16(x4[0], x4[3]);
  x5[1] = vqaddq_s16(x4[1], x4[2]);
  x5[2] = vqsubq_s16(x4[1], x4[2]);

  btf_16_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
                    v_cos_bit);

  x5[8] = vqaddq_s16(x3[8], x4[11]);
  x5[11] = vqsubq_s16(x3[8], x4[11]);
  x5[9] = vqaddq_s16(x3[9], x4[10]);
  x5[10] = vqsubq_s16(x3[9], x4[10]);
  x5[12] = vqsubq_s16(x3[15], x4[12]);
  x5[15] = vqaddq_s16(x3[15], x4[12]);
  x5[13] = vqsubq_s16(x3[14], x4[13]);
  x5[14] = vqaddq_s16(x3[14], x4[13]);

  btf_16_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
                     v_cos_bit);

  x5[32] = vqaddq_s16(x3[32], x4[39]);
  x5[39] = vqsubq_s16(x3[32], x4[39]);
  x5[33] = vqaddq_s16(x3[33], x4[38]);
  x5[38] = vqsubq_s16(x3[33], x4[38]);
  x5[34] = vqaddq_s16(x3[34], x4[37]);
  x5[37] = vqsubq_s16(x3[34], x4[37]);
  x5[35] = vqaddq_s16(x3[35], x4[36]);
  x5[36] = vqsubq_s16(x3[35], x4[36]);
  x5[40] = vqsubq_s16(x3[47], x4[40]);
  x5[47] = vqaddq_s16(x3[47], x4[40]);
  x5[41] = vqsubq_s16(x3[46], x4[41]);
  x5[46] = vqaddq_s16(x3[46], x4[41]);
  x5[42] = vqsubq_s16(x3[45], x4[42]);
  x5[45] = vqaddq_s16(x3[45], x4[42]);
  x5[43] = vqsubq_s16(x3[44], x4[43]);
  x5[44] = vqaddq_s16(x3[44], x4[43]);
  x5[48] = vqaddq_s16(x3[48], x4[55]);
  x5[55] = vqsubq_s16(x3[48], x4[55]);
  x5[49] = vqaddq_s16(x3[49], x4[54]);
  x5[54] = vqsubq_s16(x3[49], x4[54]);
  x5[50] = vqaddq_s16(x3[50], x4[53]);
  x5[53] = vqsubq_s16(x3[50], x4[53]);
  x5[51] = vqaddq_s16(x3[51], x4[52]);
  x5[52] = vqsubq_s16(x3[51], x4[52]);
  x5[56] = vqsubq_s16(x3[63], x4[56]);
  x5[63] = vqaddq_s16(x3[63], x4[56]);
  x5[57] = vqsubq_s16(x3[62], x4[57]);
  x5[62] = vqaddq_s16(x3[62], x4[57]);
  x5[58] = vqsubq_s16(x3[61], x4[58]);
  x5[61] = vqaddq_s16(x3[61], x4[58]);
  x5[59] = vqsubq_s16(x3[60], x4[59]);
  x5[60] = vqaddq_s16(x3[60], x4[59]);

  // stage 6
  int16x8_t x6[64];
  btf_16_neon_mode2(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
                    v_cos_bit);
  x6[4] = vqaddq_s16(x4[4], x5[5]);
  x6[5] = vqsubq_s16(x4[4], x5[5]);
  x6[6] = vqsubq_s16(x4[7], x5[6]);
  x6[7] = vqaddq_s16(x4[7], x5[6]);

  btf_16_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
                     v_cos_bit);

  x6[16] = vqaddq_s16(x4[16], x5[19]);
  x6[19] = vqsubq_s16(x4[16], x5[19]);
  x6[17] = vqaddq_s16(x4[17], x5[18]);
  x6[18] = vqsubq_s16(x4[17], x5[18]);
  x6[20] = vqsubq_s16(x4[23], x5[20]);
  x6[23] = vqaddq_s16(x4[23], x5[20]);
  x6[21] = vqsubq_s16(x4[22], x5[21]);
  x6[22] = vqaddq_s16(x4[22], x5[21]);
  x6[24] = vqaddq_s16(x4[24], x5[27]);
  x6[27] = vqsubq_s16(x4[24], x5[27]);
  x6[25] = vqaddq_s16(x4[25], x5[26]);
  x6[26] = vqsubq_s16(x4[25], x5[26]);
  x6[28] = vqsubq_s16(x4[31], x5[28]);
  x6[31] = vqaddq_s16(x4[31], x5[28]);
  x6[29] = vqsubq_s16(x4[30], x5[29]);
  x6[30] = vqaddq_s16(x4[30], x5[29]);

  btf_16_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
                     v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
                     v_cos_bit);

  // stage 7
  int16x8_t x7[64];

  btf_16_neon_mode2(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7], v_cos_bit);
  btf_16_neon_mode2(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
                    v_cos_bit);
  x7[8] = vqaddq_s16(x5[8], x6[9]);
  x7[9] = vqsubq_s16(x5[8], x6[9]);
  x7[10] = vqsubq_s16(x5[11], x6[10]);
  x7[11] = vqaddq_s16(x5[11], x6[10]);
  x7[12] = vqaddq_s16(x5[12], x6[13]);
  x7[13] = vqsubq_s16(x5[12], x6[13]);
  x7[14] = vqsubq_s16(x5[15], x6[14]);
  x7[15] = vqaddq_s16(x5[15], x6[14]);

  btf_16_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
                     v_cos_bit);

  btf_16_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
                     v_cos_bit);

  x7[32] = vqaddq_s16(x5[32], x6[35]);
  x7[35] = vqsubq_s16(x5[32], x6[35]);
  x7[33] = vqaddq_s16(x5[33], x6[34]);
  x7[34] = vqsubq_s16(x5[33], x6[34]);
  x7[36] = vqsubq_s16(x5[39], x6[36]);
  x7[39] = vqaddq_s16(x5[39], x6[36]);
  x7[37] = vqsubq_s16(x5[38], x6[37]);
  x7[38] = vqaddq_s16(x5[38], x6[37]);
  x7[40] = vqaddq_s16(x5[40], x6[43]);
  x7[43] = vqsubq_s16(x5[40], x6[43]);
  x7[41] = vqaddq_s16(x5[41], x6[42]);
  x7[42] = vqsubq_s16(x5[41], x6[42]);
  x7[44] = vqsubq_s16(x5[47], x6[44]);
  x7[47] = vqaddq_s16(x5[47], x6[44]);
  x7[45] = vqsubq_s16(x5[46], x6[45]);
  x7[46] = vqaddq_s16(x5[46], x6[45]);
  x7[48] = vqaddq_s16(x5[48], x6[51]);
  x7[51] = vqsubq_s16(x5[48], x6[51]);
  x7[49] = vqaddq_s16(x5[49], x6[50]);
  x7[50] = vqsubq_s16(x5[49], x6[50]);
  x7[52] = vqsubq_s16(x5[55], x6[52]);
  x7[55] = vqaddq_s16(x5[55], x6[52]);
  x7[53] = vqsubq_s16(x5[54], x6[53]);
  x7[54] = vqaddq_s16(x5[54], x6[53]);
  x7[56] = vqaddq_s16(x5[56], x6[59]);
  x7[59] = vqsubq_s16(x5[56], x6[59]);
  x7[57] = vqaddq_s16(x5[57], x6[58]);
  x7[58] = vqsubq_s16(x5[57], x6[58]);
  x7[60] = vqsubq_s16(x5[63], x6[60]);
  x7[63] = vqaddq_s16(x5[63], x6[60]);
  x7[61] = vqsubq_s16(x5[62], x6[61]);
  x7[62] = vqaddq_s16(x5[62], x6[61]);

  // stage 8
  int16x8_t x8[64];

  btf_16_neon_mode2(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
                    v_cos_bit);
  x8[16] = vqaddq_s16(x6[16], x7[17]);
  x8[17] = vqsubq_s16(x6[16], x7[17]);
  x8[18] = vqsubq_s16(x6[19], x7[18]);
  x8[19] = vqaddq_s16(x6[19], x7[18]);
  x8[20] = vqaddq_s16(x6[20], x7[21]);
  x8[21] = vqsubq_s16(x6[20], x7[21]);
  x8[22] = vqsubq_s16(x6[23], x7[22]);
  x8[23] = vqaddq_s16(x6[23], x7[22]);
  x8[24] = vqaddq_s16(x6[24], x7[25]);
  x8[25] = vqsubq_s16(x6[24], x7[25]);
  x8[26] = vqsubq_s16(x6[27], x7[26]);
  x8[27] = vqaddq_s16(x6[27], x7[26]);
  x8[28] = vqaddq_s16(x6[28], x7[29]);
  x8[29] = vqsubq_s16(x6[28], x7[29]);
  x8[30] = vqsubq_s16(x6[31], x7[30]);
  x8[31] = vqaddq_s16(x6[31], x7[30]);

  btf_16_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
                     v_cos_bit);
  btf_16_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
                    v_cos_bit);
  btf_16_neon_mode02(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
                     v_cos_bit);

  // stage 9
  int16x8_t x9[64];

  btf_16_neon_mode2(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
                    v_cos_bit);
  btf_16_neon_mode2(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
                    v_cos_bit);
  x9[32] = vqaddq_s16(x7[32], x8[33]);
  x9[33] = vqsubq_s16(x7[32], x8[33]);
  x9[34] = vqsubq_s16(x7[35], x8[34]);
  x9[35] = vqaddq_s16(x7[35], x8[34]);
  x9[36] = vqaddq_s16(x7[36], x8[37]);
  x9[37] = vqsubq_s16(x7[36], x8[37]);
  x9[38] = vqsubq_s16(x7[39], x8[38]);
  x9[39] = vqaddq_s16(x7[39], x8[38]);
  x9[40] = vqaddq_s16(x7[40], x8[41]);
  x9[41] = vqsubq_s16(x7[40], x8[41]);
  x9[42] = vqsubq_s16(x7[43], x8[42]);
  x9[43] = vqaddq_s16(x7[43], x8[42]);
  x9[44] = vqaddq_s16(x7[44], x8[45]);
  x9[45] = vqsubq_s16(x7[44], x8[45]);
  x9[46] = vqsubq_s16(x7[47], x8[46]);
  x9[47] = vqaddq_s16(x7[47], x8[46]);
  x9[48] = vqaddq_s16(x7[48], x8[49]);
  x9[49] = vqsubq_s16(x7[48], x8[49]);
  x9[50] = vqsubq_s16(x7[51], x8[50]);
  x9[51] = vqaddq_s16(x7[51], x8[50]);
  x9[52] = vqaddq_s16(x7[52], x8[53]);
  x9[53] = vqsubq_s16(x7[52], x8[53]);
  x9[54] = vqsubq_s16(x7[55], x8[54]);
  x9[55] = vqaddq_s16(x7[55], x8[54]);
  x9[56] = vqaddq_s16(x7[56], x8[57]);
  x9[57] = vqsubq_s16(x7[56], x8[57]);
  x9[58] = vqsubq_s16(x7[59], x8[58]);
  x9[59] = vqaddq_s16(x7[59], x8[58]);
  x9[60] = vqaddq_s16(x7[60], x8[61]);
  x9[61] = vqsubq_s16(x7[60], x8[61]);
  x9[62] = vqsubq_s16(x7[63], x8[62]);
  x9[63] = vqaddq_s16(x7[63], x8[62]);

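  // Stage 10 applies the final rotations to x9[32..63] and writes the
  // odd-indexed outputs directly. Stage 11 below is then just a scatter of
  // the already-finished even-indexed terms into bit-reversed positions.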
  // stage 10
  btf_16_neon_mode2(cospi[63], cospi[1], x9[32], x9[63], output[1], output[63],
                    v_cos_bit);

  btf_16_neon_mode2(cospi[31], cospi[33], x9[33], x9[62], output[33],
                    output[31], v_cos_bit);

  btf_16_neon_mode2(cospi[47], cospi[17], x9[34], x9[61], output[17],
                    output[47], v_cos_bit);

  btf_16_neon_mode2(cospi[15], cospi[49], x9[35], x9[60], output[49],
                    output[15], v_cos_bit);

  btf_16_neon_mode2(cospi[55], cospi[9], x9[36], x9[59], output[9], output[55],
                    v_cos_bit);

  btf_16_neon_mode2(cospi[23], cospi[41], x9[37], x9[58], output[41],
                    output[23], v_cos_bit);

  btf_16_neon_mode2(cospi[39], cospi[25], x9[38], x9[57], output[25],
                    output[39], v_cos_bit);

  btf_16_neon_mode2(cospi[7], cospi[57], x9[39], x9[56], output[57], output[7],
                    v_cos_bit);

  btf_16_neon_mode2(cospi[59], cospi[5], x9[40], x9[55], output[5], output[59],
                    v_cos_bit);

  btf_16_neon_mode2(cospi[27], cospi[37], x9[41], x9[54], output[37],
                    output[27], v_cos_bit);

  btf_16_neon_mode2(cospi[43], cospi[21], x9[42], x9[53], output[21],
                    output[43], v_cos_bit);

  btf_16_neon_mode2(cospi[11], cospi[53], x9[43], x9[52], output[53],
                    output[11], v_cos_bit);

  btf_16_neon_mode2(cospi[51], cospi[13], x9[44], x9[51], output[13],
                    output[51], v_cos_bit);

  btf_16_neon_mode2(cospi[19], cospi[45], x9[45], x9[50], output[45],
                    output[19], v_cos_bit);

  btf_16_neon_mode2(cospi[35], cospi[29], x9[46], x9[49], output[29],
                    output[35], v_cos_bit);

  btf_16_neon_mode2(cospi[3], cospi[61], x9[47], x9[48], output[61], output[3],
                    v_cos_bit);

  // stage 11
  output[0] = x6[0];
  output[2] = x9[16];
  output[4] = x8[8];
  output[6] = x9[24];
  output[8] = x7[4];
  output[10] = x9[20];
  output[12] = x8[12];
  output[14] = x9[28];
  output[16] = x6[2];
  output[18] = x9[18];
  output[20] = x8[10];
  output[22] = x9[26];
  output[24] = x7[6];
  output[26] = x9[22];
  output[28] = x8[14];
  output[30] = x9[30];
  output[32] = x6[1];
  output[34] = x9[17];
  output[36] = x8[9];
  output[38] = x9[25];
  output[40] = x7[5];
  output[42] = x9[21];
  output[44] = x8[13];
  output[46] = x9[29];
  output[48] = x6[3];
  output[50] = x9[19];
  output[52] = x8[11];
  output[54] = x9[27];
  output[56] = x7[7];
  output[58] = x9[23];
  output[60] = x8[15];
  output[62] = x9[31];
}

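// Forward 8-point ADST on eight 16-bit columns. Stage 1 negates a subset of
// the inputs so that the remaining stages reduce to DCT-style
// half-butterfly rotations, and the stage 6 rotations deposit their results
// directly in the permuted output order.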
void fadst_8x8_neon(const int16x8_t *input, int16x8_t *output, int8_t cos_bit,
                    const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[4];

  x1[0] = vqnegq_s16(input[7]);
  x1[1] = vqnegq_s16(input[3]);
  x1[2] = vqnegq_s16(input[1]);
  x1[3] = vqnegq_s16(input[5]);

  // stage 2
  int16x8_t x2[8];

  btf_16_neon_mode3(cospi[32], cospi[32], x1[1], input[4], x2[2], x2[3],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[32], cospi[32], input[2], x1[3], x2[6], x2[7],
                    v_cos_bit);

  // stage 3
  int16x8_t x3[8];
  x3[0] = vqaddq_s16(input[0], x2[2]);
  x3[2] = vqsubq_s16(input[0], x2[2]);
  x3[1] = vqaddq_s16(x1[0], x2[3]);
  x3[3] = vqsubq_s16(x1[0], x2[3]);
  x3[4] = vqaddq_s16(x1[2], x2[6]);
  x3[6] = vqsubq_s16(x1[2], x2[6]);
  x3[5] = vqaddq_s16(input[6], x2[7]);
  x3[7] = vqsubq_s16(input[6], x2[7]);

  // stage 4
  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
                    v_cos_bit);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[3] = vqaddq_s16(x3[3], x3[7]);
  x5[7] = vqsubq_s16(x3[3], x3[7]);

  // stage 6
  btf_16_neon_mode3(cospi[4], cospi[60], x5[0], x5[1], output[7], output[0],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[20], cospi[44], x5[2], x5[3], output[5], output[2],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[36], cospi[28], x5[4], x5[5], output[3], output[4],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[52], cospi[12], x5[6], x5[7], output[1], output[6],
                    v_cos_bit);
}

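// Forward 16-point ADST on eight 16-bit columns, built the same way as
// fadst_8x8_neon() above. The four-coefficient btf_16_neon() calls in
// stage 2 fold the required sign flips into the butterfly weights rather
// than negating the inputs first.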
static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
                           int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int16x8_t x1[12];
  x1[0] = vqnegq_s16(input[15]);
  x1[1] = vqnegq_s16(input[3]);
  x1[2] = vqnegq_s16(input[1]);
  x1[3] = vqnegq_s16(input[13]);

  // stage 2
  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[7], input[8],
              x1[4], x1[5]);
  btf_16_neon_mode1(cospi[32], cospi[32], input[4], input[11], x1[6], x1[7],
                    v_cos_bit);
  btf_16_neon_mode1(cospi[32], cospi[32], input[6], input[9], x1[8], x1[9],
                    v_cos_bit);
  btf_16_neon(-cospi[32], cospi[32], -cospi[32], -cospi[32], input[5],
              input[10], x1[10], x1[11]);

  // stage 3
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(input[0], x1[4]);
  x3[2] = vqsubq_s16(input[0], x1[4]);
  x3[1] = vqaddq_s16(x1[0], x1[5]);
  x3[3] = vqsubq_s16(x1[0], x1[5]);
  x3[4] = vqaddq_s16(x1[1], x1[6]);
  x3[6] = vqsubq_s16(x1[1], x1[6]);
  x3[5] = vqaddq_s16(input[12], x1[7]);
  x3[7] = vqsubq_s16(input[12], x1[7]);
  x3[8] = vqaddq_s16(x1[2], x1[8]);
  x3[10] = vqsubq_s16(x1[2], x1[8]);
  x3[9] = vqaddq_s16(input[14], x1[9]);
  x3[11] = vqsubq_s16(input[14], x1[9]);
  x3[12] = vqaddq_s16(input[2], x1[10]);
  x3[14] = vqsubq_s16(input[2], x1[10]);
  x3[13] = vqaddq_s16(x1[3], x1[11]);
  x3[15] = vqsubq_s16(x1[3], x1[11]);

  // stage 4
  btf_16_neon_mode3(cospi[16], cospi[48], x3[4], x3[5], x3[4], x3[5],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[48], cospi[16], x3[6], x3[7], x3[6], x3[7],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[16], cospi[48], x3[12], x3[13], x3[12], x3[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[48], cospi[16], x3[14], x3[15], x3[14], x3[15],
                    v_cos_bit);

  // stage 5
  int16x8_t x5[16];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[3] = vqaddq_s16(x3[3], x3[7]);
  x5[7] = vqsubq_s16(x3[3], x3[7]);
  x5[8] = vqaddq_s16(x3[8], x3[12]);
  x5[12] = vqsubq_s16(x3[8], x3[12]);
  x5[9] = vqaddq_s16(x3[9], x3[13]);
  x5[13] = vqsubq_s16(x3[9], x3[13]);
  x5[10] = vqaddq_s16(x3[10], x3[14]);
  x5[14] = vqsubq_s16(x3[10], x3[14]);
  x5[11] = vqaddq_s16(x3[11], x3[15]);
  x5[15] = vqsubq_s16(x3[11], x3[15]);

  // stage 6
  btf_16_neon_mode3(cospi[8], cospi[56], x5[8], x5[9], x5[8], x5[9], v_cos_bit);
  btf_16_neon_mode3(cospi[40], cospi[24], x5[10], x5[11], x5[10], x5[11],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[56], cospi[8], x5[12], x5[13], x5[12], x5[13],
                    v_cos_bit);
  btf_16_neon_mode0(cospi[24], cospi[40], x5[14], x5[15], x5[14], x5[15],
                    v_cos_bit);

  // stage 7
  int16x8_t x7[16];
  x7[0] = vqaddq_s16(x5[0], x5[8]);
  x7[8] = vqsubq_s16(x5[0], x5[8]);
  x7[1] = vqaddq_s16(x5[1], x5[9]);
  x7[9] = vqsubq_s16(x5[1], x5[9]);
  x7[2] = vqaddq_s16(x5[2], x5[10]);
  x7[10] = vqsubq_s16(x5[2], x5[10]);
  x7[3] = vqaddq_s16(x5[3], x5[11]);
  x7[11] = vqsubq_s16(x5[3], x5[11]);
  x7[4] = vqaddq_s16(x5[4], x5[12]);
  x7[12] = vqsubq_s16(x5[4], x5[12]);
  x7[5] = vqaddq_s16(x5[5], x5[13]);
  x7[13] = vqsubq_s16(x5[5], x5[13]);
  x7[6] = vqaddq_s16(x5[6], x5[14]);
  x7[14] = vqsubq_s16(x5[6], x5[14]);
  x7[7] = vqaddq_s16(x5[7], x5[15]);
  x7[15] = vqsubq_s16(x5[7], x5[15]);

  // stage 8
  btf_16_neon_mode3(cospi[2], cospi[62], x7[0], x7[1], output[15], output[0],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[10], cospi[54], x7[2], x7[3], output[13], output[2],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[18], cospi[46], x7[4], x7[5], output[11], output[4],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[26], cospi[38], x7[6], x7[7], output[9], output[6],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[34], cospi[30], x7[8], x7[9], output[7], output[8],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[42], cospi[22], x7[10], x7[11], output[5], output[10],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[50], cospi[14], x7[12], x7[13], output[3], output[12],
                    v_cos_bit);
  btf_16_neon_mode3(cospi[58], cospi[6], x7[14], x7[15], output[1], output[14],
                    v_cos_bit);
}

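// The identity (IDTX) kernels only rescale. With NewSqrt2 == 5793 and
// NewSqrt2Bits == 12 (av1_txfm.h), a NewSqrt2 multiply followed by a
// rounding shift right by NewSqrt2Bits scales by ~sqrt(2). Hence the
// 4-point kernels scale by sqrt(2), the 8-point by 2 (a saturating shift),
// the 16-point by 2 * sqrt(2) and the 32-point by 4, matching the scaling
// the scalar identity transforms apply.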
void av1_fidentity4x4_neon(const int16x8_t *const input,
                           int16x8_t *const output, const int8_t cos_bit,
                           const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  for (int i = 0; i < 4; ++i) {
    const int16x4_t b = vqrshrn_n_s32(
        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    output[i] = vcombine_s16(b, b);
  }
}

static INLINE void fidentity8x4_neon(const int16x8_t *const input,
                                     int16x8_t *const output,
                                     const int8_t cos_bit,
                                     const int8_t *stage_range) {
  (void)stage_range;
  (void)cos_bit;
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2);
  for (int i = 0; i < 4; ++i) {
    const int16x4_t b_lo = vqrshrn_n_s32(
        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    const int16x4_t b_hi = vqrshrn_n_s32(
        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    output[i] = vcombine_s16(b_lo, b_hi);
  }
}

void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
                       int8_t cos_bit, const int8_t *stage_range) {
  (void)cos_bit;
  (void)stage_range;
  int16x8_t one = vdupq_n_s16(1);
  output[0] = vqrshlq_s16(input[0], one);
  output[1] = vqrshlq_s16(input[1], one);
  output[2] = vqrshlq_s16(input[2], one);
  output[3] = vqrshlq_s16(input[3], one);
  output[4] = vqrshlq_s16(input[4], one);
  output[5] = vqrshlq_s16(input[5], one);
  output[6] = vqrshlq_s16(input[6], one);
  output[7] = vqrshlq_s16(input[7], one);
}

static INLINE void fidentity8x16_neon(const int16x8_t *input,
                                      int16x8_t *output, int8_t cos_bit,
                                      const int8_t *stage_range) {
  (void)stage_range;
  (void)cos_bit;
  const int16x4_t v_newsqrt2 = vdup_n_s16(NewSqrt2 * 2);
  for (int i = 0; i < 16; ++i) {
    const int16x4_t b_lo = vqrshrn_n_s32(
        vmull_s16(vget_low_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    const int16x4_t b_hi = vqrshrn_n_s32(
        vmull_s16(vget_high_s16(input[i]), v_newsqrt2), NewSqrt2Bits);
    output[i] = vcombine_s16(b_lo, b_hi);
  }
}

static INLINE void fidentity8x32_neon(const int16x8_t *input,
                                      int16x8_t *output, int8_t cos_bit,
                                      const int8_t *stage_range) {
  (void)stage_range;
  (void)cos_bit;
  for (int i = 0; i < 32; ++i) {
    output[i] = vshlq_n_s16(input[i], 2);
  }
}

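// Shared 1-D kernel signature for the 2-D drivers below. Each table maps
// the 16 TX_TYPE combinations to a column or row kernel; identity kernels
// stand in for the axis that is not transformed (V_DCT, H_ADST, ...) and
// NULL marks combinations with no NEON implementation, for which the
// drivers fall back to C.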
typedef void (*transform_1d_lbd_neon)(const int16x8_t *input,
                                      int16x8_t *output, int8_t cos_bit,
                                      const int8_t *stage_range);

static const transform_1d_lbd_neon col_txfm4x4_arr[TX_TYPES] = {
  av1_fdct4x4_neon,       // DCT_DCT
  av1_fadst4x4_neon,      // ADST_DCT
  av1_fdct4x4_neon,       // DCT_ADST
  av1_fadst4x4_neon,      // ADST_ADST
  av1_fadst4x4_neon,      // FLIPADST_DCT
  av1_fdct4x4_neon,       // DCT_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
  av1_fadst4x4_neon,      // ADST_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_ADST
  av1_fidentity4x4_neon,  // IDTX
  av1_fdct4x4_neon,       // V_DCT
  av1_fidentity4x4_neon,  // H_DCT
  av1_fadst4x4_neon,      // V_ADST
  av1_fidentity4x4_neon,  // H_ADST
  av1_fadst4x4_neon,      // V_FLIPADST
  av1_fidentity4x4_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm4x4_arr[TX_TYPES] = {
  av1_fdct4x4_neon,       // DCT_DCT
  av1_fdct4x4_neon,       // ADST_DCT
  av1_fadst4x4_neon,      // DCT_ADST
  av1_fadst4x4_neon,      // ADST_ADST
  av1_fdct4x4_neon,       // FLIPADST_DCT
  av1_fadst4x4_neon,      // DCT_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_FLIPADST
  av1_fadst4x4_neon,      // ADST_FLIPADST
  av1_fadst4x4_neon,      // FLIPADST_ADST
  av1_fidentity4x4_neon,  // IDTX
  av1_fidentity4x4_neon,  // V_DCT
  av1_fdct4x4_neon,       // H_DCT
  av1_fidentity4x4_neon,  // V_ADST
  av1_fadst4x4_neon,      // H_ADST
  av1_fidentity4x4_neon,  // V_FLIPADST
  av1_fadst4x4_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_neon,       // DCT_DCT
  fadst4x8_neon,      // ADST_DCT
  fdct4x8_neon,       // DCT_ADST
  fadst4x8_neon,      // ADST_ADST
  fadst4x8_neon,      // FLIPADST_DCT
  fdct4x8_neon,       // DCT_FLIPADST
  fadst4x8_neon,      // FLIPADST_FLIPADST
  fadst4x8_neon,      // ADST_FLIPADST
  fadst4x8_neon,      // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fdct4x8_neon,       // V_DCT
  fidentity8x8_neon,  // H_DCT
  fadst4x8_neon,      // V_ADST
  fidentity8x8_neon,  // H_ADST
  fadst4x8_neon,      // V_FLIPADST
  fidentity8x8_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_neon,       // DCT_DCT
  fdct8x4_neon,       // ADST_DCT
  fadst8x4_neon,      // DCT_ADST
  fadst8x4_neon,      // ADST_ADST
  fdct8x4_neon,       // FLIPADST_DCT
  fadst8x4_neon,      // DCT_FLIPADST
  fadst8x4_neon,      // FLIPADST_FLIPADST
  fadst8x4_neon,      // ADST_FLIPADST
  fadst8x4_neon,      // FLIPADST_ADST
  fidentity8x4_neon,  // IDTX
  fidentity8x4_neon,  // V_DCT
  fdct8x4_neon,       // H_DCT
  fidentity8x4_neon,  // V_ADST
  fadst8x4_neon,      // H_ADST
  fidentity8x4_neon,  // V_FLIPADST
  fadst8x4_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_neon,       // DCT_DCT
  fadst8x4_neon,      // ADST_DCT
  fdct8x4_neon,       // DCT_ADST
  fadst8x4_neon,      // ADST_ADST
  fadst8x4_neon,      // FLIPADST_DCT
  fdct8x4_neon,       // DCT_FLIPADST
  fadst8x4_neon,      // FLIPADST_FLIPADST
  fadst8x4_neon,      // ADST_FLIPADST
  fadst8x4_neon,      // FLIPADST_ADST
  fidentity8x4_neon,  // IDTX
  fdct8x4_neon,       // V_DCT
  fidentity8x4_neon,  // H_DCT
  fadst8x4_neon,      // V_ADST
  fidentity8x4_neon,  // H_ADST
  fadst8x4_neon,      // V_FLIPADST
  fidentity8x4_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_neon,       // DCT_DCT
  fdct4x8_neon,       // ADST_DCT
  fadst4x8_neon,      // DCT_ADST
  fadst4x8_neon,      // ADST_ADST
  fdct4x8_neon,       // FLIPADST_DCT
  fadst4x8_neon,      // DCT_FLIPADST
  fadst4x8_neon,      // FLIPADST_FLIPADST
  fadst4x8_neon,      // ADST_FLIPADST
  fadst4x8_neon,      // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fidentity8x8_neon,  // V_DCT
  fdct4x8_neon,       // H_DCT
  fidentity8x8_neon,  // V_ADST
  fadst4x8_neon,      // H_ADST
  fidentity8x8_neon,  // V_FLIPADST
  fadst4x8_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_neon,       // DCT_DCT
  fadst_8x8_neon,     // ADST_DCT
  fdct8x8_neon,       // DCT_ADST
  fadst_8x8_neon,     // ADST_ADST
  fadst_8x8_neon,     // FLIPADST_DCT
  fdct8x8_neon,       // DCT_FLIPADST
  fadst_8x8_neon,     // FLIPADST_FLIPADST
  fadst_8x8_neon,     // ADST_FLIPADST
  fadst_8x8_neon,     // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fdct8x8_neon,       // V_DCT
  fidentity8x8_neon,  // H_DCT
  fadst_8x8_neon,     // V_ADST
  fidentity8x8_neon,  // H_ADST
  fadst_8x8_neon,     // V_FLIPADST
  fidentity8x8_neon,  // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_neon,       // DCT_DCT
  fdct8x8_neon,       // ADST_DCT
  fadst_8x8_neon,     // DCT_ADST
  fadst_8x8_neon,     // ADST_ADST
  fdct8x8_neon,       // FLIPADST_DCT
  fadst_8x8_neon,     // DCT_FLIPADST
  fadst_8x8_neon,     // FLIPADST_FLIPADST
  fadst_8x8_neon,     // ADST_FLIPADST
  fadst_8x8_neon,     // FLIPADST_ADST
  fidentity8x8_neon,  // IDTX
  fidentity8x8_neon,  // V_DCT
  fdct8x8_neon,       // H_DCT
  fidentity8x8_neon,  // V_ADST
  fadst_8x8_neon,     // H_ADST
  fidentity8x8_neon,  // V_FLIPADST
  fadst_8x8_neon      // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_neon,       // DCT_DCT
  fadst8x16_neon,      // ADST_DCT
  fdct8x16_neon,       // DCT_ADST
  fadst8x16_neon,      // ADST_ADST
  fadst8x16_neon,      // FLIPADST_DCT
  fdct8x16_neon,       // DCT_FLIPADST
  fadst8x16_neon,      // FLIPADST_FLIPADST
  fadst8x16_neon,      // ADST_FLIPADST
  fadst8x16_neon,      // FLIPADST_ADST
  fidentity8x16_neon,  // IDTX
  fdct8x16_neon,       // V_DCT
  fidentity8x16_neon,  // H_DCT
  fadst8x16_neon,      // V_ADST
  fidentity8x16_neon,  // H_ADST
  fadst8x16_neon,      // V_FLIPADST
  fidentity8x16_neon   // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_neon,       // DCT_DCT
  fdct8x16_neon,       // ADST_DCT
  fadst8x16_neon,      // DCT_ADST
  fadst8x16_neon,      // ADST_ADST
  fdct8x16_neon,       // FLIPADST_DCT
  fadst8x16_neon,      // DCT_FLIPADST
  fadst8x16_neon,      // FLIPADST_FLIPADST
  fadst8x16_neon,      // ADST_FLIPADST
  fadst8x16_neon,      // FLIPADST_ADST
  fidentity8x16_neon,  // IDTX
  fidentity8x16_neon,  // V_DCT
  fdct8x16_neon,       // H_DCT
  fidentity8x16_neon,  // V_ADST
  fadst8x16_neon,      // H_ADST
  fidentity8x16_neon,  // V_FLIPADST
  fadst8x16_neon       // H_FLIPADST
};

static const transform_1d_lbd_neon row_txfm8x32_arr[TX_TYPES] = {
  av1_fdct8x32_neon,   // DCT_DCT
  NULL,                // ADST_DCT
  NULL,                // DCT_ADST
  NULL,                // ADST_ADST
  NULL,                // FLIPADST_DCT
  NULL,                // DCT_FLIPADST
  NULL,                // FLIPADST_FLIPADST
  NULL,                // ADST_FLIPADST
  NULL,                // FLIPADST_ADST
  fidentity8x32_neon,  // IDTX
  fidentity8x32_neon,  // V_DCT
  av1_fdct8x32_neon,   // H_DCT
  NULL,                // V_ADST
  NULL,                // H_ADST
  NULL,                // V_FLIPADST
  NULL                 // H_FLIPADST
};

static const transform_1d_lbd_neon col_txfm8x32_arr[TX_TYPES] = {
  av1_fdct8x32_neon,   // DCT_DCT
  NULL,                // ADST_DCT
  NULL,                // DCT_ADST
  NULL,                // ADST_ADST
  NULL,                // FLIPADST_DCT
  NULL,                // DCT_FLIPADST
  NULL,                // FLIPADST_FLIPADST
  NULL,                // ADST_FLIPADST
  NULL,                // FLIPADST_ADST
  fidentity8x32_neon,  // IDTX
  av1_fdct8x32_neon,   // V_DCT
  fidentity8x32_neon,  // H_DCT
  NULL,                // V_ADST
  NULL,                // H_ADST
  NULL,                // V_FLIPADST
  NULL                 // H_FLIPADST
};

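// All of the 2-D drivers below share one pipeline: load the input block
// (flipped vertically for FLIPADST column types), apply shift[0], run the
// column transform, apply shift[1], transpose in 8x8 (or 4-wide) tiles,
// optionally flip horizontally, run the row transform, apply shift[2] and
// transpose back before the 32-bit store.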
void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[4], buf1[4], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
  const int txw_idx = get_txw_idx(TX_4X4);
  const int txh_idx = get_txh_idx(TX_4X4);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 4;
  const int height = 4;
  const transform_1d_lbd_neon col_txfm = col_txfm4x4_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm4x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
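  // shift[] holds the three per-stage shifts as int8_t. They are widened to
  // 16 bits and broadcast so round_shift_16bit_vector() can apply them per
  // lane (negative counts presumably acting as rounding right shifts, as
  // with vqrshlq_s16). Note that vld1_s8() reads 8 bytes from a 3-entry
  // table, which relies on the shift tables being safely over-readable.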
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_4x4(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, height, &v_shift2);

  transpose_16bit_4x4(buf, buf);
  store_buffer_16bit_to_32bit_w4(buf, output, width, height);
}

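// 2:1 and 1:2 blocks (4x8, 8x4, 8x16, ...) need the extra rectangular
// scale factor required by the AV1 transform definition; the store_rect_*
// helpers are assumed to fold that NewSqrt2 rounding multiply into the
// final 32-bit store, which is why the square sizes use the plain
// store_buffer_* variants instead.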
void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)stride;
  (void)bd;
  int16x8_t buf0[8], buf1[8], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
  const int txw_idx = get_txw_idx(TX_4X8);
  const int txh_idx = get_txh_idx(TX_4X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 4;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm4x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_4x8(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, height, &v_shift2);
  transpose_16bit_8x4(buf, buf);
  store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
}

void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
  const int txw_idx = get_txw_idx(TX_4X16);
  const int txh_idx = get_txh_idx(TX_4X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 4;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_4x8(buf0, buf1);
  transpose_16bit_4x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + 8 * i, buf, width);
    } else {
      buf = buf1 + 8 * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, height, &v_shift2);
    transpose_16bit_8x4(buf, buf);
    store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
  }
}

void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[8], buf1[8], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
  const int txw_idx = get_txw_idx(TX_8X4);
  const int txh_idx = get_txh_idx(TX_8X4);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 4;
  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm4x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip)
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  else
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, height, &v_shift2);
  transpose_16bit_8x8(buf, buf);
  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
}

void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[8], buf1[8], *buf;
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
  const int txw_idx = get_txw_idx(TX_8X8);
  const int txh_idx = get_txh_idx(TX_8X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip)
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  else
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, height, &v_shift2);
  transpose_16bit_8x8(buf, buf);
  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
}

void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
  const int txw_idx = get_txw_idx(TX_8X16);
  const int txh_idx = get_txh_idx(TX_8X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);
  transpose_16bit_8x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, height, &v_shift2);
    transpose_16bit_8x8(buf, buf);
    store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
  }
}

void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
  const int txw_idx = get_txw_idx(TX_8X32);
  const int txh_idx = get_txh_idx(TX_8X32);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 8;
  const int height = 32;
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit(input, stride, buf0, height);
  }
  round_shift_16bit_vector(buf0, height, &v_shift0);
  col_txfm(buf0, buf0, cos_bit_col, NULL);
  round_shift_16bit_vector(buf0, height, &v_shift1);
  transpose_16bit_8x8(buf0, buf1);
  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
  transpose_16bit_8x8(buf0 + 16, buf1 + 16);
  transpose_16bit_8x8(buf0 + 24, buf1 + 24);

  for (int i = 0; i < 4; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, height, &v_shift2);
    transpose_16bit_8x8(buf, buf);
    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
  }
}

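// From width 16 upwards the column pass runs in 8-lane slices
// (input + 8 * i), and each transposed slice is interleaved into buf1 so
// that every row-pass iteration sees one contiguous 8-row strip of the
// full width.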
void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
  const int txw_idx = get_txw_idx(TX_16X4);
  const int txh_idx = get_txh_idx(TX_16X4);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 4;
  const transform_1d_lbd_neon col_txfm = col_txfm8x4_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
  int16x8_t *buf;
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    }
    round_shift_16bit_vector(buf0, height, &v_shift0);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit_vector(buf0, height, &v_shift1);
    transpose_16bit_8x4(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, height, &v_shift2);
  transpose_16bit_4x8(buf, buf);
  store_buffer_16bit_to_32bit_w8(buf, output, width, height);
  transpose_16bit_4x8(buf + 8, buf + 8);
  store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
}

void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
  const int txw_idx = get_txw_idx(TX_16X8);
  const int txh_idx = get_txh_idx(TX_16X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
  int16x8_t *buf;
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    }
    round_shift_16bit_vector(buf0, height, &v_shift0);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit_vector(buf0, height, &v_shift1);
    transpose_16bit_8x8(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    buf = buf0;
    flip_buf_neon(buf1, buf, width);
  } else {
    buf = buf1;
  }
  row_txfm(buf, buf, cos_bit_row, NULL);
  round_shift_16bit_vector(buf, height, &v_shift2);
  transpose_16bit_8x8(buf, buf);
  store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
  transpose_16bit_8x8(buf + 8, buf + 8);
  store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
}

void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
  const int txw_idx = get_txw_idx(TX_16X16);
  const int txh_idx = get_txh_idx(TX_16X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
  const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
  const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
  const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);

  for (int i = 0; i < 2; i++) {
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    }
    round_shift_16bit_vector(buf0, height, &v_shift0);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit_vector(buf0, height, &v_shift1);
    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
  }

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf;
    if (lr_flip) {
      buf = buf0;
      flip_buf_neon(buf1 + width * i, buf, width);
    } else {
      buf = buf1 + width * i;
    }
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit_vector(buf, height, &v_shift2);
    transpose_16bit_8x8(buf, buf);
    store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
    transpose_16bit_8x8(buf + 8, buf + 8);
    store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
                                   8);
  }
}

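// 32-point 1-D kernels exist only for DCT and identity, so the remaining
// drivers check the kernel table lookups for NULL and fall back to the
// generic C implementation for the unsupported tx types.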
void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
  const int txw_idx = get_txw_idx(TX_16X32);
  const int txh_idx = get_txh_idx(TX_16X32);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 16;
  const int height = 32;
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x16_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);

    for (int i = 0; i < 2; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit_vector(buf0, height, &v_shift0);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit_vector(buf0, height, &v_shift1);
      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
    }

    for (int i = 0; i < 4; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit_vector(buf, height, &v_shift2);
      transpose_16bit_8x8(buf, buf);
      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
                                          8);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
                                          width, 8);
    }
  } else {
    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
  }
}

void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
  const int txw_idx = get_txw_idx(TX_32X8);
  const int txh_idx = get_txh_idx(TX_32X8);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 8;
  const transform_1d_lbd_neon col_txfm = col_txfm8x8_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);

    for (int i = 0; i < 4; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit_vector(buf0, height, &v_shift0);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit_vector(buf0, height, &v_shift1);
      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
    }

    for (int i = 0; i < 1; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit_vector(buf, width, &v_shift2);
      transpose_16bit_8x8(buf, buf);
      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
                                     height);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
                                     height);
      transpose_16bit_8x8(buf + 16, buf + 16);
      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
                                     width, height);
      transpose_16bit_8x8(buf + 24, buf + 24);
      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
                                     width, height);
    }
  } else {
    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
  }
}

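// 32x16 forward transform: 16-point column transforms in four 8-wide
// batches, then two 32-point row passes. As a 2:1 rectangular size the
// result is written with the rect store helper, which folds in the extra
// NewSqrt2 rectangular scaling.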
void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
  const int txw_idx = get_txw_idx(TX_32X16);
  const int txh_idx = get_txh_idx(TX_32X16);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 16;
  const transform_1d_lbd_neon col_txfm = col_txfm8x16_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    const int16x4_t v_shifts = vget_low_s16(vmovl_s8(vld1_s8(&shift[0])));
    const int16x8_t v_shift0 = vdupq_lane_s16(v_shifts, 0);
    const int16x8_t v_shift1 = vdupq_lane_s16(v_shifts, 1);
    const int16x8_t v_shift2 = vdupq_lane_s16(v_shifts, 2);
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    for (int i = 0; i < 4; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit_vector(buf0, height, &v_shift0);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit_vector(buf0, height, &v_shift1);
      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
    }

    for (int i = 0; i < 2; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit_vector(buf, width, &v_shift2);
      transpose_16bit_8x8(buf, buf);
      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
                                          8);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
                                          width, 8);
      transpose_16bit_8x8(buf + 16, buf + 16);
      store_rect_buffer_16bit_to_32bit_w8(buf + 16,
                                          output + 8 * width * i + 16, width,
                                          8);
      transpose_16bit_8x8(buf + 24, buf + 24);
      store_rect_buffer_16bit_to_32bit_w8(buf + 24,
                                          output + 8 * width * i + 24, width,
                                          8);
    }
  } else {
    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
  }
}

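// 32x32 forward transform: 32-point transforms in both directions. The
// transposed 32x32 intermediate lives in buf1 as four strips of eight rows
// (32 registers each), processed one strip per iteration of the row loop.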
void av1_lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
  const int txw_idx = get_txw_idx(TX_32X32);
  const int txh_idx = get_txh_idx(TX_32X32);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 32;
  const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
  const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm != NULL && row_txfm != NULL) {
    int ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);

    for (int i = 0; i < 4; i++) {
      if (ud_flip) {
        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
      } else {
        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
      }
      round_shift_16bit(buf0, height, shift[0]);
      col_txfm(buf0, buf0, cos_bit_col, NULL);
      round_shift_16bit(buf0, height, shift[1]);
      transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
      transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
    }

    for (int i = 0; i < 4; i++) {
      int16x8_t *buf;
      if (lr_flip) {
        buf = buf0;
        flip_buf_neon(buf1 + width * i, buf, width);
      } else {
        buf = buf1 + width * i;
      }
      row_txfm(buf, buf, cos_bit_row, NULL);
      round_shift_16bit(buf, width, shift[2]);
      transpose_16bit_8x8(buf, buf);
      store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
      transpose_16bit_8x8(buf + 8, buf + 8);
      store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
                                     width, 8);
      transpose_16bit_8x8(buf + 16, buf + 16);
      store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
                                     width, 8);
      transpose_16bit_8x8(buf + 24, buf + 24);
      store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
                                     width, 8);
    }
  } else {
    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
  }
}

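// 64x16 forward transform (DCT_DCT only): a 16-point column transform
// followed by a 64-point row transform. Only the 32 lowest row frequencies
// are stored (with an output stride of 32), since AV1 keeps just the first
// 32 coefficients of a 64-point transform.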
void av1_lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X16;
  int16x8_t buf0[64], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = fdct8x16_neon;
  const transform_1d_lbd_neon row_txfm = av1_fdct8x64_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < height_div8; ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

  for (int i = 0; i < height_div8; i++) {
    int16x8_t *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit(buf, width, shift[2]);
    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < 4; ++j) {
      int16x8_t *buf8 = buf + 8 * j;
      transpose_16bit_8x8(buf8, buf8);
      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
    }
  }
}

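// 16x64 forward transform (DCT_DCT only): a 64-point column transform and a
// 16-point row transform. Only the top 16x32 of the result is computed; the
// bottom half of the output is zeroed explicitly at the end.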
void av1_lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_16X64;
  int16x8_t buf0[64], buf1[128];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
  const transform_1d_lbd_neon row_txfm = fdct8x16_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < height_div8; ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int16x8_t *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row, NULL);
    round_shift_16bit(buf, width, shift[2]);
    int32_t *output8 = output + 8 * width * i;
    for (int j = 0; j < width_div8; ++j) {
      int16x8_t *buf8 = buf + 8 * j;
      transpose_16bit_8x8(buf8, buf8);
      store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
    }
  }
  // Zero out the bottom 16x32 area.
  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}

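// Transpose a 4x4 block of 32-bit lanes with two rounds of vzipq_s32:
//
//   x0 = { 00 01 02 03 }          y0 = { 00 10 20 30 }
//   x1 = { 10 11 12 13 }   ==>    y1 = { 01 11 21 31 }
//   x2 = { 20 21 22 23 }          y2 = { 02 12 22 32 }
//   x3 = { 30 31 32 33 }          y3 = { 03 13 23 33 }
//
// Note the zip network delivers columns in row order 0,2,1,3, so the layout
// above holds when the rows are passed as (r0, r2, r1, r3) — which is
// exactly how transpose_32_4x4x2 below invokes it.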
#define TRANSPOSE_4X4_L32(x0, x1, x2, x3, y0, y1, y2, y3)        \
  do {                                                           \
    int32x4x2_t temp01 = vzipq_s32(x0, x1);                      \
    int32x4x2_t temp23 = vzipq_s32(x2, x3);                      \
    int32x4x2_t y01 = vzipq_s32(temp01.val[0], temp23.val[0]);   \
    int32x4x2_t y23 = vzipq_s32(temp01.val[1], temp23.val[1]);   \
    y0 = y01.val[0];                                             \
    y1 = y01.val[1];                                             \
    y2 = y23.val[0];                                             \
    y3 = y23.val[1];                                             \
  } while (0)

static INLINE void transpose_32_4x4x2(int stride, const int32x4_t *inputA,
                                      const int32x4_t *inputB,
                                      int32x4_t *output) {
  TRANSPOSE_4X4_L32(inputA[0], inputA[2], inputA[1], inputA[3],
                    output[0 * stride], output[1 * stride],
                    output[2 * stride], output[3 * stride]);
  TRANSPOSE_4X4_L32(inputB[0], inputB[2], inputB[1], inputB[3],
                    output[4 * stride], output[5 * stride],
                    output[6 * stride], output[7 * stride]);
}

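// 32-point forward DCT on int32x4_t lanes, written as the standard 9-stage
// butterfly network with buf0/buf1 ping-ponged between stages. The btf_32_*
// helpers used here compute a rotation pair of the form
// round_shift(+/-w0 * a +/- w1 * b, cos_bit); the _mode0/_mode01/_type1
// variants differ only in the sign pattern. 'stride' allows a
// non-contiguous input/output layout, and the final scatter stage converts
// the internal bit-reversed coefficient order to natural frequency order.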
static void av1_fdct32_new_neon(int32x4_t *input, int32x4_t *output,
                                int cos_bit, const int stride,
                                const int8_t *stage_range) {
  (void)stage_range;
  int32x4_t buf0[32];
  int32x4_t buf1[32];
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int startidx = 0 * stride;
  int endidx = 31 * stride;
  // stage 0
  // stage 1
  buf1[0] = vaddq_s32(input[startidx], input[endidx]);
  buf1[31] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[1] = vaddq_s32(input[startidx], input[endidx]);
  buf1[30] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[2] = vaddq_s32(input[startidx], input[endidx]);
  buf1[29] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[3] = vaddq_s32(input[startidx], input[endidx]);
  buf1[28] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[4] = vaddq_s32(input[startidx], input[endidx]);
  buf1[27] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[5] = vaddq_s32(input[startidx], input[endidx]);
  buf1[26] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[6] = vaddq_s32(input[startidx], input[endidx]);
  buf1[25] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[7] = vaddq_s32(input[startidx], input[endidx]);
  buf1[24] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[8] = vaddq_s32(input[startidx], input[endidx]);
  buf1[23] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[9] = vaddq_s32(input[startidx], input[endidx]);
  buf1[22] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[10] = vaddq_s32(input[startidx], input[endidx]);
  buf1[21] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[11] = vaddq_s32(input[startidx], input[endidx]);
  buf1[20] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[12] = vaddq_s32(input[startidx], input[endidx]);
  buf1[19] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[13] = vaddq_s32(input[startidx], input[endidx]);
  buf1[18] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[14] = vaddq_s32(input[startidx], input[endidx]);
  buf1[17] = vsubq_s32(input[startidx], input[endidx]);
  startidx += stride;
  endidx -= stride;
  buf1[15] = vaddq_s32(input[startidx], input[endidx]);
  buf1[16] = vsubq_s32(input[startidx], input[endidx]);

  // stage 2
  buf0[0] = vaddq_s32(buf1[0], buf1[15]);
  buf0[15] = vsubq_s32(buf1[0], buf1[15]);
  buf0[1] = vaddq_s32(buf1[1], buf1[14]);
  buf0[14] = vsubq_s32(buf1[1], buf1[14]);
  buf0[2] = vaddq_s32(buf1[2], buf1[13]);
  buf0[13] = vsubq_s32(buf1[2], buf1[13]);
  buf0[3] = vaddq_s32(buf1[3], buf1[12]);
  buf0[12] = vsubq_s32(buf1[3], buf1[12]);
  buf0[4] = vaddq_s32(buf1[4], buf1[11]);
  buf0[11] = vsubq_s32(buf1[4], buf1[11]);
  buf0[5] = vaddq_s32(buf1[5], buf1[10]);
  buf0[10] = vsubq_s32(buf1[5], buf1[10]);
  buf0[6] = vaddq_s32(buf1[6], buf1[9]);
  buf0[9] = vsubq_s32(buf1[6], buf1[9]);
  buf0[7] = vaddq_s32(buf1[7], buf1[8]);
  buf0[8] = vsubq_s32(buf1[7], buf1[8]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  buf0[18] = buf1[18];
  buf0[19] = buf1[19];
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
                    buf0[27], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
                    buf0[26], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
                    buf0[25], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
                    buf0[24], v_cos_bit);
  buf0[28] = buf1[28];
  buf0[29] = buf1[29];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 3
  buf1[0] = vaddq_s32(buf0[0], buf0[7]);
  buf1[7] = vsubq_s32(buf0[0], buf0[7]);
  buf1[1] = vaddq_s32(buf0[1], buf0[6]);
  buf1[6] = vsubq_s32(buf0[1], buf0[6]);
  buf1[2] = vaddq_s32(buf0[2], buf0[5]);
  buf1[5] = vsubq_s32(buf0[2], buf0[5]);
  buf1[3] = vaddq_s32(buf0[3], buf0[4]);
  buf1[4] = vsubq_s32(buf0[3], buf0[4]);
  buf1[8] = buf0[8];
  buf1[9] = buf0[9];
  btf_32_neon_mode0(cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
                    buf1[13], v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
                    buf1[12], v_cos_bit);
  buf1[14] = buf0[14];
  buf1[15] = buf0[15];
  buf1[16] = vaddq_s32(buf0[16], buf0[23]);
  buf1[23] = vsubq_s32(buf0[16], buf0[23]);
  buf1[17] = vaddq_s32(buf0[17], buf0[22]);
  buf1[22] = vsubq_s32(buf0[17], buf0[22]);
  buf1[18] = vaddq_s32(buf0[18], buf0[21]);
  buf1[21] = vsubq_s32(buf0[18], buf0[21]);
  buf1[19] = vaddq_s32(buf0[19], buf0[20]);
  buf1[20] = vsubq_s32(buf0[19], buf0[20]);
  buf1[24] = vsubq_s32(buf0[31], buf0[24]);
  buf1[31] = vaddq_s32(buf0[31], buf0[24]);
  buf1[25] = vsubq_s32(buf0[30], buf0[25]);
  buf1[30] = vaddq_s32(buf0[30], buf0[25]);
  buf1[26] = vsubq_s32(buf0[29], buf0[26]);
  buf1[29] = vaddq_s32(buf0[29], buf0[26]);
  buf1[27] = vsubq_s32(buf0[28], buf0[27]);
  buf1[28] = vaddq_s32(buf0[28], buf0[27]);

  // stage 4
  buf0[0] = vaddq_s32(buf1[0], buf1[3]);
  buf0[3] = vsubq_s32(buf1[0], buf1[3]);
  buf0[1] = vaddq_s32(buf1[1], buf1[2]);
  buf0[2] = vsubq_s32(buf1[1], buf1[2]);
  buf0[4] = buf1[4];
  btf_32_neon_mode0(cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
                    v_cos_bit);
  buf0[7] = buf1[7];
  buf0[8] = vaddq_s32(buf1[8], buf1[11]);
  buf0[11] = vsubq_s32(buf1[8], buf1[11]);
  buf0[9] = vaddq_s32(buf1[9], buf1[10]);
  buf0[10] = vsubq_s32(buf1[9], buf1[10]);
  buf0[12] = vsubq_s32(buf1[15], buf1[12]);
  buf0[15] = vaddq_s32(buf1[15], buf1[12]);
  buf0[13] = vsubq_s32(buf1[14], buf1[13]);
  buf0[14] = vaddq_s32(buf1[14], buf1[13]);
  buf0[16] = buf1[16];
  buf0[17] = buf1[17];
  btf_32_neon_mode0(cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
                    buf0[29], v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
                    buf0[28], v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], buf1[20], buf1[27], buf0[20],
                     buf0[27], v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], buf1[21], buf1[26], buf0[21],
                     buf0[26], v_cos_bit);
  buf0[22] = buf1[22];
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[25] = buf1[25];
  buf0[30] = buf1[30];
  buf0[31] = buf1[31];

  // stage 5
  btf_32_neon(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
              v_cos_bit);
  btf_32_type1_neon(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
                    v_cos_bit);
  buf1[4] = vaddq_s32(buf0[4], buf0[5]);
  buf1[5] = vsubq_s32(buf0[4], buf0[5]);
  buf1[6] = vsubq_s32(buf0[7], buf0[6]);
  buf1[7] = vaddq_s32(buf0[7], buf0[6]);
  buf1[8] = buf0[8];
  btf_32_neon_mode0(cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], buf0[10], buf0[13], buf1[10],
                     buf1[13], v_cos_bit);
  buf1[11] = buf0[11];
  buf1[12] = buf0[12];
  buf1[15] = buf0[15];
  buf1[16] = vaddq_s32(buf0[16], buf0[19]);
  buf1[19] = vsubq_s32(buf0[16], buf0[19]);
  buf1[17] = vaddq_s32(buf0[17], buf0[18]);
  buf1[18] = vsubq_s32(buf0[17], buf0[18]);
  buf1[20] = vsubq_s32(buf0[23], buf0[20]);
  buf1[23] = vaddq_s32(buf0[23], buf0[20]);
  buf1[21] = vsubq_s32(buf0[22], buf0[21]);
  buf1[22] = vaddq_s32(buf0[22], buf0[21]);
  buf1[24] = vaddq_s32(buf0[24], buf0[27]);
  buf1[27] = vsubq_s32(buf0[24], buf0[27]);
  buf1[25] = vaddq_s32(buf0[25], buf0[26]);
  buf1[26] = vsubq_s32(buf0[25], buf0[26]);
  buf1[28] = vsubq_s32(buf0[31], buf0[28]);
  buf1[31] = vaddq_s32(buf0[31], buf0[28]);
  buf1[29] = vsubq_s32(buf0[30], buf0[29]);
  buf1[30] = vaddq_s32(buf0[30], buf0[29]);

  // stage 6
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  btf_32_type1_neon(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
                    v_cos_bit);
  btf_32_type1_neon(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
                    v_cos_bit);
  buf0[8] = vaddq_s32(buf1[8], buf1[9]);
  buf0[9] = vsubq_s32(buf1[8], buf1[9]);
  buf0[10] = vsubq_s32(buf1[11], buf1[10]);
  buf0[11] = vaddq_s32(buf1[11], buf1[10]);
  buf0[12] = vaddq_s32(buf1[12], buf1[13]);
  buf0[13] = vsubq_s32(buf1[12], buf1[13]);
  buf0[14] = vsubq_s32(buf1[15], buf1[14]);
  buf0[15] = vaddq_s32(buf1[15], buf1[14]);
  buf0[16] = buf1[16];
  btf_32_neon_mode0(cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
                    buf0[30], v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], buf1[18], buf1[29], buf0[18],
                     buf0[29], v_cos_bit);
  buf0[19] = buf1[19];
  buf0[20] = buf1[20];
  btf_32_neon_mode0(cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
                    buf0[26], v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], buf1[22], buf1[25], buf0[22],
                     buf0[25], v_cos_bit);
  buf0[23] = buf1[23];
  buf0[24] = buf1[24];
  buf0[27] = buf1[27];
  buf0[28] = buf1[28];
  buf0[31] = buf1[31];

  // stage 7
  buf1[0] = buf0[0];
  buf1[1] = buf0[1];
  buf1[2] = buf0[2];
  buf1[3] = buf0[3];
  buf1[4] = buf0[4];
  buf1[5] = buf0[5];
  buf1[6] = buf0[6];
  buf1[7] = buf0[7];

  btf_32_type1_neon(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
                    v_cos_bit);
  btf_32_type1_neon(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], buf1[14],
                    v_cos_bit);
  btf_32_type1_neon(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
                    buf1[13], v_cos_bit);
  btf_32_type1_neon(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
                    buf1[12], v_cos_bit);
  buf1[16] = vaddq_s32(buf0[16], buf0[17]);
  buf1[17] = vsubq_s32(buf0[16], buf0[17]);
  buf1[18] = vsubq_s32(buf0[19], buf0[18]);
  buf1[19] = vaddq_s32(buf0[19], buf0[18]);
  buf1[20] = vaddq_s32(buf0[20], buf0[21]);
  buf1[21] = vsubq_s32(buf0[20], buf0[21]);
  buf1[22] = vsubq_s32(buf0[23], buf0[22]);
  buf1[23] = vaddq_s32(buf0[23], buf0[22]);
  buf1[24] = vaddq_s32(buf0[24], buf0[25]);
  buf1[25] = vsubq_s32(buf0[24], buf0[25]);
  buf1[26] = vsubq_s32(buf0[27], buf0[26]);
  buf1[27] = vaddq_s32(buf0[27], buf0[26]);
  buf1[28] = vaddq_s32(buf0[28], buf0[29]);
  buf1[29] = vsubq_s32(buf0[28], buf0[29]);
  buf1[30] = vsubq_s32(buf0[31], buf0[30]);
  buf1[31] = vaddq_s32(buf0[31], buf0[30]);

  // stage 8
  buf0[0] = buf1[0];
  buf0[1] = buf1[1];
  buf0[2] = buf1[2];
  buf0[3] = buf1[3];
  buf0[4] = buf1[4];
  buf0[5] = buf1[5];
  buf0[6] = buf1[6];
  buf0[7] = buf1[7];
  buf0[8] = buf1[8];
  buf0[9] = buf1[9];
  buf0[10] = buf1[10];
  buf0[11] = buf1[11];
  buf0[12] = buf1[12];
  buf0[13] = buf1[13];
  buf0[14] = buf1[14];
  buf0[15] = buf1[15];

  btf_32_type1_neon(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
                    buf0[31], v_cos_bit);
  btf_32_type1_neon(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
                    buf0[30], v_cos_bit);
  btf_32_type1_neon(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
                    buf0[29], v_cos_bit);
  btf_32_type1_neon(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
                    buf0[28], v_cos_bit);
  btf_32_type1_neon(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
                    buf0[27], v_cos_bit);
  btf_32_type1_neon(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
                    buf0[26], v_cos_bit);
  btf_32_type1_neon(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
                    buf0[25], v_cos_bit);
  btf_32_type1_neon(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
                    buf0[24], v_cos_bit);

  startidx = 0 * stride;
  endidx = 31 * stride;
  // stage 9
  output[startidx] = buf0[0];
  output[endidx] = buf0[31];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[16];
  output[endidx] = buf0[15];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[8];
  output[endidx] = buf0[23];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[24];
  output[endidx] = buf0[7];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[4];
  output[endidx] = buf0[27];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[20];
  output[endidx] = buf0[11];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[12];
  output[endidx] = buf0[19];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[28];
  output[endidx] = buf0[3];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[2];
  output[endidx] = buf0[29];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[18];
  output[endidx] = buf0[13];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[10];
  output[endidx] = buf0[21];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[26];
  output[endidx] = buf0[5];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[6];
  output[endidx] = buf0[25];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[22];
  output[endidx] = buf0[9];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[14];
  output[endidx] = buf0[17];
  startidx += stride;
  endidx -= stride;
  output[startidx] = buf0[30];
  output[endidx] = buf0[1];
}

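// Stages 1-4 of the 64-point forward DCT, split out to keep
// av1_fdct64_new_neon below a manageable size. Both x3 and x4 are returned
// because the later stages read some lanes from each: identity copies
// between stages are elided rather than propagated forward.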
static void av1_fdct64_new_stage1234_neon(int32x4_t *input, const int instride,
                                          int32x4_t *x3, int32x4_t *x4,
                                          const int32_t *cospi,
                                          const int32x4_t *v_cos_bit,
                                          int *startidx, int *endidx) {
  // stage 1
  int32x4_t x1[64];
  x1[0] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[63] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[1] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[62] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[2] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[61] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[3] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[60] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[4] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[59] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[5] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[58] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[6] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[57] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[7] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[56] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[8] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[55] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[9] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[54] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[10] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[53] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[11] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[52] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[12] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[51] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[13] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[50] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[14] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[49] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[15] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[48] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[16] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[47] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[17] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[46] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[18] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[45] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[19] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[44] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[20] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[43] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[21] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[42] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[22] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[41] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[23] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[40] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[24] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[39] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[25] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[38] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[26] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[37] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[27] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[36] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[28] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[35] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[29] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[34] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[30] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[33] = vsubq_s32(input[*startidx], input[*endidx]);
  *startidx += instride;
  *endidx -= instride;
  x1[31] = vaddq_s32(input[*startidx], input[*endidx]);
  x1[32] = vsubq_s32(input[*startidx], input[*endidx]);

  // stage 2
  int32x4_t x2[64];
  x2[0] = vaddq_s32(x1[0], x1[31]);
  x2[31] = vsubq_s32(x1[0], x1[31]);
  x2[1] = vaddq_s32(x1[1], x1[30]);
  x2[30] = vsubq_s32(x1[1], x1[30]);
  x2[2] = vaddq_s32(x1[2], x1[29]);
  x2[29] = vsubq_s32(x1[2], x1[29]);
  x2[3] = vaddq_s32(x1[3], x1[28]);
  x2[28] = vsubq_s32(x1[3], x1[28]);
  x2[4] = vaddq_s32(x1[4], x1[27]);
  x2[27] = vsubq_s32(x1[4], x1[27]);
  x2[5] = vaddq_s32(x1[5], x1[26]);
  x2[26] = vsubq_s32(x1[5], x1[26]);
  x2[6] = vaddq_s32(x1[6], x1[25]);
  x2[25] = vsubq_s32(x1[6], x1[25]);
  x2[7] = vaddq_s32(x1[7], x1[24]);
  x2[24] = vsubq_s32(x1[7], x1[24]);
  x2[8] = vaddq_s32(x1[8], x1[23]);
  x2[23] = vsubq_s32(x1[8], x1[23]);
  x2[9] = vaddq_s32(x1[9], x1[22]);
  x2[22] = vsubq_s32(x1[9], x1[22]);
  x2[10] = vaddq_s32(x1[10], x1[21]);
  x2[21] = vsubq_s32(x1[10], x1[21]);
  x2[11] = vaddq_s32(x1[11], x1[20]);
  x2[20] = vsubq_s32(x1[11], x1[20]);
  x2[12] = vaddq_s32(x1[12], x1[19]);
  x2[19] = vsubq_s32(x1[12], x1[19]);
  x2[13] = vaddq_s32(x1[13], x1[18]);
  x2[18] = vsubq_s32(x1[13], x1[18]);
  x2[14] = vaddq_s32(x1[14], x1[17]);
  x2[17] = vsubq_s32(x1[14], x1[17]);
  x2[15] = vaddq_s32(x1[15], x1[16]);
  x2[16] = vsubq_s32(x1[15], x1[16]);

  btf_32_neon_mode0(cospi[32], cospi[32], x1[40], x1[55], x2[40], x2[55],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[41], x1[54], x2[41], x2[54],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[42], x1[53], x2[42], x2[53],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[43], x1[52], x2[43], x2[52],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[44], x1[51], x2[44], x2[51],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[45], x1[50], x2[45], x2[50],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[46], x1[49], x2[46], x2[49],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x1[47], x1[48], x2[47], x2[48],
                    *v_cos_bit);

  // stage 3
  x3[0] = vaddq_s32(x2[0], x2[15]);
  x3[15] = vsubq_s32(x2[0], x2[15]);
  x3[1] = vaddq_s32(x2[1], x2[14]);
  x3[14] = vsubq_s32(x2[1], x2[14]);
  x3[2] = vaddq_s32(x2[2], x2[13]);
  x3[13] = vsubq_s32(x2[2], x2[13]);
  x3[3] = vaddq_s32(x2[3], x2[12]);
  x3[12] = vsubq_s32(x2[3], x2[12]);
  x3[4] = vaddq_s32(x2[4], x2[11]);
  x3[11] = vsubq_s32(x2[4], x2[11]);
  x3[5] = vaddq_s32(x2[5], x2[10]);
  x3[10] = vsubq_s32(x2[5], x2[10]);
  x3[6] = vaddq_s32(x2[6], x2[9]);
  x3[9] = vsubq_s32(x2[6], x2[9]);
  x3[7] = vaddq_s32(x2[7], x2[8]);
  x3[8] = vsubq_s32(x2[7], x2[8]);

  btf_32_neon_mode0(cospi[32], cospi[32], x2[20], x2[27], x3[20], x3[27],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x2[21], x2[26], x3[21], x3[26],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x2[22], x2[25], x3[22], x3[25],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x2[23], x2[24], x3[23], x3[24],
                    *v_cos_bit);

  x3[32] = vaddq_s32(x1[32], x2[47]);
  x3[47] = vsubq_s32(x1[32], x2[47]);
  x3[33] = vaddq_s32(x1[33], x2[46]);
  x3[46] = vsubq_s32(x1[33], x2[46]);
  x3[34] = vaddq_s32(x1[34], x2[45]);
  x3[45] = vsubq_s32(x1[34], x2[45]);
  x3[35] = vaddq_s32(x1[35], x2[44]);
  x3[44] = vsubq_s32(x1[35], x2[44]);
  x3[36] = vaddq_s32(x1[36], x2[43]);
  x3[43] = vsubq_s32(x1[36], x2[43]);
  x3[37] = vaddq_s32(x1[37], x2[42]);
  x3[42] = vsubq_s32(x1[37], x2[42]);
  x3[38] = vaddq_s32(x1[38], x2[41]);
  x3[41] = vsubq_s32(x1[38], x2[41]);
  x3[39] = vaddq_s32(x1[39], x2[40]);
  x3[40] = vsubq_s32(x1[39], x2[40]);
  x3[48] = vsubq_s32(x1[63], x2[48]);
  x3[63] = vaddq_s32(x1[63], x2[48]);
  x3[49] = vsubq_s32(x1[62], x2[49]);
  x3[62] = vaddq_s32(x1[62], x2[49]);
  x3[50] = vsubq_s32(x1[61], x2[50]);
  x3[61] = vaddq_s32(x1[61], x2[50]);
  x3[51] = vsubq_s32(x1[60], x2[51]);
  x3[60] = vaddq_s32(x1[60], x2[51]);
  x3[52] = vsubq_s32(x1[59], x2[52]);
  x3[59] = vaddq_s32(x1[59], x2[52]);
  x3[53] = vsubq_s32(x1[58], x2[53]);
  x3[58] = vaddq_s32(x1[58], x2[53]);
  x3[54] = vsubq_s32(x1[57], x2[54]);
  x3[57] = vaddq_s32(x1[57], x2[54]);
  x3[55] = vsubq_s32(x1[56], x2[55]);
  x3[56] = vaddq_s32(x1[56], x2[55]);

  // stage 4
  x4[0] = vaddq_s32(x3[0], x3[7]);
  x4[7] = vsubq_s32(x3[0], x3[7]);
  x4[1] = vaddq_s32(x3[1], x3[6]);
  x4[6] = vsubq_s32(x3[1], x3[6]);
  x4[2] = vaddq_s32(x3[2], x3[5]);
  x4[5] = vsubq_s32(x3[2], x3[5]);
  x4[3] = vaddq_s32(x3[3], x3[4]);
  x4[4] = vsubq_s32(x3[3], x3[4]);

  btf_32_neon_mode0(cospi[32], cospi[32], x3[10], x3[13], x4[10], x4[13],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[32], cospi[32], x3[11], x3[12], x4[11], x4[12],
                    *v_cos_bit);

  x4[16] = vaddq_s32(x2[16], x3[23]);
  x4[23] = vsubq_s32(x2[16], x3[23]);
  x4[17] = vaddq_s32(x2[17], x3[22]);
  x4[22] = vsubq_s32(x2[17], x3[22]);
  x4[18] = vaddq_s32(x2[18], x3[21]);
  x4[21] = vsubq_s32(x2[18], x3[21]);
  x4[19] = vaddq_s32(x2[19], x3[20]);
  x4[20] = vsubq_s32(x2[19], x3[20]);
  x4[24] = vsubq_s32(x2[31], x3[24]);
  x4[31] = vaddq_s32(x2[31], x3[24]);
  x4[25] = vsubq_s32(x2[30], x3[25]);
  x4[30] = vaddq_s32(x2[30], x3[25]);
  x4[26] = vsubq_s32(x2[29], x3[26]);
  x4[29] = vaddq_s32(x2[29], x3[26]);
  x4[27] = vsubq_s32(x2[28], x3[27]);
  x4[28] = vaddq_s32(x2[28], x3[27]);

  btf_32_neon_mode0(cospi[16], cospi[48], x3[36], x3[59], x4[36], x4[59],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x3[37], x3[58], x4[37], x4[58],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x3[38], x3[57], x4[38], x4[57],
                    *v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x3[39], x3[56], x4[39], x4[56],
                    *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[40], x3[55], x4[40], x4[55],
                     *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[41], x3[54], x4[41], x4[54],
                     *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[42], x3[53], x4[42], x4[53],
                     *v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x3[43], x3[52], x4[43], x4[52],
                     *v_cos_bit);
}

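// 64-point forward DCT on int32x4_t lanes (11 butterfly stages). Identity
// copies between stages are elided, so the final scatter reads each lane
// from whichever stage array (x6 through x10) last touched it.
// instride/outstride allow strided input and output layouts.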
static void av1_fdct64_new_neon(int32x4_t *input, int32x4_t *output,
                                int8_t cos_bit, const int instride,
                                const int outstride,
                                const int8_t *stage_range) {
  (void)stage_range;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  int startidx = 0 * instride;
  int endidx = 63 * instride;

  // stages 1-4
  int32x4_t x3[64], x4[64];
  av1_fdct64_new_stage1234_neon(input, instride, x3, x4, cospi, &v_cos_bit,
                                &startidx, &endidx);

  // stage 5
  int32x4_t x5[64];
  x5[0] = vaddq_s32(x4[0], x4[3]);
  x5[3] = vsubq_s32(x4[0], x4[3]);
  x5[1] = vaddq_s32(x4[1], x4[2]);
  x5[2] = vsubq_s32(x4[1], x4[2]);

  btf_32_neon_mode0(cospi[32], cospi[32], x4[5], x4[6], x5[5], x5[6],
                    v_cos_bit);

  x5[8] = vaddq_s32(x3[8], x4[11]);
  x5[11] = vsubq_s32(x3[8], x4[11]);
  x5[9] = vaddq_s32(x3[9], x4[10]);
  x5[10] = vsubq_s32(x3[9], x4[10]);
  x5[12] = vsubq_s32(x3[15], x4[12]);
  x5[15] = vaddq_s32(x3[15], x4[12]);
  x5[13] = vsubq_s32(x3[14], x4[13]);
  x5[14] = vaddq_s32(x3[14], x4[13]);

  btf_32_neon_mode0(cospi[16], cospi[48], x4[18], x4[29], x5[18], x5[29],
                    v_cos_bit);
  btf_32_neon_mode0(cospi[16], cospi[48], x4[19], x4[28], x5[19], x5[28],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x4[20], x4[27], x5[20], x5[27],
                     v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x4[21], x4[26], x5[21], x5[26],
                     v_cos_bit);

  x5[32] = vaddq_s32(x3[32], x4[39]);
  x5[39] = vsubq_s32(x3[32], x4[39]);
  x5[33] = vaddq_s32(x3[33], x4[38]);
  x5[38] = vsubq_s32(x3[33], x4[38]);
  x5[34] = vaddq_s32(x3[34], x4[37]);
  x5[37] = vsubq_s32(x3[34], x4[37]);
  x5[35] = vaddq_s32(x3[35], x4[36]);
  x5[36] = vsubq_s32(x3[35], x4[36]);
  x5[40] = vsubq_s32(x3[47], x4[40]);
  x5[47] = vaddq_s32(x3[47], x4[40]);
  x5[41] = vsubq_s32(x3[46], x4[41]);
  x5[46] = vaddq_s32(x3[46], x4[41]);
  x5[42] = vsubq_s32(x3[45], x4[42]);
  x5[45] = vaddq_s32(x3[45], x4[42]);
  x5[43] = vsubq_s32(x3[44], x4[43]);
  x5[44] = vaddq_s32(x3[44], x4[43]);
  x5[48] = vaddq_s32(x3[48], x4[55]);
  x5[55] = vsubq_s32(x3[48], x4[55]);
  x5[49] = vaddq_s32(x3[49], x4[54]);
  x5[54] = vsubq_s32(x3[49], x4[54]);
  x5[50] = vaddq_s32(x3[50], x4[53]);
  x5[53] = vsubq_s32(x3[50], x4[53]);
  x5[51] = vaddq_s32(x3[51], x4[52]);
  x5[52] = vsubq_s32(x3[51], x4[52]);
  x5[56] = vsubq_s32(x3[63], x4[56]);
  x5[63] = vaddq_s32(x3[63], x4[56]);
  x5[57] = vsubq_s32(x3[62], x4[57]);
  x5[62] = vaddq_s32(x3[62], x4[57]);
  x5[58] = vsubq_s32(x3[61], x4[58]);
  x5[61] = vaddq_s32(x3[61], x4[58]);
  x5[59] = vsubq_s32(x3[60], x4[59]);
  x5[60] = vaddq_s32(x3[60], x4[59]);

  // stage 6
  int32x4_t x6[64];
  btf_32_neon(cospi[32], cospi[32], x5[0], x5[1], x6[0], x6[1], v_cos_bit);
  btf_32_type1_neon(cospi[48], cospi[16], x5[2], x5[3], x6[2], x6[3],
                    v_cos_bit);
  x6[4] = vaddq_s32(x4[4], x5[5]);
  x6[5] = vsubq_s32(x4[4], x5[5]);
  x6[6] = vsubq_s32(x4[7], x5[6]);
  x6[7] = vaddq_s32(x4[7], x5[6]);
  btf_32_neon_mode0(cospi[16], cospi[48], x5[9], x5[14], x6[9], x6[14],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[48], cospi[16], x5[10], x5[13], x6[10], x6[13],
                     v_cos_bit);

  x6[16] = vaddq_s32(x4[16], x5[19]);
  x6[19] = vsubq_s32(x4[16], x5[19]);
  x6[17] = vaddq_s32(x4[17], x5[18]);
  x6[18] = vsubq_s32(x4[17], x5[18]);
  x6[20] = vsubq_s32(x4[23], x5[20]);
  x6[23] = vaddq_s32(x4[23], x5[20]);
  x6[21] = vsubq_s32(x4[22], x5[21]);
  x6[22] = vaddq_s32(x4[22], x5[21]);
  x6[24] = vaddq_s32(x4[24], x5[27]);
  x6[27] = vsubq_s32(x4[24], x5[27]);
  x6[25] = vaddq_s32(x4[25], x5[26]);
  x6[26] = vsubq_s32(x4[25], x5[26]);
  x6[28] = vsubq_s32(x4[31], x5[28]);
  x6[31] = vaddq_s32(x4[31], x5[28]);
  x6[29] = vsubq_s32(x4[30], x5[29]);
  x6[30] = vaddq_s32(x4[30], x5[29]);

  btf_32_neon_mode0(cospi[8], cospi[56], x5[34], x5[61], x6[34], x6[61],
                    v_cos_bit);
  btf_32_neon_mode0(cospi[8], cospi[56], x5[35], x5[60], x6[35], x6[60],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], x5[36], x5[59], x6[36], x6[59],
                     v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], x5[37], x5[58], x6[37], x6[58],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[40], cospi[24], x5[42], x5[53], x6[42], x6[53],
                    v_cos_bit);
  btf_32_neon_mode0(cospi[40], cospi[24], x5[43], x5[52], x6[43], x6[52],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], x5[44], x5[51], x6[44], x6[51],
                     v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], x5[45], x5[50], x6[45], x6[50],
                     v_cos_bit);

  // stage 7
  int32x4_t x7[64];

  btf_32_type1_neon(cospi[56], cospi[8], x6[4], x6[7], x7[4], x7[7],
                    v_cos_bit);
  btf_32_type1_neon(cospi[24], cospi[40], x6[5], x6[6], x7[5], x7[6],
                    v_cos_bit);
  x7[8] = vaddq_s32(x5[8], x6[9]);
  x7[9] = vsubq_s32(x5[8], x6[9]);
  x7[10] = vsubq_s32(x5[11], x6[10]);
  x7[11] = vaddq_s32(x5[11], x6[10]);
  x7[12] = vaddq_s32(x5[12], x6[13]);
  x7[13] = vsubq_s32(x5[12], x6[13]);
  x7[14] = vsubq_s32(x5[15], x6[14]);
  x7[15] = vaddq_s32(x5[15], x6[14]);

  btf_32_neon_mode0(cospi[8], cospi[56], x6[17], x6[30], x7[17], x7[30],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[56], cospi[8], x6[18], x6[29], x7[18], x7[29],
                     v_cos_bit);

  btf_32_neon_mode0(cospi[40], cospi[24], x6[21], x6[26], x7[21], x7[26],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[24], cospi[40], x6[22], x6[25], x7[22], x7[25],
                     v_cos_bit);

  x7[32] = vaddq_s32(x5[32], x6[35]);
  x7[35] = vsubq_s32(x5[32], x6[35]);
  x7[33] = vaddq_s32(x5[33], x6[34]);
  x7[34] = vsubq_s32(x5[33], x6[34]);
  x7[36] = vsubq_s32(x5[39], x6[36]);
  x7[39] = vaddq_s32(x5[39], x6[36]);
  x7[37] = vsubq_s32(x5[38], x6[37]);
  x7[38] = vaddq_s32(x5[38], x6[37]);
  x7[40] = vaddq_s32(x5[40], x6[43]);
  x7[43] = vsubq_s32(x5[40], x6[43]);
  x7[41] = vaddq_s32(x5[41], x6[42]);
  x7[42] = vsubq_s32(x5[41], x6[42]);
  x7[44] = vsubq_s32(x5[47], x6[44]);
  x7[47] = vaddq_s32(x5[47], x6[44]);
  x7[45] = vsubq_s32(x5[46], x6[45]);
  x7[46] = vaddq_s32(x5[46], x6[45]);
  x7[48] = vaddq_s32(x5[48], x6[51]);
  x7[51] = vsubq_s32(x5[48], x6[51]);
  x7[49] = vaddq_s32(x5[49], x6[50]);
  x7[50] = vsubq_s32(x5[49], x6[50]);
  x7[52] = vsubq_s32(x5[55], x6[52]);
  x7[55] = vaddq_s32(x5[55], x6[52]);
  x7[53] = vsubq_s32(x5[54], x6[53]);
  x7[54] = vaddq_s32(x5[54], x6[53]);
  x7[56] = vaddq_s32(x5[56], x6[59]);
  x7[59] = vsubq_s32(x5[56], x6[59]);
  x7[57] = vaddq_s32(x5[57], x6[58]);
  x7[58] = vsubq_s32(x5[57], x6[58]);
  x7[60] = vsubq_s32(x5[63], x6[60]);
  x7[63] = vaddq_s32(x5[63], x6[60]);
  x7[61] = vsubq_s32(x5[62], x6[61]);
  x7[62] = vaddq_s32(x5[62], x6[61]);

  // stage 8
  int32x4_t x8[64];

  btf_32_type1_neon(cospi[60], cospi[4], x7[8], x7[15], x8[8], x8[15],
                    v_cos_bit);
  btf_32_type1_neon(cospi[28], cospi[36], x7[9], x7[14], x8[9], x8[14],
                    v_cos_bit);
  btf_32_type1_neon(cospi[44], cospi[20], x7[10], x7[13], x8[10], x8[13],
                    v_cos_bit);
  btf_32_type1_neon(cospi[12], cospi[52], x7[11], x7[12], x8[11], x8[12],
                    v_cos_bit);
  x8[16] = vaddq_s32(x6[16], x7[17]);
  x8[17] = vsubq_s32(x6[16], x7[17]);
  x8[18] = vsubq_s32(x6[19], x7[18]);
  x8[19] = vaddq_s32(x6[19], x7[18]);
  x8[20] = vaddq_s32(x6[20], x7[21]);
  x8[21] = vsubq_s32(x6[20], x7[21]);
  x8[22] = vsubq_s32(x6[23], x7[22]);
  x8[23] = vaddq_s32(x6[23], x7[22]);
  x8[24] = vaddq_s32(x6[24], x7[25]);
  x8[25] = vsubq_s32(x6[24], x7[25]);
  x8[26] = vsubq_s32(x6[27], x7[26]);
  x8[27] = vaddq_s32(x6[27], x7[26]);
  x8[28] = vaddq_s32(x6[28], x7[29]);
  x8[29] = vsubq_s32(x6[28], x7[29]);
  x8[30] = vsubq_s32(x6[31], x7[30]);
  x8[31] = vaddq_s32(x6[31], x7[30]);

  btf_32_neon_mode0(cospi[4], cospi[60], x7[33], x7[62], x8[33], x8[62],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[60], cospi[4], x7[34], x7[61], x8[34], x8[61],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[36], cospi[28], x7[37], x7[58], x8[37], x8[58],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[28], cospi[36], x7[38], x7[57], x8[38], x8[57],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[20], cospi[44], x7[41], x7[54], x8[41], x8[54],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[44], cospi[20], x7[42], x7[53], x8[42], x8[53],
                     v_cos_bit);
  btf_32_neon_mode0(cospi[52], cospi[12], x7[45], x7[50], x8[45], x8[50],
                    v_cos_bit);
  btf_32_neon_mode01(cospi[12], cospi[52], x7[46], x7[49], x8[46], x8[49],
                     v_cos_bit);

  // stage 9
  int32x4_t x9[64];

  btf_32_type1_neon(cospi[62], cospi[2], x8[16], x8[31], x9[16], x9[31],
                    v_cos_bit);
  btf_32_type1_neon(cospi[30], cospi[34], x8[17], x8[30], x9[17], x9[30],
                    v_cos_bit);
  btf_32_type1_neon(cospi[46], cospi[18], x8[18], x8[29], x9[18], x9[29],
                    v_cos_bit);
  btf_32_type1_neon(cospi[14], cospi[50], x8[19], x8[28], x9[19], x9[28],
                    v_cos_bit);
  btf_32_type1_neon(cospi[54], cospi[10], x8[20], x8[27], x9[20], x9[27],
                    v_cos_bit);
  btf_32_type1_neon(cospi[22], cospi[42], x8[21], x8[26], x9[21], x9[26],
                    v_cos_bit);
  btf_32_type1_neon(cospi[38], cospi[26], x8[22], x8[25], x9[22], x9[25],
                    v_cos_bit);
  btf_32_type1_neon(cospi[6], cospi[58], x8[23], x8[24], x9[23], x9[24],
                    v_cos_bit);
  x9[32] = vaddq_s32(x7[32], x8[33]);
  x9[33] = vsubq_s32(x7[32], x8[33]);
  x9[34] = vsubq_s32(x7[35], x8[34]);
  x9[35] = vaddq_s32(x7[35], x8[34]);
  x9[36] = vaddq_s32(x7[36], x8[37]);
  x9[37] = vsubq_s32(x7[36], x8[37]);
  x9[38] = vsubq_s32(x7[39], x8[38]);
  x9[39] = vaddq_s32(x7[39], x8[38]);
  x9[40] = vaddq_s32(x7[40], x8[41]);
  x9[41] = vsubq_s32(x7[40], x8[41]);
  x9[42] = vsubq_s32(x7[43], x8[42]);
  x9[43] = vaddq_s32(x7[43], x8[42]);
  x9[44] = vaddq_s32(x7[44], x8[45]);
  x9[45] = vsubq_s32(x7[44], x8[45]);
  x9[46] = vsubq_s32(x7[47], x8[46]);
  x9[47] = vaddq_s32(x7[47], x8[46]);
  x9[48] = vaddq_s32(x7[48], x8[49]);
  x9[49] = vsubq_s32(x7[48], x8[49]);
  x9[50] = vsubq_s32(x7[51], x8[50]);
  x9[51] = vaddq_s32(x7[51], x8[50]);
  x9[52] = vaddq_s32(x7[52], x8[53]);
  x9[53] = vsubq_s32(x7[52], x8[53]);
  x9[54] = vsubq_s32(x7[55], x8[54]);
  x9[55] = vaddq_s32(x7[55], x8[54]);
  x9[56] = vaddq_s32(x7[56], x8[57]);
  x9[57] = vsubq_s32(x7[56], x8[57]);
  x9[58] = vsubq_s32(x7[59], x8[58]);
  x9[59] = vaddq_s32(x7[59], x8[58]);
  x9[60] = vaddq_s32(x7[60], x8[61]);
  x9[61] = vsubq_s32(x7[60], x8[61]);
  x9[62] = vsubq_s32(x7[63], x8[62]);
  x9[63] = vaddq_s32(x7[63], x8[62]);

  // stage 10
  int32x4_t x10[64];

  btf_32_type1_neon(cospi[63], cospi[1], x9[32], x9[63], x10[32], x10[63],
                    v_cos_bit);
  btf_32_type1_neon(cospi[31], cospi[33], x9[33], x9[62], x10[33], x10[62],
                    v_cos_bit);
  btf_32_type1_neon(cospi[47], cospi[17], x9[34], x9[61], x10[34], x10[61],
                    v_cos_bit);
  btf_32_type1_neon(cospi[15], cospi[49], x9[35], x9[60], x10[35], x10[60],
                    v_cos_bit);
  btf_32_type1_neon(cospi[55], cospi[9], x9[36], x9[59], x10[36], x10[59],
                    v_cos_bit);
  btf_32_type1_neon(cospi[23], cospi[41], x9[37], x9[58], x10[37], x10[58],
                    v_cos_bit);
  btf_32_type1_neon(cospi[39], cospi[25], x9[38], x9[57], x10[38], x10[57],
                    v_cos_bit);
  btf_32_type1_neon(cospi[7], cospi[57], x9[39], x9[56], x10[39], x10[56],
                    v_cos_bit);
  btf_32_type1_neon(cospi[59], cospi[5], x9[40], x9[55], x10[40], x10[55],
                    v_cos_bit);
  btf_32_type1_neon(cospi[27], cospi[37], x9[41], x9[54], x10[41], x10[54],
                    v_cos_bit);
  btf_32_type1_neon(cospi[43], cospi[21], x9[42], x9[53], x10[42], x10[53],
                    v_cos_bit);
  btf_32_type1_neon(cospi[11], cospi[53], x9[43], x9[52], x10[43], x10[52],
                    v_cos_bit);
  btf_32_type1_neon(cospi[51], cospi[13], x9[44], x9[51], x10[44], x10[51],
                    v_cos_bit);
  btf_32_type1_neon(cospi[19], cospi[45], x9[45], x9[50], x10[45], x10[50],
                    v_cos_bit);
  btf_32_type1_neon(cospi[35], cospi[29], x9[46], x9[49], x10[46], x10[49],
                    v_cos_bit);
  btf_32_type1_neon(cospi[3], cospi[61], x9[47], x9[48], x10[47], x10[48],
                    v_cos_bit);

  startidx = 0 * outstride;
  endidx = 63 * outstride;
  // stage 11
  output[startidx] = x6[0];
  output[endidx] = x10[63];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[32];
  output[endidx] = x9[31];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[16];
  output[endidx] = x10[47];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[48];
  output[endidx] = x8[15];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[8];
  output[endidx] = x10[55];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[40];
  output[endidx] = x9[23];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[24];
  output[endidx] = x10[39];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[56];
  output[endidx] = x7[7];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x7[4];
  output[endidx] = x10[59];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[36];
  output[endidx] = x9[27];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[20];
  output[endidx] = x10[43];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[52];
  output[endidx] = x8[11];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[12];
  output[endidx] = x10[51];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[44];
  output[endidx] = x9[19];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[28];
  output[endidx] = x10[35];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[60];
  output[endidx] = x6[3];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x6[2];
  output[endidx] = x10[61];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[34];
  output[endidx] = x9[29];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[18];
  output[endidx] = x10[45];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[50];
  output[endidx] = x8[13];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[10];
  output[endidx] = x10[53];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[42];
  output[endidx] = x9[21];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[26];
  output[endidx] = x10[37];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[58];
  output[endidx] = x7[5];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x7[6];
  output[endidx] = x10[57];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[38];
  output[endidx] = x9[25];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[22];
  output[endidx] = x10[41];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[54];
  output[endidx] = x8[9];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x8[14];
  output[endidx] = x10[49];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[46];
  output[endidx] = x9[17];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x9[30];
  output[endidx] = x10[33];
  startidx += outstride;
  endidx -= outstride;
  output[startidx] = x10[62];
  output[endidx] = x6[1];
}

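// 64x64 forward transform (DCT_DCT only). The column pass uses the 16-bit
// 8-lane fdct64 and keeps only the top 32 rows; each kept 8-row strip is
// then widened to 32 bits in two 4-row halves (bufA/bufB), run through the
// int32 fdct64, and truncated to its 32 lowest frequencies, so the stored
// result is the top-left 32x32 corner.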
static void av1_lowbd_fwd_txfm2d_64x64_neon(const int16_t *input,
                                            int32_t *output, int stride,
                                            TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X64;
  int16x8_t buf0[64], buf1[512];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }
  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int32x4_t bufA[64];
    int32x4_t bufB[64];
    int16x8_t *buf = buf1 + width * i;
    for (int j = 0; j < width; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
    av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
    av1_round_shift_array_32_neon(bufA, bufA, 32);
    av1_round_shift_array_32_neon(bufB, bufB, 32);

    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < width_div8; ++j) {
      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
    }
  }
}
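
// 64x32 forward transform (DCT_DCT only): a 32-point column transform, then
// an int32 64-point row transform truncated to its 32 lowest frequencies,
// with the rectangular rounding (av1_round_shift_rect_array_32_neon)
// applied before the store.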
static void av1_lowbd_fwd_txfm2d_64x32_neon(const int16_t *input,
                                            int32_t *output, int stride,
                                            TX_TYPE tx_type, int bd) {
4276 (void)bd;
4277 const TX_SIZE tx_size = TX_64X32;
4278 int16x8_t buf0[64], buf1[256];
4279 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
4280 const int txw_idx = get_txw_idx(tx_size);
4281 const int txh_idx = get_txh_idx(tx_size);
4282 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
4283 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
4284 const int width = tx_size_wide[tx_size];
4285 const int height = tx_size_high[tx_size];
4286 const transform_1d_lbd_neon col_txfm = col_txfm8x32_arr[tx_type];
4287 const int width_div8 = (width >> 3);
4288 const int height_div8 = (height >> 3);
4289
4290 for (int i = 0; i < width_div8; i++) {
4291 load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
4292 round_shift_16bit(buf0, height, shift[0]);
4293 col_txfm(buf0, buf0, cos_bit_col, NULL);
4294 round_shift_16bit(buf0, height, shift[1]);
4295 for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
4296 transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
4297 }
4298 }
4299 assert(tx_type == DCT_DCT);
4300 for (int i = 0; i < AOMMIN(4, height_div8); i++) {
4301 int32x4_t bufA[64];
4302 int32x4_t bufB[64];
4303 int16x8_t *buf = buf1 + width * i;
4304 for (int j = 0; j < width; ++j) {
4305 bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
4306 bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
4307 }
4308 av1_fdct64_new_neon(bufA, bufA, cos_bit_row, 1, 1, NULL);
4309 av1_fdct64_new_neon(bufB, bufB, cos_bit_row, 1, 1, NULL);
4310 av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
4311 av1_round_shift_rect_array_32_neon(bufB, bufB, 32);
4312
4313 int32_t *output8 = output + 8 * 32 * i;
4314 for (int j = 0; j < width_div8; ++j) {
4315 int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
4316 transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
4317 }
4318 }
4319 }
4320
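
// Lowbd 2D forward transform for 32x64 blocks: the mirror image of the
// 64x32 kernel. The 64-point column DCT is zeroed out beyond its lowest
// 32 outputs, the 32-point row DCT runs at full width, and the
// rectangular round-shift again supplies the sqrt(2)-based scaling for
// the 2:1 aspect ratio.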
static void av1_lowbd_fwd_txfm2d_32x64_neon(const int16_t *input,
                                            int32_t *output, int stride,
                                            TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_32X64;
  int16x8_t buf0[64], buf1[256];
  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_lbd_neon col_txfm = av1_fdct8x64_neon;
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col, NULL);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    int32x4_t bufA[32];
    int32x4_t bufB[32];
    int16x8_t *buf = buf1 + width * i;
    for (int j = 0; j < width; ++j) {
      bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
      bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
    }
    av1_fdct32_new_neon(bufA, bufA, cos_bit_row, 1, NULL);
    av1_fdct32_new_neon(bufB, bufB, cos_bit_row, 1, NULL);
    av1_round_shift_rect_array_32_neon(bufA, bufA, 32);
    av1_round_shift_rect_array_32_neon(bufB, bufB, 32);

    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < (32 / 4); ++j) {
      int32x4_t *out = (int32x4_t *)(output8 + 4 * j);
      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
    }
  }
}
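
// Per-size kernel table, indexed by TX_SIZE in TX_SIZES_ALL order.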
static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
  av1_lowbd_fwd_txfm2d_4x4_neon,    // 4x4 transform
  av1_lowbd_fwd_txfm2d_8x8_neon,    // 8x8 transform
  av1_lowbd_fwd_txfm2d_16x16_neon,  // 16x16 transform
  av1_lowbd_fwd_txfm2d_32x32_neon,  // 32x32 transform
  av1_lowbd_fwd_txfm2d_64x64_neon,  // 64x64 transform
  av1_lowbd_fwd_txfm2d_4x8_neon,    // 4x8 transform
  av1_lowbd_fwd_txfm2d_8x4_neon,    // 8x4 transform
  av1_lowbd_fwd_txfm2d_8x16_neon,   // 8x16 transform
  av1_lowbd_fwd_txfm2d_16x8_neon,   // 16x8 transform
  av1_lowbd_fwd_txfm2d_16x32_neon,  // 16x32 transform
  av1_lowbd_fwd_txfm2d_32x16_neon,  // 32x16 transform
  av1_lowbd_fwd_txfm2d_32x64_neon,  // 32x64 transform
  av1_lowbd_fwd_txfm2d_64x32_neon,  // 64x32 transform
  av1_lowbd_fwd_txfm2d_4x16_neon,   // 4x16 transform
  av1_lowbd_fwd_txfm2d_16x4_neon,   // 16x4 transform
  av1_lowbd_fwd_txfm2d_8x32_neon,   // 8x32 transform
  av1_lowbd_fwd_txfm2d_32x8_neon,   // 32x8 transform
  av1_lowbd_fwd_txfm2d_16x64_neon,  // 16x64 transform
  av1_lowbd_fwd_txfm2d_64x16_neon,  // 64x16 transform
};
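
// Entry point for the lowbd forward transform. Dispatches on tx_size and
// falls back to the C implementation when no NEON kernel is registered,
// or for lossless 4x4 blocks, which AV1 codes with a Walsh-Hadamard
// transform rather than the DCT/ADST family implemented here.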
void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
  if ((fwd_txfm2d_func == NULL) ||
      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
  } else {
    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
                    txfm_param->bd);
  }
}