/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"

// Most gcc 4.9 distributions outside of Android do not generate correct code
// for these functions.
#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
    __GNUC__ == 4 && __GNUC_MINOR__ <= 9

void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  vpx_fdct32x32_c(input, output, stride);
}

void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  vpx_fdct32x32_rd_c(input, output, stride);
}

#else

#define LOAD_INCREMENT(src, stride, dest, index) \
  do {                                           \
    dest[index] = vld1q_s16(src);                \
    src += stride;                               \
  } while (0)

#define ADD_S16(src, index0, index1, dest, index3)      \
  do {                                                  \
    dest[index3] = vaddq_s16(src[index0], src[index1]); \
  } while (0)

#define ADD_SHIFT_S16(src, index0, index1)                             \
  do {                                                                 \
    src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
  } while (0)

// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
  const int16_t *a_end = a + 24 * stride;
  int16x8_t c[8];

  LOAD_INCREMENT(a, stride, b, 0);
  LOAD_INCREMENT(a, stride, b, 1);
  LOAD_INCREMENT(a, stride, b, 2);
  LOAD_INCREMENT(a, stride, b, 3);
  LOAD_INCREMENT(a, stride, b, 4);
  LOAD_INCREMENT(a, stride, b, 5);
  LOAD_INCREMENT(a, stride, b, 6);
  LOAD_INCREMENT(a, stride, b, 7);

  LOAD_INCREMENT(a_end, stride, b, 24);
  LOAD_INCREMENT(a_end, stride, b, 25);
  LOAD_INCREMENT(a_end, stride, b, 26);
  LOAD_INCREMENT(a_end, stride, b, 27);
  LOAD_INCREMENT(a_end, stride, b, 28);
  LOAD_INCREMENT(a_end, stride, b, 29);
  LOAD_INCREMENT(a_end, stride, b, 30);
  LOAD_INCREMENT(a_end, stride, b, 31);

  ADD_S16(b, 0, 31, c, 0);
  ADD_S16(b, 1, 30, c, 1);
  ADD_S16(b, 2, 29, c, 2);
  ADD_S16(b, 3, 28, c, 3);
  ADD_S16(b, 4, 27, c, 4);
  ADD_S16(b, 5, 26, c, 5);
  ADD_S16(b, 6, 25, c, 6);
  ADD_S16(b, 7, 24, c, 7);

  ADD_SHIFT_S16(b, 7, 24);
  ADD_SHIFT_S16(b, 6, 25);
  ADD_SHIFT_S16(b, 5, 26);
  ADD_SHIFT_S16(b, 4, 27);
  ADD_SHIFT_S16(b, 3, 28);
  ADD_SHIFT_S16(b, 2, 29);
  ADD_SHIFT_S16(b, 1, 30);
  ADD_SHIFT_S16(b, 0, 31);

  b[0] = vshlq_n_s16(c[0], 2);
  b[1] = vshlq_n_s16(c[1], 2);
  b[2] = vshlq_n_s16(c[2], 2);
  b[3] = vshlq_n_s16(c[3], 2);
  b[4] = vshlq_n_s16(c[4], 2);
  b[5] = vshlq_n_s16(c[5], 2);
  b[6] = vshlq_n_s16(c[6], 2);
  b[7] = vshlq_n_s16(c[7], 2);

  LOAD_INCREMENT(a, stride, b, 8);
  LOAD_INCREMENT(a, stride, b, 9);
  LOAD_INCREMENT(a, stride, b, 10);
  LOAD_INCREMENT(a, stride, b, 11);
  LOAD_INCREMENT(a, stride, b, 12);
  LOAD_INCREMENT(a, stride, b, 13);
  LOAD_INCREMENT(a, stride, b, 14);
  LOAD_INCREMENT(a, stride, b, 15);
  LOAD_INCREMENT(a, stride, b, 16);
  LOAD_INCREMENT(a, stride, b, 17);
  LOAD_INCREMENT(a, stride, b, 18);
  LOAD_INCREMENT(a, stride, b, 19);
  LOAD_INCREMENT(a, stride, b, 20);
  LOAD_INCREMENT(a, stride, b, 21);
  LOAD_INCREMENT(a, stride, b, 22);
  LOAD_INCREMENT(a, stride, b, 23);

  ADD_S16(b, 8, 23, c, 0);
  ADD_S16(b, 9, 22, c, 1);
  ADD_S16(b, 10, 21, c, 2);
  ADD_S16(b, 11, 20, c, 3);
  ADD_S16(b, 12, 19, c, 4);
  ADD_S16(b, 13, 18, c, 5);
  ADD_S16(b, 14, 17, c, 6);
  ADD_S16(b, 15, 16, c, 7);

  ADD_SHIFT_S16(b, 15, 16);
  ADD_SHIFT_S16(b, 14, 17);
  ADD_SHIFT_S16(b, 13, 18);
  ADD_SHIFT_S16(b, 12, 19);
  ADD_SHIFT_S16(b, 11, 20);
  ADD_SHIFT_S16(b, 10, 21);
  ADD_SHIFT_S16(b, 9, 22);
  ADD_SHIFT_S16(b, 8, 23);

  b[8] = vshlq_n_s16(c[0], 2);
  b[9] = vshlq_n_s16(c[1], 2);
  b[10] = vshlq_n_s16(c[2], 2);
  b[11] = vshlq_n_s16(c[3], 2);
  b[12] = vshlq_n_s16(c[4], 2);
  b[13] = vshlq_n_s16(c[5], 2);
  b[14] = vshlq_n_s16(c[6], 2);
  b[15] = vshlq_n_s16(c[7], 2);
}

#undef LOAD_INCREMENT
#undef ADD_S16
#undef ADD_SHIFT_S16

#define STORE_S16(src, index, dest)           \
  do {                                        \
    store_s16q_to_tran_low(dest, src[index]); \
    dest += 8;                                \
  } while (0)

// Store 32 16x8 values, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8.
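// Each output row is gathered from the same-numbered vector in each group of
// eight, i.e. row r is written as b[r], b[r + 8], b[r + 16], b[r + 24].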
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  STORE_S16(b, 0, a);
  STORE_S16(b, 8, a);
  STORE_S16(b, 16, a);
  STORE_S16(b, 24, a);
  STORE_S16(b, 1, a);
  STORE_S16(b, 9, a);
  STORE_S16(b, 17, a);
  STORE_S16(b, 25, a);
  STORE_S16(b, 2, a);
  STORE_S16(b, 10, a);
  STORE_S16(b, 18, a);
  STORE_S16(b, 26, a);
  STORE_S16(b, 3, a);
  STORE_S16(b, 11, a);
  STORE_S16(b, 19, a);
  STORE_S16(b, 27, a);
  STORE_S16(b, 4, a);
  STORE_S16(b, 12, a);
  STORE_S16(b, 20, a);
  STORE_S16(b, 28, a);
  STORE_S16(b, 5, a);
  STORE_S16(b, 13, a);
  STORE_S16(b, 21, a);
  STORE_S16(b, 29, a);
  STORE_S16(b, 6, a);
  STORE_S16(b, 14, a);
  STORE_S16(b, 22, a);
  STORE_S16(b, 30, a);
  STORE_S16(b, 7, a);
  STORE_S16(b, 15, a);
  STORE_S16(b, 23, a);
  STORE_S16(b, 31, a);
}

#undef STORE_S16

// fdct_round_shift((a +/- b) * c)
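// As a scalar sketch (illustrative only): with fdct_round_shift(x) defined
// via ROUND_POWER_OF_TWO(x, DCT_CONST_BITS) in vpx_dsp/txfm_common.h, this
// computes
//   *add = fdct_round_shift((a + b) * constant);
//   *sub = fdct_round_shift((a - b) * constant);
// vqrshrn_n_s32 rounds and saturates while narrowing back to int16_t.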
static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
                                       const tran_high_t constant,
                                       int16x8_t *add, int16x8_t *sub) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
  *add = vcombine_s16(rounded0, rounded1);
  *sub = vcombine_s16(rounded2, rounded3);
}

// *add = fdct_round_shift(a * c1 + b * c0)
// *sub = fdct_round_shift(a * c0 - b * c1)
// where c0 = constant0 and c1 = constant1.
static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
                                       const tran_coef_t constant0,
                                       const tran_coef_t constant1,
                                       int16x8_t *add, int16x8_t *sub) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
  *add = vcombine_s16(rounded0, rounded1);
  *sub = vcombine_s16(rounded2, rounded3);
}

// Add 2 if positive, 1 if negative, and shift by 2.
// In practice, subtract the sign bit, then shift with rounding.
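// For example: sub_round_shift(5) = (5 - 0 + 2) >> 2 = 1 and
// sub_round_shift(-5) = (-5 - 1 + 2) >> 2 = -1, matching (a + 2) >> 2 for
// positive a and (a + 1) >> 2 for negative a.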
static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
  return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
}

static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  a[0] = vaddq_s16(in[0], in[15]);
  a[1] = vaddq_s16(in[1], in[14]);
  a[2] = vaddq_s16(in[2], in[13]);
  a[3] = vaddq_s16(in[3], in[12]);
  a[4] = vaddq_s16(in[4], in[11]);
  a[5] = vaddq_s16(in[5], in[10]);
  a[6] = vaddq_s16(in[6], in[9]);
  a[7] = vaddq_s16(in[7], in[8]);

  a[8] = vsubq_s16(in[7], in[8]);
  a[9] = vsubq_s16(in[6], in[9]);
  a[10] = vsubq_s16(in[5], in[10]);
  a[11] = vsubq_s16(in[4], in[11]);
  a[12] = vsubq_s16(in[3], in[12]);
  a[13] = vsubq_s16(in[2], in[13]);
  a[14] = vsubq_s16(in[1], in[14]);
  a[15] = vsubq_s16(in[0], in[15]);

  a[16] = in[16];
  a[17] = in[17];
  a[18] = in[18];
  a[19] = in[19];

  butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
  butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
  butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
  butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);

  a[28] = in[28];
  a[29] = in[29];
  a[30] = in[30];
  a[31] = in[31];

  // Stage 3.
  b[0] = vaddq_s16(a[0], a[7]);
  b[1] = vaddq_s16(a[1], a[6]);
  b[2] = vaddq_s16(a[2], a[5]);
  b[3] = vaddq_s16(a[3], a[4]);

  b[4] = vsubq_s16(a[3], a[4]);
  b[5] = vsubq_s16(a[2], a[5]);
  b[6] = vsubq_s16(a[1], a[6]);
  b[7] = vsubq_s16(a[0], a[7]);

  b[8] = a[8];
  b[9] = a[9];

  butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
  butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);

  b[14] = a[14];
  b[15] = a[15];

  b[16] = vaddq_s16(in[16], a[23]);
  b[17] = vaddq_s16(in[17], a[22]);
  b[18] = vaddq_s16(in[18], a[21]);
  b[19] = vaddq_s16(in[19], a[20]);

  b[20] = vsubq_s16(in[19], a[20]);
  b[21] = vsubq_s16(in[18], a[21]);
  b[22] = vsubq_s16(in[17], a[22]);
  b[23] = vsubq_s16(in[16], a[23]);

  b[24] = vsubq_s16(in[31], a[24]);
  b[25] = vsubq_s16(in[30], a[25]);
  b[26] = vsubq_s16(in[29], a[26]);
  b[27] = vsubq_s16(in[28], a[27]);

  b[28] = vaddq_s16(in[28], a[27]);
  b[29] = vaddq_s16(in[29], a[26]);
  b[30] = vaddq_s16(in[30], a[25]);
  b[31] = vaddq_s16(in[31], a[24]);

  // Stage 4.
  a[0] = vaddq_s16(b[0], b[3]);
  a[1] = vaddq_s16(b[1], b[2]);
  a[2] = vsubq_s16(b[1], b[2]);
  a[3] = vsubq_s16(b[0], b[3]);

  a[4] = b[4];

  butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);

  a[7] = b[7];

  a[8] = vaddq_s16(b[8], b[11]);
  a[9] = vaddq_s16(b[9], b[10]);
  a[10] = vsubq_s16(b[9], b[10]);
  a[11] = vsubq_s16(b[8], b[11]);
  a[12] = vsubq_s16(b[15], b[12]);
  a[13] = vsubq_s16(b[14], b[13]);
  a[14] = vaddq_s16(b[14], b[13]);
  a[15] = vaddq_s16(b[15], b[12]);

  a[16] = b[16];
  a[17] = b[17];

  butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
  butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
  butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
  butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);

  a[22] = b[22];
  a[23] = b[23];
  a[24] = b[24];
  a[25] = b[25];

  a[30] = b[30];
  a[31] = b[31];

  // Stage 5.
  butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
  butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);

  b[4] = vaddq_s16(a[4], a[5]);
  b[5] = vsubq_s16(a[4], a[5]);
  b[6] = vsubq_s16(a[7], a[6]);
  b[7] = vaddq_s16(a[7], a[6]);

  b[8] = a[8];

  butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
  butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);

  b[11] = a[11];
  b[12] = a[12];

  b[15] = a[15];

  b[16] = vaddq_s16(a[19], a[16]);
  b[17] = vaddq_s16(a[18], a[17]);
  b[18] = vsubq_s16(a[17], a[18]);
  b[19] = vsubq_s16(a[16], a[19]);
  b[20] = vsubq_s16(a[23], a[20]);
  b[21] = vsubq_s16(a[22], a[21]);
  b[22] = vaddq_s16(a[21], a[22]);
  b[23] = vaddq_s16(a[20], a[23]);
  b[24] = vaddq_s16(a[27], a[24]);
  b[25] = vaddq_s16(a[26], a[25]);
  b[26] = vsubq_s16(a[25], a[26]);
  b[27] = vsubq_s16(a[24], a[27]);
  b[28] = vsubq_s16(a[31], a[28]);
  b[29] = vsubq_s16(a[30], a[29]);
  b[30] = vaddq_s16(a[29], a[30]);
  b[31] = vaddq_s16(a[28], a[31]);

  // Stage 6.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];

  butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
  butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);

  a[8] = vaddq_s16(b[8], b[9]);
  a[9] = vsubq_s16(b[8], b[9]);
  a[10] = vsubq_s16(b[11], b[10]);
  a[11] = vaddq_s16(b[11], b[10]);
  a[12] = vaddq_s16(b[12], b[13]);
  a[13] = vsubq_s16(b[12], b[13]);
  a[14] = vsubq_s16(b[15], b[14]);
  a[15] = vaddq_s16(b[15], b[14]);

  a[16] = b[16];
  a[19] = b[19];
  a[20] = b[20];
  a[23] = b[23];
  a[24] = b[24];
  a[27] = b[27];
  a[28] = b[28];
  a[31] = b[31];

  butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
  butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);

  butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
  butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);

  // Stage 7.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  b[4] = a[4];
  b[5] = a[5];
  b[6] = a[6];
  b[7] = a[7];

  butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
  butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
  butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
  butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);

  b[16] = vaddq_s16(a[16], a[17]);
  b[17] = vsubq_s16(a[16], a[17]);
  b[18] = vsubq_s16(a[19], a[18]);
  b[19] = vaddq_s16(a[19], a[18]);
  b[20] = vaddq_s16(a[20], a[21]);
  b[21] = vsubq_s16(a[20], a[21]);
  b[22] = vsubq_s16(a[23], a[22]);
  b[23] = vaddq_s16(a[23], a[22]);
  b[24] = vaddq_s16(a[24], a[25]);
  b[25] = vsubq_s16(a[24], a[25]);
  b[26] = vsubq_s16(a[27], a[26]);
  b[27] = vaddq_s16(a[27], a[26]);
  b[28] = vaddq_s16(a[28], a[29]);
  b[29] = vsubq_s16(a[28], a[29]);
  b[30] = vsubq_s16(a[31], a[30]);
  b[31] = vaddq_s16(a[31], a[30]);

  // Final stage.
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  out[0] = sub_round_shift(b[0]);
  out[16] = sub_round_shift(b[1]);
  out[8] = sub_round_shift(b[2]);
  out[24] = sub_round_shift(b[3]);
  out[4] = sub_round_shift(b[4]);
  out[20] = sub_round_shift(b[5]);
  out[12] = sub_round_shift(b[6]);
  out[28] = sub_round_shift(b[7]);
  out[2] = sub_round_shift(b[8]);
  out[18] = sub_round_shift(b[9]);
  out[10] = sub_round_shift(b[10]);
  out[26] = sub_round_shift(b[11]);
  out[6] = sub_round_shift(b[12]);
  out[22] = sub_round_shift(b[13]);
  out[14] = sub_round_shift(b[14]);
  out[30] = sub_round_shift(b[15]);

  butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
  out[1] = sub_round_shift(a[1]);
  out[31] = sub_round_shift(a[31]);

  butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
  out[17] = sub_round_shift(a[17]);
  out[15] = sub_round_shift(a[15]);

  butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
  out[9] = sub_round_shift(a[9]);
  out[23] = sub_round_shift(a[23]);

  butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
  out[25] = sub_round_shift(a[25]);
  out[7] = sub_round_shift(a[7]);

  butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
  out[5] = sub_round_shift(a[5]);
  out[27] = sub_round_shift(a[27]);

  butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
  out[21] = sub_round_shift(a[21]);
  out[11] = sub_round_shift(a[11]);

  butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
  out[13] = sub_round_shift(a[13]);
  out[19] = sub_round_shift(a[19]);

  butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
  out[29] = sub_round_shift(a[29]);
  out[3] = sub_round_shift(a[3]);
}

#define PASS_THROUGH(src, dst, element)    \
  do {                                     \
    dst##_lo[element] = src##_lo[element]; \
    dst##_hi[element] = src##_hi[element]; \
  } while (0)

#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
  do {                                                                       \
    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
  } while (0)

#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
  do {                                                                     \
    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
  } while (0)

#define ADD_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define SUB_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

// Like butterfly_one_coeff, but don't narrow results.
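// Keeping the full 32-bit results matters here: as noted at stage 3 of
// dct_body_second_pass() below, the intermediate sums can overflow int16_t
// for extreme inputs.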
static INLINE void butterfly_one_coeff_s16_s32(
    const int16x8_t a, const int16x8_t b, const tran_high_t constant,
    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
    int32x4_t *sub_hi) {
  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
                              add_index, sub_index)                      \
  do {                                                                   \
    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
                                &b##_lo[add_index], &b##_hi[add_index],  \
                                &b##_lo[sub_index], &b##_hi[sub_index]); \
  } while (0)

// Like butterfly_one_coeff, but with s32.
static INLINE void butterfly_one_coeff_s32(
    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
    const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
    int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
  const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
  const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
  const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
  const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
  const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
  const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
                          sub_index)                                          \
  do {                                                                        \
    butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
                            a##_lo[right_index], a##_hi[right_index],         \
                            constant, &b##_lo[add_index], &b##_hi[add_index], \
                            &b##_lo[sub_index], &b##_hi[sub_index]);          \
  } while (0)

// Like butterfly_two_coeff, but with s32.
static INLINE void butterfly_two_coeff_s32(
    const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
    const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
    int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
    int32x4_t *sub_hi) {
  const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
  const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
  const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
  const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
  const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
  const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
  const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
  const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
  *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
  *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
  *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
  *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
}

#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,  \
                          right_constant, b, add_index, sub_index)    \
  do {                                                                \
    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],   \
                            a##_lo[right_index], a##_hi[right_index], \
                            left_constant, right_constant,            \
                            &b##_lo[add_index], &b##_hi[add_index],   \
                            &b##_lo[sub_index], &b##_hi[sub_index]);  \
  } while (0)

// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
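// For example: add_round_shift(5) = (5 + 0 + 1) >> 2 = 1 and
// add_round_shift(-5) = (-5 + 1 + 1) >> 2 = -1, matching (a + 1) >> 2 for
// positive a and (a + 2) >> 2 for negative a.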
static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
                                            const int32x4_t a_hi) {
  const int32x4_t one = vdupq_n_s32(1);
  const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
  const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
  const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
  const int16x4_t b_lo =
      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
  const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
  const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
  const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
  const int16x4_t b_hi =
      vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
  return vcombine_s16(b_lo, b_hi);
}

static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];
  int32x4_t c_lo[32];
  int32x4_t c_hi[32];
  int32x4_t d_lo[32];
  int32x4_t d_hi[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);

  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);

  b[16] = a[16];
  b[17] = a[17];
  b[18] = a[18];
  b[19] = a[19];

  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);

  b[28] = a[28];
  b[29] = a[29];
  b[30] = a[30];
  b[31] = a[31];

  // Stage 3. With extreme values for input this calculation rolls over
  // int16_t. The sources for b[0] get added multiple times and, through
  // testing, have been shown to overflow starting here.
  ADD_S16_S32(b, 0, 7, c, 0);
  ADD_S16_S32(b, 1, 6, c, 1);
  ADD_S16_S32(b, 2, 5, c, 2);
  ADD_S16_S32(b, 3, 4, c, 3);
  SUB_S16_S32(b, 3, 4, c, 4);
  SUB_S16_S32(b, 2, 5, c, 5);
  SUB_S16_S32(b, 1, 6, c, 6);
  SUB_S16_S32(b, 0, 7, c, 7);

  a[8] = b[8];
  a[9] = b[9];

  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);

  a[14] = b[14];
  a[15] = b[15];

  ADD_S16_S32(b, 16, 23, c, 16);
  ADD_S16_S32(b, 17, 22, c, 17);
  ADD_S16_S32(b, 18, 21, c, 18);
  ADD_S16_S32(b, 19, 20, c, 19);
  SUB_S16_S32(b, 19, 20, c, 20);
  SUB_S16_S32(b, 18, 21, c, 21);
  SUB_S16_S32(b, 17, 22, c, 22);
  SUB_S16_S32(b, 16, 23, c, 23);
  SUB_S16_S32(b, 31, 24, c, 24);
  SUB_S16_S32(b, 30, 25, c, 25);
  SUB_S16_S32(b, 29, 26, c, 26);
  SUB_S16_S32(b, 28, 27, c, 27);
  ADD_S16_S32(b, 28, 27, c, 28);
  ADD_S16_S32(b, 29, 26, c, 29);
  ADD_S16_S32(b, 30, 25, c, 30);
  ADD_S16_S32(b, 31, 24, c, 31);

  // Stage 4.
  ADD_S32(c, 0, 3, d, 0);
  ADD_S32(c, 1, 2, d, 1);
  SUB_S32(c, 1, 2, d, 2);
  SUB_S32(c, 0, 3, d, 3);

  PASS_THROUGH(c, d, 4);

  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);

  PASS_THROUGH(c, d, 7);

  ADDW_S16_S32(c, 11, a, 8, d, 8);
  ADDW_S16_S32(c, 10, a, 9, d, 9);
  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
  ADDW_S16_S32(c, 13, b, 14, d, 14);
  ADDW_S16_S32(c, 12, b, 15, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 17);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
  BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
  BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);

  PASS_THROUGH(c, d, 22);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 25);

  PASS_THROUGH(c, d, 30);
  PASS_THROUGH(c, d, 31);

  // Stage 5.
  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
  BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);

  ADD_S32(d, 4, 5, c, 4);
  SUB_S32(d, 4, 5, c, 5);
  SUB_S32(d, 7, 6, c, 6);
  ADD_S32(d, 7, 6, c, 7);

  PASS_THROUGH(d, c, 8);

  BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
  BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);

  PASS_THROUGH(d, c, 11);
  PASS_THROUGH(d, c, 12);
  PASS_THROUGH(d, c, 15);

  ADD_S32(d, 16, 19, c, 16);
  ADD_S32(d, 17, 18, c, 17);
  SUB_S32(d, 17, 18, c, 18);
  SUB_S32(d, 16, 19, c, 19);
  SUB_S32(d, 23, 20, c, 20);
  SUB_S32(d, 22, 21, c, 21);
  ADD_S32(d, 22, 21, c, 22);
  ADD_S32(d, 23, 20, c, 23);
  ADD_S32(d, 24, 27, c, 24);
  ADD_S32(d, 25, 26, c, 25);
  SUB_S32(d, 25, 26, c, 26);
  SUB_S32(d, 24, 27, c, 27);
  SUB_S32(d, 31, 28, c, 28);
  SUB_S32(d, 30, 29, c, 29);
  ADD_S32(d, 30, 29, c, 30);
  ADD_S32(d, 31, 28, c, 31);

  // Stage 6.
  PASS_THROUGH(c, d, 0);
  PASS_THROUGH(c, d, 1);
  PASS_THROUGH(c, d, 2);
  PASS_THROUGH(c, d, 3);

  BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
  BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);

  ADD_S32(c, 8, 9, d, 8);
  SUB_S32(c, 8, 9, d, 9);
  SUB_S32(c, 11, 10, d, 10);
  ADD_S32(c, 11, 10, d, 11);
  ADD_S32(c, 12, 13, d, 12);
  SUB_S32(c, 12, 13, d, 13);
  SUB_S32(c, 15, 14, d, 14);
  ADD_S32(c, 15, 14, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 19);
  PASS_THROUGH(c, d, 20);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 27);
  PASS_THROUGH(c, d, 28);
  PASS_THROUGH(c, d, 31);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
  BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
  BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);

  // Stage 7.
  PASS_THROUGH(d, c, 0);
  PASS_THROUGH(d, c, 1);
  PASS_THROUGH(d, c, 2);
  PASS_THROUGH(d, c, 3);
  PASS_THROUGH(d, c, 4);
  PASS_THROUGH(d, c, 5);
  PASS_THROUGH(d, c, 6);
  PASS_THROUGH(d, c, 7);

  BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
  BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
  BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);

  ADD_S32(d, 16, 17, c, 16);
  SUB_S32(d, 16, 17, c, 17);
  SUB_S32(d, 19, 18, c, 18);
  ADD_S32(d, 19, 18, c, 19);
  ADD_S32(d, 20, 21, c, 20);
  SUB_S32(d, 20, 21, c, 21);
  SUB_S32(d, 23, 22, c, 22);
  ADD_S32(d, 23, 22, c, 23);
  ADD_S32(d, 24, 25, c, 24);
  SUB_S32(d, 24, 25, c, 25);
  SUB_S32(d, 27, 26, c, 26);
  ADD_S32(d, 27, 26, c, 27);
  ADD_S32(d, 28, 29, c, 28);
  SUB_S32(d, 28, 29, c, 29);
  SUB_S32(d, 31, 30, c, 30);
  ADD_S32(d, 31, 30, c, 31);

  // Final stage.
  // Roll rounding into this function so we can pass back int16x8.

  out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
  out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);

  out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
  out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
  out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
  out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
  out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);

  out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
  out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
  out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
  out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);

  out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
  out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
  out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
  out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
  out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);

  BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
  out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
  out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
  out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
  out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
  out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
  out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);

  BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
  out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
  out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);

  BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
  out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
  out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);

  BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
  out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
  out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);

  BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
  out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
  out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);

  BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
  out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
  out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
}

// Add 1 if positive, 2 if negative, and shift by 2.
// In practice, add 1, then add the sign bit, then shift without rounding.
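// Same rounding trick as add_round_shift_s32() above, but staying in 16 bits;
// it matches the (input + 1 + (input < 0)) >> 2 rounding used by
// half_round_shift() in the C implementation.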
static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
  const int16x8_t one = vdupq_n_s16(1);
  const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
  const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
  const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
  return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
}

static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  // For the "rd" version, all the values are rounded down after stage 2 to
  // keep the values in 16 bits.
  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));

  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));

  b[16] = add_round_shift_s16(a[16]);
  b[17] = add_round_shift_s16(a[17]);
  b[18] = add_round_shift_s16(a[18]);
  b[19] = add_round_shift_s16(a[19]);

  butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
  b[20] = add_round_shift_s16(b[20]);
  b[21] = add_round_shift_s16(b[21]);
  b[22] = add_round_shift_s16(b[22]);
  b[23] = add_round_shift_s16(b[23]);
  b[24] = add_round_shift_s16(b[24]);
  b[25] = add_round_shift_s16(b[25]);
  b[26] = add_round_shift_s16(b[26]);
  b[27] = add_round_shift_s16(b[27]);

  b[28] = add_round_shift_s16(a[28]);
  b[29] = add_round_shift_s16(a[29]);
  b[30] = add_round_shift_s16(a[30]);
  b[31] = add_round_shift_s16(a[31]);

  // Stage 3.
  a[0] = vaddq_s16(b[0], b[7]);
  a[1] = vaddq_s16(b[1], b[6]);
  a[2] = vaddq_s16(b[2], b[5]);
  a[3] = vaddq_s16(b[3], b[4]);

  a[4] = vsubq_s16(b[3], b[4]);
  a[5] = vsubq_s16(b[2], b[5]);
  a[6] = vsubq_s16(b[1], b[6]);
  a[7] = vsubq_s16(b[0], b[7]);

  a[8] = b[8];
  a[9] = b[9];

  butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
  butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);

  a[14] = b[14];
  a[15] = b[15];

  a[16] = vaddq_s16(b[16], b[23]);
  a[17] = vaddq_s16(b[17], b[22]);
  a[18] = vaddq_s16(b[18], b[21]);
  a[19] = vaddq_s16(b[19], b[20]);

  a[20] = vsubq_s16(b[19], b[20]);
  a[21] = vsubq_s16(b[18], b[21]);
  a[22] = vsubq_s16(b[17], b[22]);
  a[23] = vsubq_s16(b[16], b[23]);

  a[24] = vsubq_s16(b[31], b[24]);
  a[25] = vsubq_s16(b[30], b[25]);
  a[26] = vsubq_s16(b[29], b[26]);
  a[27] = vsubq_s16(b[28], b[27]);

  a[28] = vaddq_s16(b[28], b[27]);
  a[29] = vaddq_s16(b[29], b[26]);
  a[30] = vaddq_s16(b[30], b[25]);
  a[31] = vaddq_s16(b[31], b[24]);

  // Stage 4.
  b[0] = vaddq_s16(a[0], a[3]);
  b[1] = vaddq_s16(a[1], a[2]);
  b[2] = vsubq_s16(a[1], a[2]);
  b[3] = vsubq_s16(a[0], a[3]);

  b[4] = a[4];

  butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);

  b[7] = a[7];

  b[8] = vaddq_s16(a[8], a[11]);
  b[9] = vaddq_s16(a[9], a[10]);
  b[10] = vsubq_s16(a[9], a[10]);
  b[11] = vsubq_s16(a[8], a[11]);
  b[12] = vsubq_s16(a[15], a[12]);
  b[13] = vsubq_s16(a[14], a[13]);
  b[14] = vaddq_s16(a[14], a[13]);
  b[15] = vaddq_s16(a[15], a[12]);

  b[16] = a[16];
  b[17] = a[17];

  butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
  butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
  butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
  butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);

  b[22] = a[22];
  b[23] = a[23];
  b[24] = a[24];
  b[25] = a[25];

  b[30] = a[30];
  b[31] = a[31];

  // Stage 5.
  butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
  butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);

  a[4] = vaddq_s16(b[4], b[5]);
  a[5] = vsubq_s16(b[4], b[5]);
  a[6] = vsubq_s16(b[7], b[6]);
  a[7] = vaddq_s16(b[7], b[6]);

  a[8] = b[8];

  butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
  butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);

  a[11] = b[11];
  a[12] = b[12];

  a[15] = b[15];

  a[16] = vaddq_s16(b[19], b[16]);
  a[17] = vaddq_s16(b[18], b[17]);
  a[18] = vsubq_s16(b[17], b[18]);
  a[19] = vsubq_s16(b[16], b[19]);
  a[20] = vsubq_s16(b[23], b[20]);
  a[21] = vsubq_s16(b[22], b[21]);
  a[22] = vaddq_s16(b[21], b[22]);
  a[23] = vaddq_s16(b[20], b[23]);
  a[24] = vaddq_s16(b[27], b[24]);
  a[25] = vaddq_s16(b[26], b[25]);
  a[26] = vsubq_s16(b[25], b[26]);
  a[27] = vsubq_s16(b[24], b[27]);
  a[28] = vsubq_s16(b[31], b[28]);
  a[29] = vsubq_s16(b[30], b[29]);
  a[30] = vaddq_s16(b[29], b[30]);
  a[31] = vaddq_s16(b[28], b[31]);

  // Stage 6.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];

  butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
  butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);

  b[8] = vaddq_s16(a[8], a[9]);
  b[9] = vsubq_s16(a[8], a[9]);
  b[10] = vsubq_s16(a[11], a[10]);
  b[11] = vaddq_s16(a[11], a[10]);
  b[12] = vaddq_s16(a[12], a[13]);
  b[13] = vsubq_s16(a[12], a[13]);
  b[14] = vsubq_s16(a[15], a[14]);
  b[15] = vaddq_s16(a[15], a[14]);

  b[16] = a[16];
  b[19] = a[19];
  b[20] = a[20];
  b[23] = a[23];
  b[24] = a[24];
  b[27] = a[27];
  b[28] = a[28];
  b[31] = a[31];

  butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
  butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);

  butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
  butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);

  // Stage 7.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  a[4] = b[4];
  a[5] = b[5];
  a[6] = b[6];
  a[7] = b[7];

  butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
  butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
  butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
  butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);

  a[16] = vaddq_s16(b[16], b[17]);
  a[17] = vsubq_s16(b[16], b[17]);
  a[18] = vsubq_s16(b[19], b[18]);
  a[19] = vaddq_s16(b[19], b[18]);
  a[20] = vaddq_s16(b[20], b[21]);
  a[21] = vsubq_s16(b[20], b[21]);
  a[22] = vsubq_s16(b[23], b[22]);
  a[23] = vaddq_s16(b[23], b[22]);
  a[24] = vaddq_s16(b[24], b[25]);
  a[25] = vsubq_s16(b[24], b[25]);
  a[26] = vsubq_s16(b[27], b[26]);
  a[27] = vaddq_s16(b[27], b[26]);
  a[28] = vaddq_s16(b[28], b[29]);
  a[29] = vsubq_s16(b[28], b[29]);
  a[30] = vsubq_s16(b[31], b[30]);
  a[31] = vaddq_s16(b[31], b[30]);

  // Final stage.
  out[0] = a[0];
  out[16] = a[1];
  out[8] = a[2];
  out[24] = a[3];
  out[4] = a[4];
  out[20] = a[5];
  out[12] = a[6];
  out[28] = a[7];
  out[2] = a[8];
  out[18] = a[9];
  out[10] = a[10];
  out[26] = a[11];
  out[6] = a[12];
  out[22] = a[13];
  out[14] = a[14];
  out[30] = a[15];

  butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
  butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
                      &out[15]);
  butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
  butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
  butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
  butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
                      &out[11]);
  butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
                      &out[19]);
  butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
}

#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32

// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
// are all in-place.
// TODO(johannkoenig): share with other fdcts.
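// The net effect is b[j][i] = a[i][j] for the 8x8 block: output vector j
// holds input column j.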
static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
  // Swap 16 bit elements.
  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements.
  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));
  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
                                   vreinterpretq_s32_s16(c3.val[0]));
  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
                                   vreinterpretq_s32_s16(c3.val[1]));

  // Swap 64 bit elements.
  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);

  b[0] = e0.val[0];
  b[1] = e1.val[0];
  b[2] = e2.val[0];
  b[3] = e3.val[0];
  b[4] = e0.val[1];
  b[5] = e1.val[1];
  b[6] = e2.val[1];
  b[7] = e3.val[1];
}

void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  dct_body_first_pass(temp0, temp1);

  load(input + 8, stride, temp0);
  dct_body_first_pass(temp0, temp2);

  load(input + 16, stride, temp0);
  dct_body_first_pass(temp0, temp3);

  load(input + 24, stride, temp0);
  dct_body_first_pass(temp0, temp4);

  // Generate the top row by munging the first set of 8 from each one together.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output, temp5);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 8 * 32, temp5);

  // Third row of 8x32.
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 16 * 32, temp5);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  dct_body_second_pass(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 24 * 32, temp5);
}

void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
                           int stride) {
  int16x8_t temp0[32];
  int16x8_t temp1[32];
  int16x8_t temp2[32];
  int16x8_t temp3[32];
  int16x8_t temp4[32];
  int16x8_t temp5[32];

  // Process in 8x32 columns.
  load(input, stride, temp0);
  dct_body_first_pass(temp0, temp1);

  load(input + 8, stride, temp0);
  dct_body_first_pass(temp0, temp2);

  load(input + 16, stride, temp0);
  dct_body_first_pass(temp0, temp3);

  load(input + 24, stride, temp0);
  dct_body_first_pass(temp0, temp4);

  // Generate the top row by munging the first set of 8 from each one together.
  transpose_8x8(&temp1[0], &temp0[0]);
  transpose_8x8(&temp2[0], &temp0[8]);
  transpose_8x8(&temp3[0], &temp0[16]);
  transpose_8x8(&temp4[0], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output, temp5);

  // Second row of 8x32.
  transpose_8x8(&temp1[8], &temp0[0]);
  transpose_8x8(&temp2[8], &temp0[8]);
  transpose_8x8(&temp3[8], &temp0[16]);
  transpose_8x8(&temp4[8], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 8 * 32, temp5);

  // Third row of 8x32.
  transpose_8x8(&temp1[16], &temp0[0]);
  transpose_8x8(&temp2[16], &temp0[8]);
  transpose_8x8(&temp3[16], &temp0[16]);
  transpose_8x8(&temp4[16], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 16 * 32, temp5);

  // Final row of 8x32.
  transpose_8x8(&temp1[24], &temp0[0]);
  transpose_8x8(&temp2[24], &temp0[8]);
  transpose_8x8(&temp3[24], &temp0[16]);
  transpose_8x8(&temp4[24], &temp0[24]);

  dct_body_second_pass_rd(temp0, temp5);

  transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
                    &temp5[5], &temp5[6], &temp5[7]);
  transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
                    &temp5[13], &temp5[14], &temp5[15]);
  transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
                    &temp5[21], &temp5[22], &temp5[23]);
  transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
                    &temp5[29], &temp5[30], &temp5[31]);
  store(output + 24 * 32, temp5);
}
#endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
        // __GNUC__ == 4 && __GNUC_MINOR__ <= 9