1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/txfm_common.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17 #include "vpx_dsp/arm/transpose_neon.h"
18 
19 // Most gcc 4.9 distributions outside of Android do not generate correct code
20 // for these functions.
21 #if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
22     __GNUC__ == 4 && __GNUC_MINOR__ <= 9
23 
24 void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
25   vpx_fdct32x32_c(input, output, stride);
26 }
27 
28 void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
29                            int stride) {
30   vpx_fdct32x32_rd_c(input, output, stride);
31 }
32 
33 #else
34 
35 #define LOAD_INCREMENT(src, stride, dest, index) \
36   do {                                           \
37     dest[index] = vld1q_s16(src);                \
38     src += stride;                               \
39   } while (0)
40 
41 #define ADD_S16(src, index0, index1, dest, index3)      \
42   do {                                                  \
43     dest[index3] = vaddq_s16(src[index0], src[index1]); \
44   } while (0)
45 
46 #define ADD_SHIFT_S16(src, index0, index1)                             \
47   do {                                                                 \
48     src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
49   } while (0)
50 
51 // Load, cross, and multiply by 4. Load the first 8 and last 8 rows, then the
52 // middle 16 rows. This processes sets of 16 at a time. Maybe sets of 8 would
53 // be better?
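// In scalar terms (a reading of the code below), for each of the 8 columns j
// and for r = 0..15:
//   b[r][j]      = (a[r * stride + j] + a[(31 - r) * stride + j]) * 4
//   b[31 - r][j] = (a[r * stride + j] - a[(31 - r) * stride + j]) * 4
// i.e. stage 1 of the 32-point DCT plus the * 4 input scaling.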
54 static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
55   const int16_t *a_end = a + 24 * stride;
56   int16x8_t c[8];
57 
58   LOAD_INCREMENT(a, stride, b, 0);
59   LOAD_INCREMENT(a, stride, b, 1);
60   LOAD_INCREMENT(a, stride, b, 2);
61   LOAD_INCREMENT(a, stride, b, 3);
62   LOAD_INCREMENT(a, stride, b, 4);
63   LOAD_INCREMENT(a, stride, b, 5);
64   LOAD_INCREMENT(a, stride, b, 6);
65   LOAD_INCREMENT(a, stride, b, 7);
66 
67   LOAD_INCREMENT(a_end, stride, b, 24);
68   LOAD_INCREMENT(a_end, stride, b, 25);
69   LOAD_INCREMENT(a_end, stride, b, 26);
70   LOAD_INCREMENT(a_end, stride, b, 27);
71   LOAD_INCREMENT(a_end, stride, b, 28);
72   LOAD_INCREMENT(a_end, stride, b, 29);
73   LOAD_INCREMENT(a_end, stride, b, 30);
74   LOAD_INCREMENT(a_end, stride, b, 31);
75 
76   ADD_S16(b, 0, 31, c, 0);
77   ADD_S16(b, 1, 30, c, 1);
78   ADD_S16(b, 2, 29, c, 2);
79   ADD_S16(b, 3, 28, c, 3);
80   ADD_S16(b, 4, 27, c, 4);
81   ADD_S16(b, 5, 26, c, 5);
82   ADD_S16(b, 6, 25, c, 6);
83   ADD_S16(b, 7, 24, c, 7);
84 
85   ADD_SHIFT_S16(b, 7, 24);
86   ADD_SHIFT_S16(b, 6, 25);
87   ADD_SHIFT_S16(b, 5, 26);
88   ADD_SHIFT_S16(b, 4, 27);
89   ADD_SHIFT_S16(b, 3, 28);
90   ADD_SHIFT_S16(b, 2, 29);
91   ADD_SHIFT_S16(b, 1, 30);
92   ADD_SHIFT_S16(b, 0, 31);
93 
94   b[0] = vshlq_n_s16(c[0], 2);
95   b[1] = vshlq_n_s16(c[1], 2);
96   b[2] = vshlq_n_s16(c[2], 2);
97   b[3] = vshlq_n_s16(c[3], 2);
98   b[4] = vshlq_n_s16(c[4], 2);
99   b[5] = vshlq_n_s16(c[5], 2);
100   b[6] = vshlq_n_s16(c[6], 2);
101   b[7] = vshlq_n_s16(c[7], 2);
102 
103   LOAD_INCREMENT(a, stride, b, 8);
104   LOAD_INCREMENT(a, stride, b, 9);
105   LOAD_INCREMENT(a, stride, b, 10);
106   LOAD_INCREMENT(a, stride, b, 11);
107   LOAD_INCREMENT(a, stride, b, 12);
108   LOAD_INCREMENT(a, stride, b, 13);
109   LOAD_INCREMENT(a, stride, b, 14);
110   LOAD_INCREMENT(a, stride, b, 15);
111   LOAD_INCREMENT(a, stride, b, 16);
112   LOAD_INCREMENT(a, stride, b, 17);
113   LOAD_INCREMENT(a, stride, b, 18);
114   LOAD_INCREMENT(a, stride, b, 19);
115   LOAD_INCREMENT(a, stride, b, 20);
116   LOAD_INCREMENT(a, stride, b, 21);
117   LOAD_INCREMENT(a, stride, b, 22);
118   LOAD_INCREMENT(a, stride, b, 23);
119 
120   ADD_S16(b, 8, 23, c, 0);
121   ADD_S16(b, 9, 22, c, 1);
122   ADD_S16(b, 10, 21, c, 2);
123   ADD_S16(b, 11, 20, c, 3);
124   ADD_S16(b, 12, 19, c, 4);
125   ADD_S16(b, 13, 18, c, 5);
126   ADD_S16(b, 14, 17, c, 6);
127   ADD_S16(b, 15, 16, c, 7);
128 
129   ADD_SHIFT_S16(b, 15, 16);
130   ADD_SHIFT_S16(b, 14, 17);
131   ADD_SHIFT_S16(b, 13, 18);
132   ADD_SHIFT_S16(b, 12, 19);
133   ADD_SHIFT_S16(b, 11, 20);
134   ADD_SHIFT_S16(b, 10, 21);
135   ADD_SHIFT_S16(b, 9, 22);
136   ADD_SHIFT_S16(b, 8, 23);
137 
138   b[8] = vshlq_n_s16(c[0], 2);
139   b[9] = vshlq_n_s16(c[1], 2);
140   b[10] = vshlq_n_s16(c[2], 2);
141   b[11] = vshlq_n_s16(c[3], 2);
142   b[12] = vshlq_n_s16(c[4], 2);
143   b[13] = vshlq_n_s16(c[5], 2);
144   b[14] = vshlq_n_s16(c[6], 2);
145   b[15] = vshlq_n_s16(c[7], 2);
146 }
147 
148 #undef LOAD_INCREMENT
149 #undef ADD_S16
150 #undef ADD_SHIFT_S16
151 
152 #define STORE_S16(src, index, dest)           \
153   do {                                        \
154     store_s16q_to_tran_low(dest, src[index]); \
155     dest += 8;                                \
156   } while (0)
157 
158 // Store 32 int16x8_t vectors, assuming stride == 32.
159 // Slight twist: store horizontally in blocks of 8.
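// In scalar terms (a reading of the stores below), output row i of the 32x32
// block is assembled from four vectors:
//   output[i * 32 + 8 * k + j] = b[i + 8 * k][j]   for i = 0..7, k = 0..3.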
160 static INLINE void store(tran_low_t *a, const int16x8_t *b) {
161   STORE_S16(b, 0, a);
162   STORE_S16(b, 8, a);
163   STORE_S16(b, 16, a);
164   STORE_S16(b, 24, a);
165   STORE_S16(b, 1, a);
166   STORE_S16(b, 9, a);
167   STORE_S16(b, 17, a);
168   STORE_S16(b, 25, a);
169   STORE_S16(b, 2, a);
170   STORE_S16(b, 10, a);
171   STORE_S16(b, 18, a);
172   STORE_S16(b, 26, a);
173   STORE_S16(b, 3, a);
174   STORE_S16(b, 11, a);
175   STORE_S16(b, 19, a);
176   STORE_S16(b, 27, a);
177   STORE_S16(b, 4, a);
178   STORE_S16(b, 12, a);
179   STORE_S16(b, 20, a);
180   STORE_S16(b, 28, a);
181   STORE_S16(b, 5, a);
182   STORE_S16(b, 13, a);
183   STORE_S16(b, 21, a);
184   STORE_S16(b, 29, a);
185   STORE_S16(b, 6, a);
186   STORE_S16(b, 14, a);
187   STORE_S16(b, 22, a);
188   STORE_S16(b, 30, a);
189   STORE_S16(b, 7, a);
190   STORE_S16(b, 15, a);
191   STORE_S16(b, 23, a);
192   STORE_S16(b, 31, a);
193 }
194 
195 #undef STORE_S16
196 
197 // fdct_round_shift((a +/- b) * c)
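// Worked lane as a sanity check, assuming the usual txfm_common.h values
// DCT_CONST_BITS = 14 and cospi_16_64 = 11585: with a = 100, b = 50 and
// c = cospi_16_64,
//   add = (150 * 11585 + 8192) >> 14 = 106
//   sub = ( 50 * 11585 + 8192) >> 14 = 35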
198 static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
199                                        const tran_high_t constant,
200                                        int16x8_t *add, int16x8_t *sub) {
201   const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
202   const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
203   const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
204   const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
205   const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
206   const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
207   const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
208   const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
209   const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
210   const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
211   *add = vcombine_s16(rounded0, rounded1);
212   *sub = vcombine_s16(rounded2, rounded3);
213 }
214 
215 // add: fdct_round_shift(a * c1 + b * c0), sub: fdct_round_shift(a * c0 - b * c1)
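// Worked lane, assuming the usual txfm_common.h values cospi_8_64 = 15137 and
// cospi_24_64 = 6270: with a = 100, b = 50, c0 = cospi_24_64, c1 = cospi_8_64,
//   add = (100 * 15137 + 50 * 6270 + 8192) >> 14 = 112
//   sub = (100 * 6270 - 50 * 15137 + 8192) >> 14 = -8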
216 static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
217                                        const tran_coef_t constant0,
218                                        const tran_coef_t constant1,
219                                        int16x8_t *add, int16x8_t *sub) {
220   const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
221   const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
222   const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
223   const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
224   const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
225   const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
226   const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
227   const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
228   const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
229   const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
230   const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
231   const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
232   *add = vcombine_s16(rounded0, rounded1);
233   *sub = vcombine_s16(rounded2, rounded3);
234 }
235 
236 // Add 2 if positive, 1 if negative, and shift by 2.
237 // In practice, subtract the sign bit, then shift with rounding.
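// Illustrative check: a lane value of +5 gives vrshrq_n_s16(5 - 0, 2) =
// (5 + 2) >> 2 = 1, matching (5 + 1 + 1) >> 2; a lane value of -5 gives
// (-6 + 2) >> 2 = -1, matching (-5 + 1 + 0) >> 2.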
238 static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
239   const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
240   const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
241   const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
242   return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
243 }
244 
245 static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
246   int16x8_t a[32];
247   int16x8_t b[32];
248 
249   // Stage 1: Done as part of the load.
250 
251   // Stage 2.
252   // Mini cross. Butterfly the first 16 values and the middle 8 of the second half.
253   a[0] = vaddq_s16(in[0], in[15]);
254   a[1] = vaddq_s16(in[1], in[14]);
255   a[2] = vaddq_s16(in[2], in[13]);
256   a[3] = vaddq_s16(in[3], in[12]);
257   a[4] = vaddq_s16(in[4], in[11]);
258   a[5] = vaddq_s16(in[5], in[10]);
259   a[6] = vaddq_s16(in[6], in[9]);
260   a[7] = vaddq_s16(in[7], in[8]);
261 
262   a[8] = vsubq_s16(in[7], in[8]);
263   a[9] = vsubq_s16(in[6], in[9]);
264   a[10] = vsubq_s16(in[5], in[10]);
265   a[11] = vsubq_s16(in[4], in[11]);
266   a[12] = vsubq_s16(in[3], in[12]);
267   a[13] = vsubq_s16(in[2], in[13]);
268   a[14] = vsubq_s16(in[1], in[14]);
269   a[15] = vsubq_s16(in[0], in[15]);
270 
271   a[16] = in[16];
272   a[17] = in[17];
273   a[18] = in[18];
274   a[19] = in[19];
275 
276   butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
277   butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
278   butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
279   butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
280 
281   a[28] = in[28];
282   a[29] = in[29];
283   a[30] = in[30];
284   a[31] = in[31];
285 
286   // Stage 3.
287   b[0] = vaddq_s16(a[0], a[7]);
288   b[1] = vaddq_s16(a[1], a[6]);
289   b[2] = vaddq_s16(a[2], a[5]);
290   b[3] = vaddq_s16(a[3], a[4]);
291 
292   b[4] = vsubq_s16(a[3], a[4]);
293   b[5] = vsubq_s16(a[2], a[5]);
294   b[6] = vsubq_s16(a[1], a[6]);
295   b[7] = vsubq_s16(a[0], a[7]);
296 
297   b[8] = a[8];
298   b[9] = a[9];
299 
300   butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
301   butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
302 
303   b[14] = a[14];
304   b[15] = a[15];
305 
306   b[16] = vaddq_s16(in[16], a[23]);
307   b[17] = vaddq_s16(in[17], a[22]);
308   b[18] = vaddq_s16(in[18], a[21]);
309   b[19] = vaddq_s16(in[19], a[20]);
310 
311   b[20] = vsubq_s16(in[19], a[20]);
312   b[21] = vsubq_s16(in[18], a[21]);
313   b[22] = vsubq_s16(in[17], a[22]);
314   b[23] = vsubq_s16(in[16], a[23]);
315 
316   b[24] = vsubq_s16(in[31], a[24]);
317   b[25] = vsubq_s16(in[30], a[25]);
318   b[26] = vsubq_s16(in[29], a[26]);
319   b[27] = vsubq_s16(in[28], a[27]);
320 
321   b[28] = vaddq_s16(in[28], a[27]);
322   b[29] = vaddq_s16(in[29], a[26]);
323   b[30] = vaddq_s16(in[30], a[25]);
324   b[31] = vaddq_s16(in[31], a[24]);
325 
326   // Stage 4.
327   a[0] = vaddq_s16(b[0], b[3]);
328   a[1] = vaddq_s16(b[1], b[2]);
329   a[2] = vsubq_s16(b[1], b[2]);
330   a[3] = vsubq_s16(b[0], b[3]);
331 
332   a[4] = b[4];
333 
334   butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
335 
336   a[7] = b[7];
337 
338   a[8] = vaddq_s16(b[8], b[11]);
339   a[9] = vaddq_s16(b[9], b[10]);
340   a[10] = vsubq_s16(b[9], b[10]);
341   a[11] = vsubq_s16(b[8], b[11]);
342   a[12] = vsubq_s16(b[15], b[12]);
343   a[13] = vsubq_s16(b[14], b[13]);
344   a[14] = vaddq_s16(b[14], b[13]);
345   a[15] = vaddq_s16(b[15], b[12]);
346 
347   a[16] = b[16];
348   a[17] = b[17];
349 
350   butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
351   butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
352   butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
353   butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
354 
355   a[22] = b[22];
356   a[23] = b[23];
357   a[24] = b[24];
358   a[25] = b[25];
359 
360   a[30] = b[30];
361   a[31] = b[31];
362 
363   // Stage 5.
364   butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
365   butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
366 
367   b[4] = vaddq_s16(a[4], a[5]);
368   b[5] = vsubq_s16(a[4], a[5]);
369   b[6] = vsubq_s16(a[7], a[6]);
370   b[7] = vaddq_s16(a[7], a[6]);
371 
372   b[8] = a[8];
373 
374   butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
375   butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
376 
377   b[11] = a[11];
378   b[12] = a[12];
379 
380   b[15] = a[15];
381 
382   b[16] = vaddq_s16(a[19], a[16]);
383   b[17] = vaddq_s16(a[18], a[17]);
384   b[18] = vsubq_s16(a[17], a[18]);
385   b[19] = vsubq_s16(a[16], a[19]);
386   b[20] = vsubq_s16(a[23], a[20]);
387   b[21] = vsubq_s16(a[22], a[21]);
388   b[22] = vaddq_s16(a[21], a[22]);
389   b[23] = vaddq_s16(a[20], a[23]);
390   b[24] = vaddq_s16(a[27], a[24]);
391   b[25] = vaddq_s16(a[26], a[25]);
392   b[26] = vsubq_s16(a[25], a[26]);
393   b[27] = vsubq_s16(a[24], a[27]);
394   b[28] = vsubq_s16(a[31], a[28]);
395   b[29] = vsubq_s16(a[30], a[29]);
396   b[30] = vaddq_s16(a[29], a[30]);
397   b[31] = vaddq_s16(a[28], a[31]);
398 
399   // Stage 6.
400   a[0] = b[0];
401   a[1] = b[1];
402   a[2] = b[2];
403   a[3] = b[3];
404 
405   butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
406   butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
407 
408   a[8] = vaddq_s16(b[8], b[9]);
409   a[9] = vsubq_s16(b[8], b[9]);
410   a[10] = vsubq_s16(b[11], b[10]);
411   a[11] = vaddq_s16(b[11], b[10]);
412   a[12] = vaddq_s16(b[12], b[13]);
413   a[13] = vsubq_s16(b[12], b[13]);
414   a[14] = vsubq_s16(b[15], b[14]);
415   a[15] = vaddq_s16(b[15], b[14]);
416 
417   a[16] = b[16];
418   a[19] = b[19];
419   a[20] = b[20];
420   a[23] = b[23];
421   a[24] = b[24];
422   a[27] = b[27];
423   a[28] = b[28];
424   a[31] = b[31];
425 
426   butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
427   butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
428 
429   butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
430   butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
431 
432   // Stage 7.
433   b[0] = a[0];
434   b[1] = a[1];
435   b[2] = a[2];
436   b[3] = a[3];
437   b[4] = a[4];
438   b[5] = a[5];
439   b[6] = a[6];
440   b[7] = a[7];
441 
442   butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
443   butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
444   butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
445   butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
446 
447   b[16] = vaddq_s16(a[16], a[17]);
448   b[17] = vsubq_s16(a[16], a[17]);
449   b[18] = vsubq_s16(a[19], a[18]);
450   b[19] = vaddq_s16(a[19], a[18]);
451   b[20] = vaddq_s16(a[20], a[21]);
452   b[21] = vsubq_s16(a[20], a[21]);
453   b[22] = vsubq_s16(a[23], a[22]);
454   b[23] = vaddq_s16(a[23], a[22]);
455   b[24] = vaddq_s16(a[24], a[25]);
456   b[25] = vsubq_s16(a[24], a[25]);
457   b[26] = vsubq_s16(a[27], a[26]);
458   b[27] = vaddq_s16(a[27], a[26]);
459   b[28] = vaddq_s16(a[28], a[29]);
460   b[29] = vsubq_s16(a[28], a[29]);
461   b[30] = vsubq_s16(a[31], a[30]);
462   b[31] = vaddq_s16(a[31], a[30]);
463 
464   // Final stage.
465   // Also compute partial rounding shift:
466   // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
467   out[0] = sub_round_shift(b[0]);
468   out[16] = sub_round_shift(b[1]);
469   out[8] = sub_round_shift(b[2]);
470   out[24] = sub_round_shift(b[3]);
471   out[4] = sub_round_shift(b[4]);
472   out[20] = sub_round_shift(b[5]);
473   out[12] = sub_round_shift(b[6]);
474   out[28] = sub_round_shift(b[7]);
475   out[2] = sub_round_shift(b[8]);
476   out[18] = sub_round_shift(b[9]);
477   out[10] = sub_round_shift(b[10]);
478   out[26] = sub_round_shift(b[11]);
479   out[6] = sub_round_shift(b[12]);
480   out[22] = sub_round_shift(b[13]);
481   out[14] = sub_round_shift(b[14]);
482   out[30] = sub_round_shift(b[15]);
483 
484   butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
485   out[1] = sub_round_shift(a[1]);
486   out[31] = sub_round_shift(a[31]);
487 
488   butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
489   out[17] = sub_round_shift(a[17]);
490   out[15] = sub_round_shift(a[15]);
491 
492   butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
493   out[9] = sub_round_shift(a[9]);
494   out[23] = sub_round_shift(a[23]);
495 
496   butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
497   out[25] = sub_round_shift(a[25]);
498   out[7] = sub_round_shift(a[7]);
499 
500   butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
501   out[5] = sub_round_shift(a[5]);
502   out[27] = sub_round_shift(a[27]);
503 
504   butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
505   out[21] = sub_round_shift(a[21]);
506   out[11] = sub_round_shift(a[11]);
507 
508   butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
509   out[13] = sub_round_shift(a[13]);
510   out[19] = sub_round_shift(a[19]);
511 
512   butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
513   out[29] = sub_round_shift(a[29]);
514   out[3] = sub_round_shift(a[3]);
515 }
516 
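// The second-pass helpers below operate on paired arrays named <x>_lo and
// <x>_hi, holding the widened (32-bit) low and high halves of each 8-lane
// vector; the ## token pasting in the macros selects those arrays.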
517 #define PASS_THROUGH(src, dst, element)    \
518   do {                                     \
519     dst##_lo[element] = src##_lo[element]; \
520     dst##_hi[element] = src##_hi[element]; \
521   } while (0)
522 
523 #define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
524   do {                                                                        \
525     b##_lo[b_index] =                                                         \
526         vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
527     b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
528                                 vget_high_s16(a[right_index]));               \
529   } while (0)
530 
531 #define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
532   do {                                                                        \
533     b##_lo[b_index] =                                                         \
534         vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
535     b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
536                                 vget_high_s16(a[right_index]));               \
537   } while (0)
538 
539 #define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
540   do {                                                                       \
541     c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
542     c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
543   } while (0)
544 
545 #define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
546   do {                                                                     \
547     temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
548     temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
549     c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
550     c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
551   } while (0)
552 
553 #define ADD_S32(a, left_index, right_index, b, b_index)                   \
554   do {                                                                    \
555     b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
556     b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
557   } while (0)
558 
559 #define SUB_S32(a, left_index, right_index, b, b_index)                   \
560   do {                                                                    \
561     b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
562     b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
563   } while (0)
564 
565 // Like butterfly_one_coeff, but don't narrow results.
566 static INLINE void butterfly_one_coeff_s16_s32(
567     const int16x8_t a, const int16x8_t b, const tran_high_t constant,
568     int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
569     int32x4_t *sub_hi) {
570   const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
571   const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
572   const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
573   const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
574   const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
575   const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
576   *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
577   *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
578   *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
579   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
580 }
581 
582 #define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
583                               add_index, sub_index)                      \
584   do {                                                                   \
585     butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
586                                 &b##_lo[add_index], &b##_hi[add_index],  \
587                                 &b##_lo[sub_index], &b##_hi[sub_index]); \
588   } while (0)
589 
590 // Like butterfly_one_coeff, but with s32.
591 static INLINE void butterfly_one_coeff_s32(
592     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
593     const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
594     int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
595   const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
596   const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
597   const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
598   const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
599   const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
600   const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
601   *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
602   *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
603   *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
604   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
605 }
606 
607 #define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
608                           sub_index)                                          \
609   do {                                                                        \
610     butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index],           \
611                             a##_lo[right_index], a##_hi[right_index],         \
612                             constant, &b##_lo[add_index], &b##_hi[add_index], \
613                             &b##_lo[sub_index], &b##_hi[sub_index]);          \
614   } while (0)
615 
616 // Like butterfly_two_coeff, but with s32.
617 static INLINE void butterfly_two_coeff_s32(
618     const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
619     const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
620     int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
621     int32x4_t *sub_hi) {
622   const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
623   const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
624   const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
625   const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
626   const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
627   const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
628   const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
629   const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
630   *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
631   *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
632   *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
633   *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
634 }
635 
636 #define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
637                           right_constant, b, add_index, sub_index)             \
638   do {                                                                         \
639     butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
640                             a##_lo[right_index], a##_hi[right_index],          \
641                             left_constant, right_constant, &b##_lo[add_index], \
642                             &b##_hi[add_index], &b##_lo[sub_index],            \
643                             &b##_hi[sub_index]);                               \
644   } while (0)
645 
646 // Add 1 if positive, 2 if negative, and shift by 2.
647 // In practice, add 1, then add the sign bit, then shift without rounding.
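// Illustrative check: a lane value of +5 gives (5 + 0 + 1) >> 2 = 1 and a lane
// value of -5 gives (-5 + 1 + 1) >> 2 = -1, matching the scalar form described
// above, (x + 1 + (x < 0)) >> 2.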
648 static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
649                                             const int32x4_t a_hi) {
650   const int32x4_t one = vdupq_n_s32(1);
651   const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
652   const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
653   const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
654   const int16x4_t b_lo =
655       vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
656   const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
657   const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
658   const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
659   const int16x4_t b_hi =
660       vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
661   return vcombine_s16(b_lo, b_hi);
662 }
663 
664 static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
665   int16x8_t a[32];
666   int16x8_t b[32];
667   int32x4_t c_lo[32];
668   int32x4_t c_hi[32];
669   int32x4_t d_lo[32];
670   int32x4_t d_hi[32];
671 
672   // Stage 1. (In the first pass this was done as part of the load.)
673   a[0] = vaddq_s16(in[0], in[31]);
674   a[1] = vaddq_s16(in[1], in[30]);
675   a[2] = vaddq_s16(in[2], in[29]);
676   a[3] = vaddq_s16(in[3], in[28]);
677   a[4] = vaddq_s16(in[4], in[27]);
678   a[5] = vaddq_s16(in[5], in[26]);
679   a[6] = vaddq_s16(in[6], in[25]);
680   a[7] = vaddq_s16(in[7], in[24]);
681   a[8] = vaddq_s16(in[8], in[23]);
682   a[9] = vaddq_s16(in[9], in[22]);
683   a[10] = vaddq_s16(in[10], in[21]);
684   a[11] = vaddq_s16(in[11], in[20]);
685   a[12] = vaddq_s16(in[12], in[19]);
686   a[13] = vaddq_s16(in[13], in[18]);
687   a[14] = vaddq_s16(in[14], in[17]);
688   a[15] = vaddq_s16(in[15], in[16]);
689   a[16] = vsubq_s16(in[15], in[16]);
690   a[17] = vsubq_s16(in[14], in[17]);
691   a[18] = vsubq_s16(in[13], in[18]);
692   a[19] = vsubq_s16(in[12], in[19]);
693   a[20] = vsubq_s16(in[11], in[20]);
694   a[21] = vsubq_s16(in[10], in[21]);
695   a[22] = vsubq_s16(in[9], in[22]);
696   a[23] = vsubq_s16(in[8], in[23]);
697   a[24] = vsubq_s16(in[7], in[24]);
698   a[25] = vsubq_s16(in[6], in[25]);
699   a[26] = vsubq_s16(in[5], in[26]);
700   a[27] = vsubq_s16(in[4], in[27]);
701   a[28] = vsubq_s16(in[3], in[28]);
702   a[29] = vsubq_s16(in[2], in[29]);
703   a[30] = vsubq_s16(in[1], in[30]);
704   a[31] = vsubq_s16(in[0], in[31]);
705 
706   // Stage 2.
707   b[0] = vaddq_s16(a[0], a[15]);
708   b[1] = vaddq_s16(a[1], a[14]);
709   b[2] = vaddq_s16(a[2], a[13]);
710   b[3] = vaddq_s16(a[3], a[12]);
711   b[4] = vaddq_s16(a[4], a[11]);
712   b[5] = vaddq_s16(a[5], a[10]);
713   b[6] = vaddq_s16(a[6], a[9]);
714   b[7] = vaddq_s16(a[7], a[8]);
715 
716   b[8] = vsubq_s16(a[7], a[8]);
717   b[9] = vsubq_s16(a[6], a[9]);
718   b[10] = vsubq_s16(a[5], a[10]);
719   b[11] = vsubq_s16(a[4], a[11]);
720   b[12] = vsubq_s16(a[3], a[12]);
721   b[13] = vsubq_s16(a[2], a[13]);
722   b[14] = vsubq_s16(a[1], a[14]);
723   b[15] = vsubq_s16(a[0], a[15]);
724 
725   b[16] = a[16];
726   b[17] = a[17];
727   b[18] = a[18];
728   b[19] = a[19];
729 
730   butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
731   butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
732   butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
733   butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
734 
735   b[28] = a[28];
736   b[29] = a[29];
737   b[30] = a[30];
738   b[31] = a[31];
739 
740   // Stage 3. With extreme input values this calculation overflows int16_t.
741   // The sources feeding b[0] get added multiple times and, through testing,
742   // have been shown to overflow starting here, so the math widens to 32 bits.
743   ADD_S16_S32(b, 0, 7, c, 0);
744   ADD_S16_S32(b, 1, 6, c, 1);
745   ADD_S16_S32(b, 2, 5, c, 2);
746   ADD_S16_S32(b, 3, 4, c, 3);
747   SUB_S16_S32(b, 3, 4, c, 4);
748   SUB_S16_S32(b, 2, 5, c, 5);
749   SUB_S16_S32(b, 1, 6, c, 6);
750   SUB_S16_S32(b, 0, 7, c, 7);
751 
752   a[8] = b[8];
753   a[9] = b[9];
754 
755   BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
756   BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
757 
758   a[14] = b[14];
759   a[15] = b[15];
760 
761   ADD_S16_S32(b, 16, 23, c, 16);
762   ADD_S16_S32(b, 17, 22, c, 17);
763   ADD_S16_S32(b, 18, 21, c, 18);
764   ADD_S16_S32(b, 19, 20, c, 19);
765   SUB_S16_S32(b, 19, 20, c, 20);
766   SUB_S16_S32(b, 18, 21, c, 21);
767   SUB_S16_S32(b, 17, 22, c, 22);
768   SUB_S16_S32(b, 16, 23, c, 23);
769   SUB_S16_S32(b, 31, 24, c, 24);
770   SUB_S16_S32(b, 30, 25, c, 25);
771   SUB_S16_S32(b, 29, 26, c, 26);
772   SUB_S16_S32(b, 28, 27, c, 27);
773   ADD_S16_S32(b, 28, 27, c, 28);
774   ADD_S16_S32(b, 29, 26, c, 29);
775   ADD_S16_S32(b, 30, 25, c, 30);
776   ADD_S16_S32(b, 31, 24, c, 31);
777 
778   // Stage 4.
779   ADD_S32(c, 0, 3, d, 0);
780   ADD_S32(c, 1, 2, d, 1);
781   SUB_S32(c, 1, 2, d, 2);
782   SUB_S32(c, 0, 3, d, 3);
783 
784   PASS_THROUGH(c, d, 4);
785 
786   BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
787 
788   PASS_THROUGH(c, d, 7);
789 
790   ADDW_S16_S32(c, 11, a, 8, d, 8);
791   ADDW_S16_S32(c, 10, a, 9, d, 9);
792   SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
793   SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
794   SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
795   SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
796   ADDW_S16_S32(c, 13, b, 14, d, 14);
797   ADDW_S16_S32(c, 12, b, 15, d, 15);
798 
799   PASS_THROUGH(c, d, 16);
800   PASS_THROUGH(c, d, 17);
801 
802   BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
803   BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
804   BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
805   BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
806 
807   PASS_THROUGH(c, d, 22);
808   PASS_THROUGH(c, d, 23);
809   PASS_THROUGH(c, d, 24);
810   PASS_THROUGH(c, d, 25);
811 
812   PASS_THROUGH(c, d, 30);
813   PASS_THROUGH(c, d, 31);
814 
815   // Stage 5.
816   BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
817   BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
818 
819   ADD_S32(d, 4, 5, c, 4);
820   SUB_S32(d, 4, 5, c, 5);
821   SUB_S32(d, 7, 6, c, 6);
822   ADD_S32(d, 7, 6, c, 7);
823 
824   PASS_THROUGH(d, c, 8);
825 
826   BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
827   BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
828 
829   PASS_THROUGH(d, c, 11);
830   PASS_THROUGH(d, c, 12);
831   PASS_THROUGH(d, c, 15);
832 
833   ADD_S32(d, 16, 19, c, 16);
834   ADD_S32(d, 17, 18, c, 17);
835   SUB_S32(d, 17, 18, c, 18);
836   SUB_S32(d, 16, 19, c, 19);
837   SUB_S32(d, 23, 20, c, 20);
838   SUB_S32(d, 22, 21, c, 21);
839   ADD_S32(d, 22, 21, c, 22);
840   ADD_S32(d, 23, 20, c, 23);
841   ADD_S32(d, 24, 27, c, 24);
842   ADD_S32(d, 25, 26, c, 25);
843   SUB_S32(d, 25, 26, c, 26);
844   SUB_S32(d, 24, 27, c, 27);
845   SUB_S32(d, 31, 28, c, 28);
846   SUB_S32(d, 30, 29, c, 29);
847   ADD_S32(d, 30, 29, c, 30);
848   ADD_S32(d, 31, 28, c, 31);
849 
850   // Stage 6.
851   PASS_THROUGH(c, d, 0);
852   PASS_THROUGH(c, d, 1);
853   PASS_THROUGH(c, d, 2);
854   PASS_THROUGH(c, d, 3);
855 
856   BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
857   BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
858 
859   ADD_S32(c, 8, 9, d, 8);
860   SUB_S32(c, 8, 9, d, 9);
861   SUB_S32(c, 11, 10, d, 10);
862   ADD_S32(c, 11, 10, d, 11);
863   ADD_S32(c, 12, 13, d, 12);
864   SUB_S32(c, 12, 13, d, 13);
865   SUB_S32(c, 15, 14, d, 14);
866   ADD_S32(c, 15, 14, d, 15);
867 
868   PASS_THROUGH(c, d, 16);
869   PASS_THROUGH(c, d, 19);
870   PASS_THROUGH(c, d, 20);
871   PASS_THROUGH(c, d, 23);
872   PASS_THROUGH(c, d, 24);
873   PASS_THROUGH(c, d, 27);
874   PASS_THROUGH(c, d, 28);
875   PASS_THROUGH(c, d, 31);
876 
877   BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
878   BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
879   BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
880   BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
881 
882   // Stage 7.
883   PASS_THROUGH(d, c, 0);
884   PASS_THROUGH(d, c, 1);
885   PASS_THROUGH(d, c, 2);
886   PASS_THROUGH(d, c, 3);
887   PASS_THROUGH(d, c, 4);
888   PASS_THROUGH(d, c, 5);
889   PASS_THROUGH(d, c, 6);
890   PASS_THROUGH(d, c, 7);
891 
892   BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
893   BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
894   BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
895   BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
896 
897   ADD_S32(d, 16, 17, c, 16);
898   SUB_S32(d, 16, 17, c, 17);
899   SUB_S32(d, 19, 18, c, 18);
900   ADD_S32(d, 19, 18, c, 19);
901   ADD_S32(d, 20, 21, c, 20);
902   SUB_S32(d, 20, 21, c, 21);
903   SUB_S32(d, 23, 22, c, 22);
904   ADD_S32(d, 23, 22, c, 23);
905   ADD_S32(d, 24, 25, c, 24);
906   SUB_S32(d, 24, 25, c, 25);
907   SUB_S32(d, 27, 26, c, 26);
908   ADD_S32(d, 27, 26, c, 27);
909   ADD_S32(d, 28, 29, c, 28);
910   SUB_S32(d, 28, 29, c, 29);
911   SUB_S32(d, 31, 30, c, 30);
912   ADD_S32(d, 31, 30, c, 31);
913 
914   // Final stage.
915   // Roll rounding into this function so we can pass back int16x8_t.
916 
917   out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
918   out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
919 
920   out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
921   out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
922   out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
923   out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
924   out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
925 
926   out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
927   out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
928   out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
929   out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
930 
931   out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
932   out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
933   out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
934   out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
935   out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
936 
937   BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
938   out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
939   out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
940 
941   BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
942   out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
943   out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
944 
945   BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
946   out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
947   out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
948 
949   BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
950   out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
951   out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
952 
953   BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
954   out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
955   out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
956 
957   BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
958   out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
959   out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
960 
961   BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
962   out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
963   out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
964 
965   BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
966   out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
967   out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
968 }
969 
970 // Add 1 if positive, 2 if negative, and shift by 2.
971 // In practice, add 1, then add the sign bit, then shift without rounding.
972 static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
973   const int16x8_t one = vdupq_n_s16(1);
974   const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
975   const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
976   const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
977   return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
978 }
979 
980 static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
981   int16x8_t a[32];
982   int16x8_t b[32];
983 
984   // Stage 1. (In the first pass this was done as part of the load.)
985   a[0] = vaddq_s16(in[0], in[31]);
986   a[1] = vaddq_s16(in[1], in[30]);
987   a[2] = vaddq_s16(in[2], in[29]);
988   a[3] = vaddq_s16(in[3], in[28]);
989   a[4] = vaddq_s16(in[4], in[27]);
990   a[5] = vaddq_s16(in[5], in[26]);
991   a[6] = vaddq_s16(in[6], in[25]);
992   a[7] = vaddq_s16(in[7], in[24]);
993   a[8] = vaddq_s16(in[8], in[23]);
994   a[9] = vaddq_s16(in[9], in[22]);
995   a[10] = vaddq_s16(in[10], in[21]);
996   a[11] = vaddq_s16(in[11], in[20]);
997   a[12] = vaddq_s16(in[12], in[19]);
998   a[13] = vaddq_s16(in[13], in[18]);
999   a[14] = vaddq_s16(in[14], in[17]);
1000   a[15] = vaddq_s16(in[15], in[16]);
1001   a[16] = vsubq_s16(in[15], in[16]);
1002   a[17] = vsubq_s16(in[14], in[17]);
1003   a[18] = vsubq_s16(in[13], in[18]);
1004   a[19] = vsubq_s16(in[12], in[19]);
1005   a[20] = vsubq_s16(in[11], in[20]);
1006   a[21] = vsubq_s16(in[10], in[21]);
1007   a[22] = vsubq_s16(in[9], in[22]);
1008   a[23] = vsubq_s16(in[8], in[23]);
1009   a[24] = vsubq_s16(in[7], in[24]);
1010   a[25] = vsubq_s16(in[6], in[25]);
1011   a[26] = vsubq_s16(in[5], in[26]);
1012   a[27] = vsubq_s16(in[4], in[27]);
1013   a[28] = vsubq_s16(in[3], in[28]);
1014   a[29] = vsubq_s16(in[2], in[29]);
1015   a[30] = vsubq_s16(in[1], in[30]);
1016   a[31] = vsubq_s16(in[0], in[31]);
1017 
1018   // Stage 2.
1019   // For the "rd" version, all the values are rounded and shifted down by 2
1020   // after stage 2 to keep the rest of the transform within 16 bits.
1021   b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
1022   b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
1023   b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
1024   b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
1025   b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
1026   b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
1027   b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
1028   b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
1029 
1030   b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
1031   b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
1032   b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
1033   b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
1034   b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
1035   b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
1036   b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
1037   b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
1038 
1039   b[16] = add_round_shift_s16(a[16]);
1040   b[17] = add_round_shift_s16(a[17]);
1041   b[18] = add_round_shift_s16(a[18]);
1042   b[19] = add_round_shift_s16(a[19]);
1043 
1044   butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
1045   butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
1046   butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
1047   butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
1048   b[20] = add_round_shift_s16(b[20]);
1049   b[21] = add_round_shift_s16(b[21]);
1050   b[22] = add_round_shift_s16(b[22]);
1051   b[23] = add_round_shift_s16(b[23]);
1052   b[24] = add_round_shift_s16(b[24]);
1053   b[25] = add_round_shift_s16(b[25]);
1054   b[26] = add_round_shift_s16(b[26]);
1055   b[27] = add_round_shift_s16(b[27]);
1056 
1057   b[28] = add_round_shift_s16(a[28]);
1058   b[29] = add_round_shift_s16(a[29]);
1059   b[30] = add_round_shift_s16(a[30]);
1060   b[31] = add_round_shift_s16(a[31]);
1061 
1062   // Stage 3.
1063   a[0] = vaddq_s16(b[0], b[7]);
1064   a[1] = vaddq_s16(b[1], b[6]);
1065   a[2] = vaddq_s16(b[2], b[5]);
1066   a[3] = vaddq_s16(b[3], b[4]);
1067 
1068   a[4] = vsubq_s16(b[3], b[4]);
1069   a[5] = vsubq_s16(b[2], b[5]);
1070   a[6] = vsubq_s16(b[1], b[6]);
1071   a[7] = vsubq_s16(b[0], b[7]);
1072 
1073   a[8] = b[8];
1074   a[9] = b[9];
1075 
1076   butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
1077   butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
1078 
1079   a[14] = b[14];
1080   a[15] = b[15];
1081 
1082   a[16] = vaddq_s16(b[16], b[23]);
1083   a[17] = vaddq_s16(b[17], b[22]);
1084   a[18] = vaddq_s16(b[18], b[21]);
1085   a[19] = vaddq_s16(b[19], b[20]);
1086 
1087   a[20] = vsubq_s16(b[19], b[20]);
1088   a[21] = vsubq_s16(b[18], b[21]);
1089   a[22] = vsubq_s16(b[17], b[22]);
1090   a[23] = vsubq_s16(b[16], b[23]);
1091 
1092   a[24] = vsubq_s16(b[31], b[24]);
1093   a[25] = vsubq_s16(b[30], b[25]);
1094   a[26] = vsubq_s16(b[29], b[26]);
1095   a[27] = vsubq_s16(b[28], b[27]);
1096 
1097   a[28] = vaddq_s16(b[28], b[27]);
1098   a[29] = vaddq_s16(b[29], b[26]);
1099   a[30] = vaddq_s16(b[30], b[25]);
1100   a[31] = vaddq_s16(b[31], b[24]);
1101 
1102   // Stage 4.
1103   b[0] = vaddq_s16(a[0], a[3]);
1104   b[1] = vaddq_s16(a[1], a[2]);
1105   b[2] = vsubq_s16(a[1], a[2]);
1106   b[3] = vsubq_s16(a[0], a[3]);
1107 
1108   b[4] = a[4];
1109 
1110   butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
1111 
1112   b[7] = a[7];
1113 
1114   b[8] = vaddq_s16(a[8], a[11]);
1115   b[9] = vaddq_s16(a[9], a[10]);
1116   b[10] = vsubq_s16(a[9], a[10]);
1117   b[11] = vsubq_s16(a[8], a[11]);
1118   b[12] = vsubq_s16(a[15], a[12]);
1119   b[13] = vsubq_s16(a[14], a[13]);
1120   b[14] = vaddq_s16(a[14], a[13]);
1121   b[15] = vaddq_s16(a[15], a[12]);
1122 
1123   b[16] = a[16];
1124   b[17] = a[17];
1125 
1126   butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
1127   butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
1128   butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
1129   butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
1130 
1131   b[22] = a[22];
1132   b[23] = a[23];
1133   b[24] = a[24];
1134   b[25] = a[25];
1135 
1136   b[30] = a[30];
1137   b[31] = a[31];
1138 
1139   // Stage 5.
1140   butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
1141   butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
1142 
1143   a[4] = vaddq_s16(b[4], b[5]);
1144   a[5] = vsubq_s16(b[4], b[5]);
1145   a[6] = vsubq_s16(b[7], b[6]);
1146   a[7] = vaddq_s16(b[7], b[6]);
1147 
1148   a[8] = b[8];
1149 
1150   butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
1151   butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
1152 
1153   a[11] = b[11];
1154   a[12] = b[12];
1155 
1156   a[15] = b[15];
1157 
1158   a[16] = vaddq_s16(b[19], b[16]);
1159   a[17] = vaddq_s16(b[18], b[17]);
1160   a[18] = vsubq_s16(b[17], b[18]);
1161   a[19] = vsubq_s16(b[16], b[19]);
1162   a[20] = vsubq_s16(b[23], b[20]);
1163   a[21] = vsubq_s16(b[22], b[21]);
1164   a[22] = vaddq_s16(b[21], b[22]);
1165   a[23] = vaddq_s16(b[20], b[23]);
1166   a[24] = vaddq_s16(b[27], b[24]);
1167   a[25] = vaddq_s16(b[26], b[25]);
1168   a[26] = vsubq_s16(b[25], b[26]);
1169   a[27] = vsubq_s16(b[24], b[27]);
1170   a[28] = vsubq_s16(b[31], b[28]);
1171   a[29] = vsubq_s16(b[30], b[29]);
1172   a[30] = vaddq_s16(b[29], b[30]);
1173   a[31] = vaddq_s16(b[28], b[31]);
1174 
1175   // Stage 6.
1176   b[0] = a[0];
1177   b[1] = a[1];
1178   b[2] = a[2];
1179   b[3] = a[3];
1180 
1181   butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
1182   butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
1183 
1184   b[8] = vaddq_s16(a[8], a[9]);
1185   b[9] = vsubq_s16(a[8], a[9]);
1186   b[10] = vsubq_s16(a[11], a[10]);
1187   b[11] = vaddq_s16(a[11], a[10]);
1188   b[12] = vaddq_s16(a[12], a[13]);
1189   b[13] = vsubq_s16(a[12], a[13]);
1190   b[14] = vsubq_s16(a[15], a[14]);
1191   b[15] = vaddq_s16(a[15], a[14]);
1192 
1193   b[16] = a[16];
1194   b[19] = a[19];
1195   b[20] = a[20];
1196   b[23] = a[23];
1197   b[24] = a[24];
1198   b[27] = a[27];
1199   b[28] = a[28];
1200   b[31] = a[31];
1201 
1202   butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
1203   butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
1204 
1205   butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
1206   butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
1207 
1208   // Stage 7.
1209   a[0] = b[0];
1210   a[1] = b[1];
1211   a[2] = b[2];
1212   a[3] = b[3];
1213   a[4] = b[4];
1214   a[5] = b[5];
1215   a[6] = b[6];
1216   a[7] = b[7];
1217 
1218   butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
1219   butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
1220   butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
1221   butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
1222 
1223   a[16] = vaddq_s16(b[16], b[17]);
1224   a[17] = vsubq_s16(b[16], b[17]);
1225   a[18] = vsubq_s16(b[19], b[18]);
1226   a[19] = vaddq_s16(b[19], b[18]);
1227   a[20] = vaddq_s16(b[20], b[21]);
1228   a[21] = vsubq_s16(b[20], b[21]);
1229   a[22] = vsubq_s16(b[23], b[22]);
1230   a[23] = vaddq_s16(b[23], b[22]);
1231   a[24] = vaddq_s16(b[24], b[25]);
1232   a[25] = vsubq_s16(b[24], b[25]);
1233   a[26] = vsubq_s16(b[27], b[26]);
1234   a[27] = vaddq_s16(b[27], b[26]);
1235   a[28] = vaddq_s16(b[28], b[29]);
1236   a[29] = vsubq_s16(b[28], b[29]);
1237   a[30] = vsubq_s16(b[31], b[30]);
1238   a[31] = vaddq_s16(b[31], b[30]);
1239 
1240   // Final stage.
1241   out[0] = a[0];
1242   out[16] = a[1];
1243   out[8] = a[2];
1244   out[24] = a[3];
1245   out[4] = a[4];
1246   out[20] = a[5];
1247   out[12] = a[6];
1248   out[28] = a[7];
1249   out[2] = a[8];
1250   out[18] = a[9];
1251   out[10] = a[10];
1252   out[26] = a[11];
1253   out[6] = a[12];
1254   out[22] = a[13];
1255   out[14] = a[14];
1256   out[30] = a[15];
1257 
1258   butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
1259   butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
1260                       &out[15]);
1261   butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
1262   butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
1263   butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
1264   butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
1265                       &out[11]);
1266   butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
1267                       &out[19]);
1268   butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
1269 }
1270 
1271 #undef PASS_THROUGH
1272 #undef ADD_S16_S32
1273 #undef SUB_S16_S32
1274 #undef ADDW_S16_S32
1275 #undef SUBW_S16_S32
1276 #undef ADD_S32
1277 #undef SUB_S32
1278 #undef BUTTERFLY_ONE_S16_S32
1279 #undef BUTTERFLY_ONE_S32
1280 #undef BUTTERFLY_TWO_S32
1281 
1282 // Transpose 8x8 to a new location. Don't use transpose_neon.h because those
1283 // are all in-place.
1284 // TODO(johannkoenig): share with other fdcts.
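// In terms of lanes, the result is b[j][i] = a[i][j] for i, j in 0..7.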
1285 static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
1286   // Swap 16 bit elements.
1287   const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
1288   const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
1289   const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
1290   const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
1291 
1292   // Swap 32 bit elements.
1293   const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
1294                                    vreinterpretq_s32_s16(c1.val[0]));
1295   const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
1296                                    vreinterpretq_s32_s16(c1.val[1]));
1297   const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
1298                                    vreinterpretq_s32_s16(c3.val[0]));
1299   const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
1300                                    vreinterpretq_s32_s16(c3.val[1]));
1301 
1302   // Swap 64 bit elements
1303   const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
1304   const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
1305   const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
1306   const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
1307 
1308   b[0] = e0.val[0];
1309   b[1] = e1.val[0];
1310   b[2] = e2.val[0];
1311   b[3] = e3.val[0];
1312   b[4] = e0.val[1];
1313   b[5] = e1.val[1];
1314   b[6] = e2.val[1];
1315   b[7] = e3.val[1];
1316 }
1317 
1318 void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
1319   int16x8_t temp0[32];
1320   int16x8_t temp1[32];
1321   int16x8_t temp2[32];
1322   int16x8_t temp3[32];
1323   int16x8_t temp4[32];
1324   int16x8_t temp5[32];
1325 
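  // Overall flow: the first pass runs the 32-point column DCT on four
  // 8-column slices; the slices are then transposed 8x8 tiles at a time so the
  // second pass can run the DCT along the original rows (widening to 32 bits
  // internally), and each result is transposed back before being stored.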
1326   // Process in 8x32 columns.
1327   load(input, stride, temp0);
1328   dct_body_first_pass(temp0, temp1);
1329 
1330   load(input + 8, stride, temp0);
1331   dct_body_first_pass(temp0, temp2);
1332 
1333   load(input + 16, stride, temp0);
1334   dct_body_first_pass(temp0, temp3);
1335 
1336   load(input + 24, stride, temp0);
1337   dct_body_first_pass(temp0, temp4);
1338 
1339   // Generate the top row by munging the first set of 8 from each one together.
1340   transpose_8x8(&temp1[0], &temp0[0]);
1341   transpose_8x8(&temp2[0], &temp0[8]);
1342   transpose_8x8(&temp3[0], &temp0[16]);
1343   transpose_8x8(&temp4[0], &temp0[24]);
1344 
1345   dct_body_second_pass(temp0, temp5);
1346 
1347   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1348                     &temp5[5], &temp5[6], &temp5[7]);
1349   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1350                     &temp5[13], &temp5[14], &temp5[15]);
1351   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1352                     &temp5[21], &temp5[22], &temp5[23]);
1353   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1354                     &temp5[29], &temp5[30], &temp5[31]);
1355   store(output, temp5);
1356 
1357   // Second row of 8x32.
1358   transpose_8x8(&temp1[8], &temp0[0]);
1359   transpose_8x8(&temp2[8], &temp0[8]);
1360   transpose_8x8(&temp3[8], &temp0[16]);
1361   transpose_8x8(&temp4[8], &temp0[24]);
1362 
1363   dct_body_second_pass(temp0, temp5);
1364 
1365   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1366                     &temp5[5], &temp5[6], &temp5[7]);
1367   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1368                     &temp5[13], &temp5[14], &temp5[15]);
1369   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1370                     &temp5[21], &temp5[22], &temp5[23]);
1371   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1372                     &temp5[29], &temp5[30], &temp5[31]);
1373   store(output + 8 * 32, temp5);
1374 
1375   // Third row of 8x32.
1376   transpose_8x8(&temp1[16], &temp0[0]);
1377   transpose_8x8(&temp2[16], &temp0[8]);
1378   transpose_8x8(&temp3[16], &temp0[16]);
1379   transpose_8x8(&temp4[16], &temp0[24]);
1380 
1381   dct_body_second_pass(temp0, temp5);
1382 
1383   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1384                     &temp5[5], &temp5[6], &temp5[7]);
1385   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1386                     &temp5[13], &temp5[14], &temp5[15]);
1387   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1388                     &temp5[21], &temp5[22], &temp5[23]);
1389   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1390                     &temp5[29], &temp5[30], &temp5[31]);
1391   store(output + 16 * 32, temp5);
1392 
1393   // Final row of 8x32.
1394   transpose_8x8(&temp1[24], &temp0[0]);
1395   transpose_8x8(&temp2[24], &temp0[8]);
1396   transpose_8x8(&temp3[24], &temp0[16]);
1397   transpose_8x8(&temp4[24], &temp0[24]);
1398 
1399   dct_body_second_pass(temp0, temp5);
1400 
1401   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1402                     &temp5[5], &temp5[6], &temp5[7]);
1403   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1404                     &temp5[13], &temp5[14], &temp5[15]);
1405   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1406                     &temp5[21], &temp5[22], &temp5[23]);
1407   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1408                     &temp5[29], &temp5[30], &temp5[31]);
1409   store(output + 24 * 32, temp5);
1410 }
1411 
1412 void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
1413                            int stride) {
1414   int16x8_t temp0[32];
1415   int16x8_t temp1[32];
1416   int16x8_t temp2[32];
1417   int16x8_t temp3[32];
1418   int16x8_t temp4[32];
1419   int16x8_t temp5[32];
1420 
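  // Same structure as vpx_fdct32x32_neon() above, except that the second pass
  // uses dct_body_second_pass_rd(), which rounds after stage 2 and keeps the
  // arithmetic in 16 bits.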
1421   // Process in 8x32 columns.
1422   load(input, stride, temp0);
1423   dct_body_first_pass(temp0, temp1);
1424 
1425   load(input + 8, stride, temp0);
1426   dct_body_first_pass(temp0, temp2);
1427 
1428   load(input + 16, stride, temp0);
1429   dct_body_first_pass(temp0, temp3);
1430 
1431   load(input + 24, stride, temp0);
1432   dct_body_first_pass(temp0, temp4);
1433 
1434   // Generate the top row by munging the first set of 8 from each one together.
1435   transpose_8x8(&temp1[0], &temp0[0]);
1436   transpose_8x8(&temp2[0], &temp0[8]);
1437   transpose_8x8(&temp3[0], &temp0[16]);
1438   transpose_8x8(&temp4[0], &temp0[24]);
1439 
1440   dct_body_second_pass_rd(temp0, temp5);
1441 
1442   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1443                     &temp5[5], &temp5[6], &temp5[7]);
1444   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1445                     &temp5[13], &temp5[14], &temp5[15]);
1446   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1447                     &temp5[21], &temp5[22], &temp5[23]);
1448   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1449                     &temp5[29], &temp5[30], &temp5[31]);
1450   store(output, temp5);
1451 
1452   // Second row of 8x32.
1453   transpose_8x8(&temp1[8], &temp0[0]);
1454   transpose_8x8(&temp2[8], &temp0[8]);
1455   transpose_8x8(&temp3[8], &temp0[16]);
1456   transpose_8x8(&temp4[8], &temp0[24]);
1457 
1458   dct_body_second_pass_rd(temp0, temp5);
1459 
1460   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1461                     &temp5[5], &temp5[6], &temp5[7]);
1462   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1463                     &temp5[13], &temp5[14], &temp5[15]);
1464   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1465                     &temp5[21], &temp5[22], &temp5[23]);
1466   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1467                     &temp5[29], &temp5[30], &temp5[31]);
1468   store(output + 8 * 32, temp5);
1469 
1470   // Third row of 8x32.
1471   transpose_8x8(&temp1[16], &temp0[0]);
1472   transpose_8x8(&temp2[16], &temp0[8]);
1473   transpose_8x8(&temp3[16], &temp0[16]);
1474   transpose_8x8(&temp4[16], &temp0[24]);
1475 
1476   dct_body_second_pass_rd(temp0, temp5);
1477 
1478   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1479                     &temp5[5], &temp5[6], &temp5[7]);
1480   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1481                     &temp5[13], &temp5[14], &temp5[15]);
1482   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1483                     &temp5[21], &temp5[22], &temp5[23]);
1484   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1485                     &temp5[29], &temp5[30], &temp5[31]);
1486   store(output + 16 * 32, temp5);
1487 
1488   // Final row of 8x32.
1489   transpose_8x8(&temp1[24], &temp0[0]);
1490   transpose_8x8(&temp2[24], &temp0[8]);
1491   transpose_8x8(&temp3[24], &temp0[16]);
1492   transpose_8x8(&temp4[24], &temp0[24]);
1493 
1494   dct_body_second_pass_rd(temp0, temp5);
1495 
1496   transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
1497                     &temp5[5], &temp5[6], &temp5[7]);
1498   transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
1499                     &temp5[13], &temp5[14], &temp5[15]);
1500   transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
1501                     &temp5[21], &temp5[22], &temp5[23]);
1502   transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
1503                     &temp5[29], &temp5[30], &temp5[31]);
1504   store(output + 24 * 32, temp5);
1505 }
1506 #endif  // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
1507         // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
1508