/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"

// Only for the first pass of the _34_ variant. Since it only uses values from
// the top left 8x8 it can safely assume all the remaining values are 0 and
// skip an awful lot of calculations. In fact, only the first 6 columns make
// the cut. None of the elements in the 7th or 8th column are used, so any
// calls involving input[6] or input[7] are skipped as well.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 8x8 to allow using SIMD.

// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
// coefficients as follows:
//    0  1  2  3  4  5  6  7
// 0  0  2  5 10 17 25
// 1  1  4  8 15 22 30
// 2  3  7 12 18 28
// 3  6 11 16 23 31
// 4  9 14 19 29
// 5 13 20 26
// 6 21 27 33
// 7 24 32
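//
// The multiply_*_shift_and_narrow_s16 and *_multiply_shift_and_narrow_s16
// helpers used below are declared in vpx_dsp/arm/idct_neon.h. Each forms the
// 32-bit DCT butterfly product(s), rounds and shifts by DCT_CONST_BITS, and
// narrows the result back to 16 bits.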
void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) {
  int16x8_t in[8], s1[32], s2[32], s3[32];

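  // Load the top-left 8x8 block of coefficients. Rows of the 32x32 input are
  // 32 elements apart, hence the += 32 between loads.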
  in[0] = load_tran_low_to_s16q(input);
  input += 32;
  in[1] = load_tran_low_to_s16q(input);
  input += 32;
  in[2] = load_tran_low_to_s16q(input);
  input += 32;
  in[3] = load_tran_low_to_s16q(input);
  input += 32;
  in[4] = load_tran_low_to_s16q(input);
  input += 32;
  in[5] = load_tran_low_to_s16q(input);
  input += 32;
  in[6] = load_tran_low_to_s16q(input);
  input += 32;
  in[7] = load_tran_low_to_s16q(input);
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1
  // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
  // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);

  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);

  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);

  // stage 3
  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);

  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
                                                    cospi_28_64);
  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
                                                    cospi_4_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
                                                    s1[27], cospi_12_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
                                                    cospi_20_64);

  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
                                                    s1[24], -cospi_20_64);
  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
                                                    s1[24], cospi_12_64);

  // stage 4
  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);

  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                   cospi_24_64);
  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
                                                    cospi_8_64);

  s2[20] = vsubq_s16(s1[23], s1[20]);
  s2[21] = vsubq_s16(s1[22], s1[21]);
  s2[22] = vaddq_s16(s1[21], s1[22]);
  s2[23] = vaddq_s16(s1[20], s1[23]);
  s2[24] = vaddq_s16(s1[24], s1[27]);
  s2[25] = vaddq_s16(s1[25], s1[26]);
  s2[26] = vsubq_s16(s1[25], s1[26]);
  s2[27] = vsubq_s16(s1[24], s1[27]);

  // stage 5
  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);

  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30],
                                                    cospi_24_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30],
                                                    cospi_8_64);

  s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31],
                                                    cospi_24_64);
  s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31],
                                                    cospi_8_64);

  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
                                                    s2[27], -cospi_8_64);
  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
                                                    cospi_24_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
                                                    s2[26], -cospi_8_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
                                                    cospi_24_64);

  // stage 6
  s2[0] = vaddq_s16(s1[0], s1[7]);
  s2[1] = vaddq_s16(s1[0], s1[6]);
  s2[2] = vaddq_s16(s1[0], s1[5]);
  s2[3] = vaddq_s16(s1[0], s1[4]);
  s2[4] = vsubq_s16(s1[0], s1[4]);
  s2[5] = vsubq_s16(s1[0], s1[5]);
  s2[6] = vsubq_s16(s1[0], s1[6]);
  s2[7] = vsubq_s16(s1[0], s1[7]);

  s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64);
  s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64);

  s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64);
  s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64);

  s2[16] = vaddq_s16(s1[16], s2[23]);
  s2[17] = vaddq_s16(s1[17], s2[22]);
  s2[18] = vaddq_s16(s1[18], s1[21]);
  s2[19] = vaddq_s16(s1[19], s1[20]);
  s2[20] = vsubq_s16(s1[19], s1[20]);
  s2[21] = vsubq_s16(s1[18], s1[21]);
  s2[22] = vsubq_s16(s1[17], s2[22]);
  s2[23] = vsubq_s16(s1[16], s2[23]);

  s3[24] = vsubq_s16(s1[31], s2[24]);
  s3[25] = vsubq_s16(s1[30], s2[25]);
  s3[26] = vsubq_s16(s1[29], s1[26]);
  s3[27] = vsubq_s16(s1[28], s1[27]);
  s2[28] = vaddq_s16(s1[27], s1[28]);
  s2[29] = vaddq_s16(s1[26], s1[29]);
  s2[30] = vaddq_s16(s2[25], s1[30]);
  s2[31] = vaddq_s16(s2[24], s1[31]);

  // stage 7
  s1[0] = vaddq_s16(s2[0], s2[15]);
  s1[1] = vaddq_s16(s2[1], s2[14]);
  s1[2] = vaddq_s16(s2[2], s2[13]);
  s1[3] = vaddq_s16(s2[3], s2[12]);
  s1[4] = vaddq_s16(s2[4], s2[11]);
  s1[5] = vaddq_s16(s2[5], s2[10]);
  s1[6] = vaddq_s16(s2[6], s2[9]);
  s1[7] = vaddq_s16(s2[7], s2[8]);
  s1[8] = vsubq_s16(s2[7], s2[8]);
  s1[9] = vsubq_s16(s2[6], s2[9]);
  s1[10] = vsubq_s16(s2[5], s2[10]);
  s1[11] = vsubq_s16(s2[4], s2[11]);
  s1[12] = vsubq_s16(s2[3], s2[12]);
  s1[13] = vsubq_s16(s2[2], s2[13]);
  s1[14] = vsubq_s16(s2[1], s2[14]);
  s1[15] = vsubq_s16(s2[0], s2[15]);

  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);

  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);

  s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64);
  s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64);

  s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64);
  s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64);

  // final stage
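  // Store the 32 8-wide vectors of the first-pass result contiguously; the
  // second pass reloads and transposes them in 8x8 blocks.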
  vst1q_s16(output, vaddq_s16(s1[0], s2[31]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[1], s2[30]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[2], s2[29]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[3], s2[28]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[4], s1[27]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[5], s1[26]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[6], s1[25]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[7], s1[24]));
  output += 8;

  vst1q_s16(output, vaddq_s16(s1[8], s1[23]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[9], s1[22]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[10], s1[21]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[11], s1[20]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[12], s2[19]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[13], s2[18]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[14], s2[17]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[15], s2[16]));
  output += 8;

  vst1q_s16(output, vsubq_s16(s1[15], s2[16]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[14], s2[17]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[13], s2[18]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[12], s2[19]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[11], s1[20]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[10], s1[21]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[9], s1[22]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[8], s1[23]));
  output += 8;

  vst1q_s16(output, vsubq_s16(s1[7], s1[24]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[6], s1[25]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[5], s1[26]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[4], s1[27]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[3], s2[28]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[2], s2[29]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[1], s2[30]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
}

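// Second pass for the _34_ variant: transforms one group of 8 columns and
// adds the result to the destination. The row pass leaves only 8 non-zero
// rows, so the column transform starts from just 8 input vectors; the
// remaining 24 rows are implicitly zero. |output| points to uint8_t pixels,
// or to uint16_t pixels when highbd_flag is set.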
void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                       const int highbd_flag) {
  int16x8_t in[8], s1[32], s2[32], s3[32], out[32];

  load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
                             &in[5], &in[6], &in[7]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);

  // Different for _8_
  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);

  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);

  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);

  // stage 3
  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);

  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
                                                    cospi_28_64);
  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
                                                    cospi_4_64);

  // Different for _8_
  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64,
                                                    s1[28], -cospi_4_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28],
                                                    cospi_28_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
                                                    s1[27], cospi_12_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
                                                    cospi_20_64);

  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
                                                    s1[24], -cospi_20_64);
  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
                                                    s1[24], cospi_12_64);

  // stage 4
  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);

  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                   cospi_24_64);
  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
                                                    cospi_8_64);

  s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64,
                                                    s2[12], -cospi_8_64);
  s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12],
                                                    cospi_24_64);

  s2[16] = vaddq_s16(s1[16], s1[19]);

  s2[17] = vaddq_s16(s1[17], s1[18]);
  s2[18] = vsubq_s16(s1[17], s1[18]);

  s2[19] = vsubq_s16(s1[16], s1[19]);

  s2[20] = vsubq_s16(s1[23], s1[20]);
  s2[21] = vsubq_s16(s1[22], s1[21]);

  s2[22] = vaddq_s16(s1[21], s1[22]);
  s2[23] = vaddq_s16(s1[20], s1[23]);

  s2[24] = vaddq_s16(s1[24], s1[27]);
  s2[25] = vaddq_s16(s1[25], s1[26]);
  s2[26] = vsubq_s16(s1[25], s1[26]);
  s2[27] = vsubq_s16(s1[24], s1[27]);

  s2[28] = vsubq_s16(s1[31], s1[28]);
  s2[29] = vsubq_s16(s1[30], s1[29]);
  s2[30] = vaddq_s16(s1[29], s1[30]);
  s2[31] = vaddq_s16(s1[28], s1[31]);

  // stage 5
  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);

  s1[8] = vaddq_s16(s2[8], s2[11]);
  s1[9] = vaddq_s16(s2[9], s2[10]);
  s1[10] = vsubq_s16(s2[9], s2[10]);
  s1[11] = vsubq_s16(s2[8], s2[11]);
  s1[12] = vsubq_s16(s2[15], s2[12]);
  s1[13] = vsubq_s16(s2[14], s2[13]);
  s1[14] = vaddq_s16(s2[13], s2[14]);
  s1[15] = vaddq_s16(s2[12], s2[15]);

  s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29],
                                                    cospi_24_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29],
                                                    cospi_8_64);

  s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28],
                                                    cospi_24_64);
  s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28],
                                                    cospi_8_64);

  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
                                                    s2[27], -cospi_8_64);
  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
                                                    cospi_24_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
                                                    s2[26], -cospi_8_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
                                                    cospi_24_64);

  // stage 6
  s2[0] = vaddq_s16(s1[0], s1[7]);
  s2[1] = vaddq_s16(s1[0], s1[6]);
  s2[2] = vaddq_s16(s1[0], s1[5]);
  s2[3] = vaddq_s16(s1[0], s1[4]);
  s2[4] = vsubq_s16(s1[0], s1[4]);
  s2[5] = vsubq_s16(s1[0], s1[5]);
  s2[6] = vsubq_s16(s1[0], s1[6]);
  s2[7] = vsubq_s16(s1[0], s1[7]);

  s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64);
  s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64);

  s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64);
  s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64);

  s1[16] = vaddq_s16(s2[16], s2[23]);
  s1[17] = vaddq_s16(s2[17], s2[22]);
  s2[18] = vaddq_s16(s1[18], s1[21]);
  s2[19] = vaddq_s16(s1[19], s1[20]);
  s2[20] = vsubq_s16(s1[19], s1[20]);
  s2[21] = vsubq_s16(s1[18], s1[21]);
  s1[22] = vsubq_s16(s2[17], s2[22]);
  s1[23] = vsubq_s16(s2[16], s2[23]);

  s3[24] = vsubq_s16(s2[31], s2[24]);
  s3[25] = vsubq_s16(s2[30], s2[25]);
  s3[26] = vsubq_s16(s1[29], s1[26]);
  s3[27] = vsubq_s16(s1[28], s1[27]);
  s2[28] = vaddq_s16(s1[27], s1[28]);
  s2[29] = vaddq_s16(s1[26], s1[29]);
  s2[30] = vaddq_s16(s2[25], s2[30]);
  s2[31] = vaddq_s16(s2[24], s2[31]);

  // stage 7
  s1[0] = vaddq_s16(s2[0], s1[15]);
  s1[1] = vaddq_s16(s2[1], s1[14]);
  s1[2] = vaddq_s16(s2[2], s2[13]);
  s1[3] = vaddq_s16(s2[3], s2[12]);
  s1[4] = vaddq_s16(s2[4], s2[11]);
  s1[5] = vaddq_s16(s2[5], s2[10]);
  s1[6] = vaddq_s16(s2[6], s1[9]);
  s1[7] = vaddq_s16(s2[7], s1[8]);
  s1[8] = vsubq_s16(s2[7], s1[8]);
  s1[9] = vsubq_s16(s2[6], s1[9]);
  s1[10] = vsubq_s16(s2[5], s2[10]);
  s1[11] = vsubq_s16(s2[4], s2[11]);
  s1[12] = vsubq_s16(s2[3], s2[12]);
  s1[13] = vsubq_s16(s2[2], s2[13]);
  s1[14] = vsubq_s16(s2[1], s1[14]);
  s1[15] = vsubq_s16(s2[0], s1[15]);

  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);

  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);

  s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64);
  s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64);

  s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64);
  s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);

  // final stage
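  // final_add()/final_sub() (idct_neon.h) combine the symmetric halves of the
  // last butterfly stage; when high bitdepth support is enabled they saturate
  // to avoid overflowing int16_t.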
  out[0] = final_add(s1[0], s2[31]);
  out[1] = final_add(s1[1], s2[30]);
  out[2] = final_add(s1[2], s2[29]);
  out[3] = final_add(s1[3], s2[28]);
  out[4] = final_add(s1[4], s1[27]);
  out[5] = final_add(s1[5], s1[26]);
  out[6] = final_add(s1[6], s1[25]);
  out[7] = final_add(s1[7], s1[24]);
  out[8] = final_add(s1[8], s2[23]);
  out[9] = final_add(s1[9], s2[22]);
  out[10] = final_add(s1[10], s1[21]);
  out[11] = final_add(s1[11], s1[20]);
  out[12] = final_add(s1[12], s2[19]);
  out[13] = final_add(s1[13], s2[18]);
  out[14] = final_add(s1[14], s1[17]);
  out[15] = final_add(s1[15], s1[16]);
  out[16] = final_sub(s1[15], s1[16]);
  out[17] = final_sub(s1[14], s1[17]);
  out[18] = final_sub(s1[13], s2[18]);
  out[19] = final_sub(s1[12], s2[19]);
  out[20] = final_sub(s1[11], s1[20]);
  out[21] = final_sub(s1[10], s1[21]);
  out[22] = final_sub(s1[9], s2[22]);
  out[23] = final_sub(s1[8], s2[23]);
  out[24] = final_sub(s1[7], s1[24]);
  out[25] = final_sub(s1[6], s1[25]);
  out[26] = final_sub(s1[5], s1[26]);
  out[27] = final_sub(s1[4], s1[27]);
  out[28] = final_sub(s1[3], s2[28]);
  out[29] = final_sub(s1[2], s2[29]);
  out[30] = final_sub(s1[1], s2[30]);
  out[31] = final_sub(s1[0], s2[31]);

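  // Add the residual to the destination. The high-bitdepth path stores
  // uint16_t pixels; the 8-bit path stores uint8_t pixels, 8 rows per call to
  // add_and_store_u8_s16().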
  if (highbd_flag) {
    highbd_add_and_store_bd8(out, output, stride);
  } else {
    uint8_t *const outputT = (uint8_t *)output;
    add_and_store_u8_s16(out + 0, outputT, stride);
    add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
    add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
    add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
  }
}

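// Two-pass inverse transform for 32x32 blocks with at most 34 non-zero
// coefficients: one row pass over the top-left 8x8 of the input, then four
// column passes of 8 columns each, added directly to |dest|.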
void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int i;
  int16_t temp[32 * 8];
  int16_t *t = temp;

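  // Pass 1: rows. Only the top-left 8x8 block of |input| is non-zero; the
  // result is stored as 32 contiguous 8-value groups in |temp|.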
  vpx_idct32_6_neon(input, t);

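  // Pass 2: columns, 8 at a time across the 32-wide block.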
  for (i = 0; i < 32; i += 8) {
    vpx_idct32_8_neon(t, dest, stride, 0);
    t += (8 * 8);
    dest += 8;
  }
}