1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/idct_neon.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_dsp/txfm_common.h"
17 
// Brings two 32-bit intermediate vectors back down to 16 bits.
// t32[0] and t32[1] are each shifted right by DCT_CONST_BITS with rounding and
// narrowed (vrshrn_n_s32); the results are stored to *d0 and *d1 respectively.
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                int16x4_t *const d1) {
  int32x4_t wide;

  wide = t32[0];
  *d0 = vrshrn_n_s32(wide, DCT_CONST_BITS);
  wide = t32[1];
  *d1 = vrshrn_n_s32(wide, DCT_CONST_BITS);
}
23 
// 32-bit core shared by idct_cospi_8_24_d() and idct_cospi_8_24_neg_d().
// With a = lane 3 and b = lane 1 of cospi_0_8_16_24, it computes per element
//   t32[0] = s0 * a - s1 * b
//   t32[1] = s1 * a + s0 * b
// using widening multiplies, so the results are kept as 32-bit values.
static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
                                            const int16x4_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  const int32x4_t s0_by_lane3 = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
  const int32x4_t s1_by_lane3 = vmull_lane_s16(s1, cospi_0_8_16_24, 3);

  t32[0] = vmlsl_lane_s16(s0_by_lane3, s1, cospi_0_8_16_24, 1);
  t32[1] = vmlal_lane_s16(s1_by_lane3, s0, cospi_0_8_16_24, 1);
}
33 
// Four-lane rotation built on idct_cospi_8_24_d_kernel(): the two 32-bit
// kernel results are rounded and narrowed by wrap_low_4x2() into *d0 and *d1.
static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x4_t *const d0, int16x4_t *const d1) {
  int32x4_t wide[2];

  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, wide);
  wrap_low_4x2(wide, d0, d1);
}
42 
// Same as idct_cospi_8_24_d(), except that the second 32-bit kernel result is
// negated before rounding and narrowing, so *d1 receives the negated value.
static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x4_t *const d0,
                                         int16x4_t *const d1) {
  int32x4_t wide[2];

  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, wide);
  wide[1] = vnegq_s32(wide[1]);
  wrap_low_4x2(wide, d0, d1);
}
53 
// Four-lane butterfly scaled by lane 2 of cospi_0_8_16_24 (call it c):
//   *d0 = round_shift(s1 * c - s0 * c)
//   *d1 = round_shift(s1 * c + s0 * c)
// The product s1 * c is formed once at 32 bits and reused for both outputs;
// wrap_low_4x2() performs the rounding shift and narrowing.
static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x4_t *const d0,
                                      int16x4_t *const d1) {
  const int32x4_t s1_scaled = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
  int32x4_t wide[2];

  wide[0] = vmlsl_lane_s16(s1_scaled, s0, cospi_0_8_16_24, 2);
  wide[1] = vmlal_lane_s16(s1_scaled, s0, cospi_0_8_16_24, 2);
  wrap_low_4x2(wide, d0, d1);
}
65 
// Runs one 1-D pass of the 16x16 inverse DCT on 16 vectors of eight int16
// lanes (128 coefficients). The callers in this file invoke it once per group
// of 8 rows in pass 1 and once per group of 8 columns in pass 2.
//
//   input        128 contiguous coefficients. Interpreted as tran_low_t when
//                |output| is non-NULL and as int16_t when |output| is NULL.
//   output       Non-NULL: the stage-7 result is handed to
//                idct16x16_store_pass1() and |dest|, |stride| and
//                |highbd_flag| are not used.
//   dest, stride Only used when |output| is NULL; forwarded to
//                idct16x16_add_store() or idct16x16_add_store_bd8().
//   highbd_flag  With |output| NULL, non-zero selects
//                idct16x16_add_store_bd8() instead of idct16x16_add_store().
void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag) {
  // Split the 16 constants loaded from kCospi into four 4-lane groups so they
  // can be passed to the lane-indexed helpers below.
  // NOTE(review): the variable names presumably list the cospi index held in
  // each lane, with an "N" suffix marking a negated entry - confirm against
  // the definition of kCospi.
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
  const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
  const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
  int16x8_t in[16], step1[16], step2[16], out[16];

  // Load input (16x8).
  // Successive groups of 8 coefficients alternate between in[k] and in[k + 8],
  // so in[0..7] receive the 1st, 3rd, 5th, ... group and in[8..15] the 2nd,
  // 4th, 6th, ... group. A non-NULL |output| marks pass 1, where the source
  // is tran_low_t and is converted to int16 while loading; otherwise the
  // source is already int16_t.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[8] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[9] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[10] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[11] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[12] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[13] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[14] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[7] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[15] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 8;
    in[8] = vld1q_s16(inputT);
    inputT += 8;
    in[1] = vld1q_s16(inputT);
    inputT += 8;
    in[9] = vld1q_s16(inputT);
    inputT += 8;
    in[2] = vld1q_s16(inputT);
    inputT += 8;
    in[10] = vld1q_s16(inputT);
    inputT += 8;
    in[3] = vld1q_s16(inputT);
    inputT += 8;
    in[11] = vld1q_s16(inputT);
    inputT += 8;
    in[4] = vld1q_s16(inputT);
    inputT += 8;
    in[12] = vld1q_s16(inputT);
    inputT += 8;
    in[5] = vld1q_s16(inputT);
    inputT += 8;
    in[13] = vld1q_s16(inputT);
    inputT += 8;
    in[6] = vld1q_s16(inputT);
    inputT += 8;
    in[14] = vld1q_s16(inputT);
    inputT += 8;
    in[7] = vld1q_s16(inputT);
    inputT += 8;
    in[15] = vld1q_s16(inputT);
  }

  // Transpose each group of eight vectors separately.
  // NOTE(review): presumably this leaves in[j] holding coefficient j of all
  // eight lines being transformed - confirm against transpose_s16_8x8().
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

  // stage 1
  // Permute the inputs into the order consumed by the butterfly stages. Each
  // index is spelled as (value / 2); these are integer constant expressions,
  // e.g. in[16 / 2] is in[8].
  step1[0] = in[0 / 2];
  step1[1] = in[16 / 2];
  step1[2] = in[8 / 2];
  step1[3] = in[24 / 2];
  step1[4] = in[4 / 2];
  step1[5] = in[20 / 2];
  step1[6] = in[12 / 2];
  step1[7] = in[28 / 2];
  step1[8] = in[2 / 2];
  step1[9] = in[18 / 2];
  step1[10] = in[10 / 2];
  step1[11] = in[26 / 2];
  step1[12] = in[6 / 2];
  step1[13] = in[22 / 2];
  step1[14] = in[14 / 2];
  step1[15] = in[30 / 2];

  // stage 2
  // Entries 0-7 pass through unchanged. The pairs (8,15), (9,14), (10,13) and
  // (11,12) go through the idct_cospi_* helpers, which are defined outside
  // this file.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
  idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
                   &step2[14]);
  idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
                   &step2[13]);
  idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
                  &step2[12]);

  // stage 3
  // Entries 0-3 pass through, the pairs (4,7) and (5,6) go through helpers,
  // and entries 8-15 are combined with plain adds and subtracts.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
  idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  // Note the argument order of the helper calls: the first one takes
  // (step1[1], step1[0]) and writes (&step2[1], &step2[0]); the last two read
  // (14, 9) and (13, 10) and write (9, 14) and (13, 10).
  idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
  idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  // Add/subtract butterflies on entries 0-3 and 8-15; entries 4 and 7 pass
  // through and the pair (5,6) goes through idct_cospi_16_16_q().
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  // Entries 0-7 are combined as step1[i] +/- step1[7 - i]; the pairs (10,13)
  // and (11,12) go through idct_cospi_16_16_q(); 8, 9, 14 and 15 pass through.
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // idct16x16_add_stage7() is defined outside this file; it fills out[] from
  // step2[]. NOTE(review): presumably the same final butterfly that
  // vpx_idct16x16_10_add_half1d_pass1() spells out explicitly - confirm.
  idct16x16_add_stage7(step2, out);

  // Pass 1 (non-NULL |output|) keeps the intermediate result in |output|;
  // otherwise the result is passed on together with |dest| and |stride|.
  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
267 
// Reduced 1-D pass of the 16x16 inverse DCT that reads only an 8x8 group of
// coefficients (eight vectors of eight lanes). Stage 1 populates only the
// even-numbered step1[] entries; in stages 2-4, single-input
// vqrdmulhq_lane_s16() multiplies take the place of the two-input
// idct_cospi_* helper calls that vpx_idct16x16_256_add_half1d() uses for
// pairs whose second input is never loaded here.
//
//   input        Eight groups of 8 coefficients, each group starting 16
//                elements after the previous one. Interpreted as tran_low_t
//                when |output| is non-NULL and as int16_t when it is NULL.
//   output       Non-NULL: the stage-7 result is handed to
//                idct16x16_store_pass1() and |dest|, |stride| and
//                |highbd_flag| are not used.
//   dest, stride Only used when |output| is NULL; forwarded to
//                idct16x16_add_store() or idct16x16_add_store_bd8().
//   highbd_flag  With |output| NULL, non-zero selects
//                idct16x16_add_store_bd8() instead of idct16x16_add_store().
void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
                                 void *const dest, const int stride,
                                 const int highbd_flag) {
  // cospisd0/cospisd1 hold every constant doubled (x + x) for use with
  // vqrdmulhq_lane_s16().
  // NOTE(review): vqrdmulh returns the rounded high half of the doubled
  // product, so the extra doubling presumably makes the result match a
  // rounding shift by DCT_CONST_BITS (assuming that is 14) - confirm.
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x8_t in[8], step1[16], step2[16], out[16];

  // Load input (8x8).
  // The pointer advances by 16 elements between loads, so only the first 8
  // elements of each 16-element stretch are read. A non-NULL |output| marks
  // pass 1, where the source is tran_low_t; otherwise it is int16_t.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[7] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 16;
    in[1] = vld1q_s16(inputT);
    inputT += 16;
    in[2] = vld1q_s16(inputT);
    inputT += 16;
    in[3] = vld1q_s16(inputT);
    inputT += 16;
    in[4] = vld1q_s16(inputT);
    inputT += 16;
    in[5] = vld1q_s16(inputT);
    inputT += 16;
    in[6] = vld1q_s16(inputT);
    inputT += 16;
    in[7] = vld1q_s16(inputT);
  }

  // Transpose the single 8x8 group.
  // NOTE(review): presumably this leaves in[j] holding coefficient j of all
  // eight lines being transformed - confirm against transpose_s16_8x8().
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1
  // Only the even-numbered step1[] entries are assigned; the odd-numbered
  // ones are never written and never read below. Indices are spelled as
  // (value / 2), e.g. in[12 / 2] is in[6].
  step1[0] = in[0 / 2];
  step1[2] = in[8 / 2];
  step1[4] = in[4 / 2];
  step1[6] = in[12 / 2];
  step1[8] = in[2 / 2];
  step1[10] = in[10 / 2];
  step1[12] = in[6 / 2];
  step1[14] = in[14 / 2];  // 0 in pass 1

  // stage 2
  // Each of step1[8], [10], [12] and [14] is multiplied by two different
  // lanes of the doubled constants, producing both outputs of its pair:
  // (8,15), (10,13), (11,12) and (9,14).
  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[4] = step1[4];
  step2[6] = step1[6];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
  step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
  step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  // step2[4] yields step1[4] and step1[7]; step2[6] yields step1[5] and
  // step1[6]. Entries 8-15 use the same adds and subtracts as the full
  // transform.
  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
  step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  // step2[0] and step2[1] receive the same value, derived from step1[0]
  // alone; step2[2] and step2[3] are both derived from step1[2] alone.
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
  step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  // From here on the statements are the same as stages 5-7 of
  // vpx_idct16x16_256_add_half1d().
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // idct16x16_add_stage7() is defined outside this file; it fills out[] from
  // step2[].
  idct16x16_add_stage7(step2, out);

  // Pass 1 (non-NULL |output|) keeps the intermediate result in |output|;
  // otherwise the result is passed on together with |dest| and |stride|.
  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
428 
// Pass 1 of the 16x16 inverse DCT for the case where only a 4x4 group of
// coefficients is read. Works on four-lane int16x4_t vectors. Only
// step1[0], [4], [8] and [12] are populated in stage 1, so most of the
// butterflies of the larger variants reduce to copies here.
//
//   input   Four groups of 4 tran_low_t coefficients are read, each group
//           starting 16 elements after the previous one.
//   output  Receives the 16 result vectors of 4 int16_t each, stored back to
//           back (64 values in total).
void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output) {
  // cospisd0/cospisd1 hold every constant doubled (x + x) for use with
  // vqrdmulh_lane_s16().
  // NOTE(review): vqrdmulh returns the rounded high half of the doubled
  // product, so the extra doubling presumably makes the result match a
  // rounding shift by DCT_CONST_BITS (assuming that is 14) - confirm.
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t in[4], step1[16], step2[16], out[16];

  // Load input (4x4): 4 coefficients from each of 4 positions 16 apart.
  in[0] = load_tran_low_to_s16d(input);
  input += 16;
  in[1] = load_tran_low_to_s16d(input);
  input += 16;
  in[2] = load_tran_low_to_s16d(input);
  input += 16;
  in[3] = load_tran_low_to_s16d(input);

  // Transpose
  // NOTE(review): presumably leaves in[j] holding coefficient j of the four
  // lines being transformed - confirm against transpose_s16_4x4d().
  transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);

  // stage 1
  // Indices are spelled as (value / 2): in[0], in[2], in[1], in[3].
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2
  // step1[8] yields step2[8] and step2[15]; step1[12] yields step2[11] and
  // step2[12]. step2[9], [10], [13] and [14] are not produced.
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  // Compared with stage 3 of vpx_idct16x16_38_add_half1d(), every add or
  // subtract for entries 8-15 has one term that is not populated here, so
  // each pair becomes two copies of the populated term.
  step1[0] = step2[0];
  step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  // step2[0] and step2[1] receive the same value; entries 4-7 are copies of
  // step1[4] and step1[7] because step1[5] and step1[6] are not populated.
  step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  // Entries 0-3 are copies of step2[0] and step2[1] (step2[2] and step2[3]
  // are not populated); the rest matches the larger variants, using the
  // four-lane intrinsics and helpers.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vadd_s16(step2[8], step2[11]);
  step1[9] = vadd_s16(step2[9], step2[10]);
  step1[10] = vsub_s16(step2[9], step2[10]);
  step1[11] = vsub_s16(step2[8], step2[11]);
  step1[12] = vsub_s16(step2[15], step2[12]);
  step1[13] = vsub_s16(step2[14], step2[13]);
  step1[14] = vadd_s16(step2[14], step2[13]);
  step1[15] = vadd_s16(step2[15], step2[12]);

  // stage 6
  // Entries 0-7 are combined as step1[i] +/- step1[7 - i]; the pairs (10,13)
  // and (11,12) go through idct_cospi_16_16_d(); 8, 9, 14 and 15 pass through.
  step2[0] = vadd_s16(step1[0], step1[7]);
  step2[1] = vadd_s16(step1[1], step1[6]);
  step2[2] = vadd_s16(step1[2], step1[5]);
  step2[3] = vadd_s16(step1[3], step1[4]);
  step2[4] = vsub_s16(step1[3], step1[4]);
  step2[5] = vsub_s16(step1[2], step1[5]);
  step2[6] = vsub_s16(step1[1], step1[6]);
  step2[7] = vsub_s16(step1[0], step1[7]);
  idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // Final butterfly: out[i] = step2[i] + step2[15 - i] for i = 0..7 and
  // out[15 - i] = step2[i] - step2[15 - i] for the mirrored half.
  out[0] = vadd_s16(step2[0], step2[15]);
  out[1] = vadd_s16(step2[1], step2[14]);
  out[2] = vadd_s16(step2[2], step2[13]);
  out[3] = vadd_s16(step2[3], step2[12]);
  out[4] = vadd_s16(step2[4], step2[11]);
  out[5] = vadd_s16(step2[5], step2[10]);
  out[6] = vadd_s16(step2[6], step2[9]);
  out[7] = vadd_s16(step2[7], step2[8]);
  out[8] = vsub_s16(step2[7], step2[8]);
  out[9] = vsub_s16(step2[6], step2[9]);
  out[10] = vsub_s16(step2[5], step2[10]);
  out[11] = vsub_s16(step2[4], step2[11]);
  out[12] = vsub_s16(step2[3], step2[12]);
  out[13] = vsub_s16(step2[2], step2[13]);
  out[14] = vsub_s16(step2[1], step2[14]);
  out[15] = vsub_s16(step2[0], step2[15]);

  // pass 1: save the result into output, 4 values per vector, back to back.
  vst1_s16(output, out[0]);
  output += 4;
  vst1_s16(output, out[1]);
  output += 4;
  vst1_s16(output, out[2]);
  output += 4;
  vst1_s16(output, out[3]);
  output += 4;
  vst1_s16(output, out[4]);
  output += 4;
  vst1_s16(output, out[5]);
  output += 4;
  vst1_s16(output, out[6]);
  output += 4;
  vst1_s16(output, out[7]);
  output += 4;
  vst1_s16(output, out[8]);
  output += 4;
  vst1_s16(output, out[9]);
  output += 4;
  vst1_s16(output, out[10]);
  output += 4;
  vst1_s16(output, out[11]);
  output += 4;
  vst1_s16(output, out[12]);
  output += 4;
  vst1_s16(output, out[13]);
  output += 4;
  vst1_s16(output, out[14]);
  output += 4;
  vst1_s16(output, out[15]);
}
582 
// Pass 2 counterpart of vpx_idct16x16_10_add_half1d_pass1(). Reads eight
// vectors of 4 int16_t, transposes them into four eight-lane vectors and runs
// the same reduced butterfly network on int16x8_t.
//
//   input        32 contiguous int16_t values (eight groups of 4).
//   output       Non-NULL: the stage-7 result is handed to
//                idct16x16_store_pass1() and |dest|, |stride| and
//                |highbd_flag| are not used.
//   dest, stride Only used when |output| is NULL; forwarded to
//                idct16x16_add_store() or idct16x16_add_store_bd8().
//   highbd_flag  With |output| NULL, non-zero selects
//                idct16x16_add_store_bd8() instead of idct16x16_add_store().
void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride,
                                       const int highbd_flag) {
  // cospisd0/cospisd1 hold every constant doubled (x + x) for use with
  // vqrdmulhq_lane_s16().
  // NOTE(review): vqrdmulh returns the rounded high half of the doubled
  // product, so the extra doubling presumably makes the result match a
  // rounding shift by DCT_CONST_BITS (assuming that is 14) - confirm.
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t ind[8];
  int16x8_t in[4], step1[16], step2[16], out[16];

  // Load input (4x8): eight back-to-back groups of 4 values.
  ind[0] = vld1_s16(input);
  input += 4;
  ind[1] = vld1_s16(input);
  input += 4;
  ind[2] = vld1_s16(input);
  input += 4;
  ind[3] = vld1_s16(input);
  input += 4;
  ind[4] = vld1_s16(input);
  input += 4;
  ind[5] = vld1_s16(input);
  input += 4;
  ind[6] = vld1_s16(input);
  input += 4;
  ind[7] = vld1_s16(input);

  // Transpose the eight four-lane vectors into four eight-lane vectors.
  // NOTE(review): presumably in[j] then holds coefficient j of the eight
  // lines being transformed - confirm against transpose_s16_4x8().
  transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
                    ind[7], &in[0], &in[1], &in[2], &in[3]);

  // stage 1
  // Indices are spelled as (value / 2): in[0], in[2], in[1], in[3].
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2
  // step1[8] yields step2[8] and step2[15]; step1[12] yields step2[11] and
  // step2[12]. step2[9], [10], [13] and [14] are not produced.
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  // Compared with stage 3 of vpx_idct16x16_38_add_half1d(), every add or
  // subtract for entries 8-15 has one term that is not populated here, so
  // each pair becomes two copies of the populated term.
  step1[0] = step2[0];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  // step2[0] and step2[1] receive the same value; entries 4-7 are copies of
  // step1[4] and step1[7] because step1[5] and step1[6] are not populated.
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  // Entries 0-3 are copies of step2[0] and step2[1] (step2[2] and step2[3]
  // are not populated); the rest matches the larger variants.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  // Entries 0-7 are combined as step1[i] +/- step1[7 - i]; the pairs (10,13)
  // and (11,12) go through idct_cospi_16_16_q(); 8, 9, 14 and 15 pass through.
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  // idct16x16_add_stage7() is defined outside this file; it fills out[] from
  // step2[]. NOTE(review): presumably the eight-lane form of the explicit
  // stage 7 in vpx_idct16x16_10_add_half1d_pass1() - confirm.
  idct16x16_add_stage7(step2, out);

  // A non-NULL |output| keeps the result as an intermediate; otherwise the
  // result is passed on together with |dest| and |stride|.
  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
710 
// Full 16x16 inverse DCT entry point: two pass-1 calls fill a 16x16 int16
// scratch buffer, then two pass-2 calls (output == NULL) consume it and are
// given |dest| and |dest| + 8 together with |stride|. highbd_flag is always 0.
void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  int16_t row_idct_output[16 * 16];
  const tran_low_t *const lower_rows_in = input + 8 * 16;
  int16_t *const lower_rows_out = row_idct_output + 8;
  const int16_t *const right_cols_in = row_idct_output + 16 * 8;
  uint8_t *const right_cols_dest = dest + 8;

  // Pass 1: rows. The upper 8 rows are transformed first, then the lower 8.
  vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
  vpx_idct16x16_256_add_half1d(lower_rows_in, lower_rows_out, dest, stride, 0);

  // Pass 2: columns. The left 8 columns are produced first, then the right 8.
  vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
  vpx_idct16x16_256_add_half1d(right_cols_in, NULL, right_cols_dest, stride,
                               0);
}
731 
// 16x16 inverse DCT entry point built on vpx_idct16x16_38_add_half1d(), which
// reads only an 8x8 group of coefficients per call. A single pass-1 call
// writes the int16 scratch buffer; two pass-2 calls (output == NULL) consume
// it and are given |dest| and |dest| + 8. highbd_flag is always 0.
void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int16_t row_idct_output[16 * 16];
  const int16_t *const right_cols_in = row_idct_output + 16 * 8;
  uint8_t *const right_cols_dest = dest + 8;

  // Pass 1: rows. Only the upper 8 rows are transformed.
  vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);

  // Pass 2: columns. The left 8 columns are produced first, then the right 8.
  vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
  vpx_idct16x16_38_add_half1d(right_cols_in, NULL, right_cols_dest, stride, 0);
}
748 
// 16x16 inverse DCT entry point for the case handled by the *_10_* helpers,
// where pass 1 reads only a 4x4 group of coefficients. Pass 1 writes 16
// vectors of 4 int16 values (4 * 16 in total) to a scratch buffer; each
// pass-2 call consumes half of that buffer (4 * 8 values) and is given
// |dest| or |dest| + 8 together with |stride|. highbd_flag is always 0.
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int16_t row_idct_output[4 * 16];
  const int16_t *const right_cols_in = row_idct_output + 4 * 8;
  uint8_t *const right_cols_dest = dest + 8;

  // Pass 1: rows. Transforms the 4 rows that are loaded.
  vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);

  // Pass 2: columns. The left 8 columns are produced first, then the right 8.
  vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
  vpx_idct16x16_10_add_half1d_pass2(right_cols_in, NULL, right_cols_dest,
                                    stride, 0);
}
765