/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"

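// Adds the constant (DC) residual |res| to one row of eight pixels at *dest,
// clamps the result to [0, (1 << bd) - 1] (vminq_s16 for the upper bound,
// vqshluq_n_s16 for the lower bound while converting back to unsigned),
// stores it, and advances *dest by |stride|.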
static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
                                               const int stride,
                                               const int16x8_t res,
                                               const int16x8_t max) {
  const uint16x8_t a = vld1q_u16(*dest);
  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
  const int16x8_t c = vminq_s16(b, max);
  const uint16x8_t d = vqshluq_n_s16(c, 0);
  vst1q_u16(*dest, d);
  *dest += stride;
}

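// DC-only inverse transform: the single non-zero coefficient is scaled twice
// by cospi_16_64, rounded by 2^5, broadcast, and added to all eight rows.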
void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  const tran_low_t out0 =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  const tran_low_t out1 =
      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
  const int16x8_t dc = vdupq_n_s16(a1);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
}

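// One 1-D half pass of the 8x8 IDCT for bd == 10 when only the upper-left
// 4x4 coefficients are non-zero (eob <= 12). At this depth the intermediate
// products still fit in 32 bits, so 32-bit multiplies with a 14-bit rounding
// shift are used.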
static INLINE void idct8x8_12_half1d_bd10(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x4_t step1[8], step2[8];

  transpose_s32_4x4(io0, io1, io2, io3);

  // stage 1
  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
  step1[4] = vrshrq_n_s32(step1[4], 14);
  step1[5] = vrshrq_n_s32(step1[5], 14);
  step1[6] = vrshrq_n_s32(step1[6], 14);
  step1[7] = vrshrq_n_s32(step1[7], 14);

  // stage 2
  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
  step2[1] = vrshrq_n_s32(step2[1], 14);
  step2[2] = vrshrq_n_s32(step2[2], 14);
  step2[3] = vrshrq_n_s32(step2[3], 14);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[1], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[1], step2[3]);

  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[5] = vrshrq_n_s32(step1[5], 14);
  step1[6] = vrshrq_n_s32(step1[6], 14);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

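// Same half pass for bd == 12: the wider coefficients need 64-bit
// intermediates, so the butterflies use vmull/vmlal/vmlsl into int64x2_t and
// round-narrow back to 32 bits with vrshrn_n_s64.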
static INLINE void idct8x8_12_half1d_bd12(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x2_t input_1l, input_1h, input_3l, input_3h;
  int32x2_t step1l[2], step1h[2];
  int32x4_t step1[8], step2[8];
  int64x2_t t64[8];
  int32x2_t t32[8];

  transpose_s32_4x4(io0, io1, io2, io3);

  // stage 1
  input_1l = vget_low_s32(*io1);
  input_1h = vget_high_s32(*io1);
  input_3l = vget_low_s32(*io3);
  input_3h = vget_high_s32(*io3);
  step1l[0] = vget_low_s32(*io0);
  step1h[0] = vget_high_s32(*io0);
  step1l[1] = vget_low_s32(*io2);
  step1h[1] = vget_high_s32(*io2);

  t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
  t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
  t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
  t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
  t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
  t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
  t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
  t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
  t32[0] = vrshrn_n_s64(t64[0], 14);
  t32[1] = vrshrn_n_s64(t64[1], 14);
  t32[2] = vrshrn_n_s64(t64[2], 14);
  t32[3] = vrshrn_n_s64(t64[3], 14);
  t32[4] = vrshrn_n_s64(t64[4], 14);
  t32[5] = vrshrn_n_s64(t64[5], 14);
  t32[6] = vrshrn_n_s64(t64[6], 14);
  t32[7] = vrshrn_n_s64(t64[7], 14);
  step1[4] = vcombine_s32(t32[0], t32[1]);
  step1[5] = vcombine_s32(t32[2], t32[3]);
  step1[6] = vcombine_s32(t32[4], t32[5]);
  step1[7] = vcombine_s32(t32[6], t32[7]);

  // stage 2
  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
  t32[2] = vrshrn_n_s64(t64[2], 14);
  t32[3] = vrshrn_n_s64(t64[3], 14);
  t32[4] = vrshrn_n_s64(t64[4], 14);
  t32[5] = vrshrn_n_s64(t64[5], 14);
  t32[6] = vrshrn_n_s64(t64[6], 14);
  t32[7] = vrshrn_n_s64(t64[7], 14);
  step2[1] = vcombine_s32(t32[2], t32[3]);
  step2[2] = vcombine_s32(t32[4], t32[5]);
  step2[3] = vcombine_s32(t32[6], t32[7]);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[1], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[1], step2[3]);

  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[0] =
      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t64[2] =
      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t32[0] = vrshrn_n_s64(t64[0], 14);
  t32[1] = vrshrn_n_s64(t64[1], 14);
  t32[2] = vrshrn_n_s64(t64[2], 14);
  t32[3] = vrshrn_n_s64(t64[3], 14);
  step1[5] = vcombine_s32(t32[0], t32[1]);
  step1[6] = vcombine_s32(t32[2], t32[3]);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

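// Adds an 8x8 block of 16-bit residuals to the 16-bit destination:
// saturating add, clamp to (1 << bd) - 1, clamp negatives to zero while
// converting back to unsigned, then store row by row.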
static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
                                 int16x8_t a3, int16x8_t a4, int16x8_t a5,
                                 int16x8_t a6, int16x8_t a7, uint16_t *dest,
                                 const int stride, const int bd) {
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  const uint16_t *dst = dest;
  uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
  int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;

  d0 = vld1q_u16(dst);
  dst += stride;
  d1 = vld1q_u16(dst);
  dst += stride;
  d2 = vld1q_u16(dst);
  dst += stride;
  d3 = vld1q_u16(dst);
  dst += stride;
  d4 = vld1q_u16(dst);
  dst += stride;
  d5 = vld1q_u16(dst);
  dst += stride;
  d6 = vld1q_u16(dst);
  dst += stride;
  d7 = vld1q_u16(dst);

  d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0));
  d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1));
  d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2));
  d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3));
  d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4));
  d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5));
  d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6));
  d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7));

  d0_s16 = vminq_s16(d0_s16, max);
  d1_s16 = vminq_s16(d1_s16, max);
  d2_s16 = vminq_s16(d2_s16, max);
  d3_s16 = vminq_s16(d3_s16, max);
  d4_s16 = vminq_s16(d4_s16, max);
  d5_s16 = vminq_s16(d5_s16, max);
  d6_s16 = vminq_s16(d6_s16, max);
  d7_s16 = vminq_s16(d7_s16, max);
  d0_u16 = vqshluq_n_s16(d0_s16, 0);
  d1_u16 = vqshluq_n_s16(d1_s16, 0);
  d2_u16 = vqshluq_n_s16(d2_s16, 0);
  d3_u16 = vqshluq_n_s16(d3_s16, 0);
  d4_u16 = vqshluq_n_s16(d4_s16, 0);
  d5_u16 = vqshluq_n_s16(d5_s16, 0);
  d6_u16 = vqshluq_n_s16(d6_s16, 0);
  d7_u16 = vqshluq_n_s16(d7_s16, 0);

  vst1q_u16(dest, d0_u16);
  dest += stride;
  vst1q_u16(dest, d1_u16);
  dest += stride;
  vst1q_u16(dest, d2_u16);
  dest += stride;
  vst1q_u16(dest, d3_u16);
  dest += stride;
  vst1q_u16(dest, d4_u16);
  dest += stride;
  vst1q_u16(dest, d5_u16);
  dest += stride;
  vst1q_u16(dest, d6_u16);
  dest += stride;
  vst1q_u16(dest, d7_u16);
}

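// Inverse transform for the case where only the upper-left 4x4 coefficients
// (eob <= 12) are non-zero. For bd == 8 the coefficients are narrowed to
// 16 bits and the low-bit-depth passes are reused; for bd == 10 and 12 the
// 2-D transform is computed as three 32-bit (or 64-bit widening) half passes:
// one over the non-zero quarter and two over the second dimension.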
void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  int32x4_t a0 = vld1q_s32(input);
  int32x4_t a1 = vld1q_s32(input + 8);
  int32x4_t a2 = vld1q_s32(input + 16);
  int32x4_t a3 = vld1q_s32(input + 24);
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;

  if (bd == 8) {
    const int16x8_t cospis = vld1q_s16(kCospi);
    const int16x8_t cospisd = vaddq_s16(cospis, cospis);
    const int16x4_t cospis0 = vget_low_s16(cospis);     // cospi 0, 8, 16, 24
    const int16x4_t cospisd0 = vget_low_s16(cospisd);   // doubled 0, 8, 16, 24
    const int16x4_t cospisd1 = vget_high_s16(cospisd);  // doubled 4, 12, 20, 28
    int16x4_t b0 = vmovn_s32(a0);
    int16x4_t b1 = vmovn_s32(a1);
    int16x4_t b2 = vmovn_s32(a2);
    int16x4_t b3 = vmovn_s32(a3);
    int16x4_t b4, b5, b6, b7;

    idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4,
                         &b5, &b6, &b7);
    idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5,
                         b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7);
    c0 = vrshrq_n_s16(c0, 5);
    c1 = vrshrq_n_s16(c1, 5);
    c2 = vrshrq_n_s16(c2, 5);
    c3 = vrshrq_n_s16(c3, 5);
    c4 = vrshrq_n_s16(c4, 5);
    c5 = vrshrq_n_s16(c5, 5);
    c6 = vrshrq_n_s16(c6, 5);
    c7 = vrshrq_n_s16(c7, 5);
  } else {
    const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
    const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
    int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15;

    if (bd == 10) {
      idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
                             &a6, &a7);
      idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
                             &a10, &a11);
      idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
                             &a14, &a15);
    } else {
      idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
                             &a6, &a7);
      idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
                             &a10, &a11);
      idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
                             &a14, &a15);
    }
    c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
    c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
    c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
    c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
    c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
    c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
    c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
    c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
  }
  highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
}

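// Full 1-D pass (all eight inputs) over one 4-column half of the 8x8 block
// for bd == 10, using 32-bit arithmetic throughout.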
static INLINE void idct8x8_64_half1d_bd10(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x4_t step1[8], step2[8];

  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);

  step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
  step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
  step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
  step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);

  step1[4] = vrshrq_n_s32(step1[4], 14);
  step1[5] = vrshrq_n_s32(step1[5], 14);
  step1[6] = vrshrq_n_s32(step1[6], 14);
  step1[7] = vrshrq_n_s32(step1[7], 14);

  // stage 2
  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);

  step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
  step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
  step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
  step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);

  step2[0] = vrshrq_n_s32(step2[0], 14);
  step2[1] = vrshrq_n_s32(step2[1], 14);
  step2[2] = vrshrq_n_s32(step2[2], 14);
  step2[3] = vrshrq_n_s32(step2[3], 14);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[0], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[0], step2[3]);

  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[5] = vrshrq_n_s32(step1[5], 14);
  step1[6] = vrshrq_n_s32(step1[6], 14);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

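// Full 1-D pass over one 4-column half for bd == 12, widening the multiplies
// to 64 bits and round-narrowing the results back to 32 bits.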
static INLINE void idct8x8_64_half1d_bd12(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
      input_7l, input_7h;
  int32x2_t step1l[4], step1h[4];
  int32x4_t step1[8], step2[8];
  int64x2_t t64[8];
  int32x2_t t32[8];

  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input_1l = vget_low_s32(*io1);
  input_1h = vget_high_s32(*io1);
  input_3l = vget_low_s32(*io3);
  input_3h = vget_high_s32(*io3);
  input_5l = vget_low_s32(*io5);
  input_5h = vget_high_s32(*io5);
  input_7l = vget_low_s32(*io7);
  input_7h = vget_high_s32(*io7);
  step1l[0] = vget_low_s32(*io0);
  step1h[0] = vget_high_s32(*io0);
  step1l[1] = vget_low_s32(*io2);
  step1h[1] = vget_high_s32(*io2);
  step1l[2] = vget_low_s32(*io4);
  step1h[2] = vget_high_s32(*io4);
  step1l[3] = vget_low_s32(*io6);
  step1h[3] = vget_high_s32(*io6);

  t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
  t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
  t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
  t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
  t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
  t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
  t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
  t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
  t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0);
  t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0);
  t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1);
  t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1);
  t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0);
  t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
  t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
  t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
  t32[0] = vrshrn_n_s64(t64[0], 14);
  t32[1] = vrshrn_n_s64(t64[1], 14);
  t32[2] = vrshrn_n_s64(t64[2], 14);
  t32[3] = vrshrn_n_s64(t64[3], 14);
  t32[4] = vrshrn_n_s64(t64[4], 14);
  t32[5] = vrshrn_n_s64(t64[5], 14);
  t32[6] = vrshrn_n_s64(t64[6], 14);
  t32[7] = vrshrn_n_s64(t64[7], 14);
  step1[4] = vcombine_s32(t32[0], t32[1]);
  step1[5] = vcombine_s32(t32[2], t32[3]);
  step1[6] = vcombine_s32(t32[4], t32[5]);
  step1[7] = vcombine_s32(t32[6], t32[7]);

  // stage 2
  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
  t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
  t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
  t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
  t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
  t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
  t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
  t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
  t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
  t32[0] = vrshrn_n_s64(t64[0], 14);
  t32[1] = vrshrn_n_s64(t64[1], 14);
  t32[2] = vrshrn_n_s64(t64[2], 14);
  t32[3] = vrshrn_n_s64(t64[3], 14);
  t32[4] = vrshrn_n_s64(t64[4], 14);
  t32[5] = vrshrn_n_s64(t64[5], 14);
  t32[6] = vrshrn_n_s64(t64[6], 14);
  t32[7] = vrshrn_n_s64(t64[7], 14);
  step2[0] = vcombine_s32(t32[0], t32[1]);
  step2[1] = vcombine_s32(t32[2], t32[3]);
  step2[2] = vcombine_s32(t32[4], t32[5]);
  step2[3] = vcombine_s32(t32[6], t32[7]);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[0], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[0], step2[3]);

  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[0] =
      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t64[2] =
      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t32[0] = vrshrn_n_s64(t64[0], 14);
  t32[1] = vrshrn_n_s64(t64[1], 14);
  t32[2] = vrshrn_n_s64(t64[2], 14);
  t32[3] = vrshrn_n_s64(t64[3], 14);
  step1[5] = vcombine_s32(t32[0], t32[1]);
  step1[6] = vcombine_s32(t32[2], t32[3]);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

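// Full 8x8 inverse transform. bd == 8 narrows the coefficients to 16 bits and
// runs two 16-bit passes; otherwise each dimension is computed as two 32-bit
// (or 64-bit widening) half passes, followed by a rounding shift of 5 and the
// clamped add to the destination.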
void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  int32x4_t a0 = vld1q_s32(input);
  int32x4_t a1 = vld1q_s32(input + 4);
  int32x4_t a2 = vld1q_s32(input + 8);
  int32x4_t a3 = vld1q_s32(input + 12);
  int32x4_t a4 = vld1q_s32(input + 16);
  int32x4_t a5 = vld1q_s32(input + 20);
  int32x4_t a6 = vld1q_s32(input + 24);
  int32x4_t a7 = vld1q_s32(input + 28);
  int32x4_t a8 = vld1q_s32(input + 32);
  int32x4_t a9 = vld1q_s32(input + 36);
  int32x4_t a10 = vld1q_s32(input + 40);
  int32x4_t a11 = vld1q_s32(input + 44);
  int32x4_t a12 = vld1q_s32(input + 48);
  int32x4_t a13 = vld1q_s32(input + 52);
  int32x4_t a14 = vld1q_s32(input + 56);
  int32x4_t a15 = vld1q_s32(input + 60);
  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;

  if (bd == 8) {
    const int16x8_t cospis = vld1q_s16(kCospi);
    const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
    const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
    int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1));
    int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3));
    int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5));
    int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7));
    int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9));
    int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11));
    int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13));
    int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15));

    idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
    idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);

    c0 = vrshrq_n_s16(b0, 5);
    c1 = vrshrq_n_s16(b1, 5);
    c2 = vrshrq_n_s16(b2, 5);
    c3 = vrshrq_n_s16(b3, 5);
    c4 = vrshrq_n_s16(b4, 5);
    c5 = vrshrq_n_s16(b5, 5);
    c6 = vrshrq_n_s16(b6, 5);
    c7 = vrshrq_n_s16(b7, 5);
  } else {
    const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
    const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

    if (bd == 10) {
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
                             &a6, &a7);
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
                             &a14, &a15);
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
                             &a3, &a11);
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
                             &a7, &a15);
    } else {
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
                             &a6, &a7);
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
                             &a14, &a15);
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
                             &a3, &a11);
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
                             &a7, &a15);
    }
    c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
    c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
    c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
    c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
    c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
    c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
    c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
    c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
  }
  highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
}