/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"

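// Adds |res| to two rows of 4 pixels at |*dest|, clamps each sum to
// [0, max], stores the rows back, and advances |*dest| by two rows.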
static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest,
                                                const int stride,
                                                const int16x8_t res,
                                                const int16x8_t max) {
  const uint16x4_t a0 = vld1_u16(*dest);
  const uint16x4_t a1 = vld1_u16(*dest + stride);
  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1));
  // Note: In some profile tests, res is quite close to +/-32767.
  // We use saturating addition.
  const int16x8_t b = vqaddq_s16(res, a);
  const int16x8_t c = vminq_s16(b, max);
  const uint16x8_t d = vqshluq_n_s16(c, 0);
  vst1_u16(*dest, vget_low_u16(d));
  *dest += stride;
  vst1_u16(*dest, vget_high_u16(d));
  *dest += stride;
}

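// One dimension of the 4x4 inverse DCT: transposes the four input vectors
// and applies the butterfly using 32-bit multiplies, which is sufficient
// when the intermediate products fit in 32 bits (bit depth 10).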
static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
                                          int32x4_t *const a) {
  int32x4_t b0, b1, b2, b3;

  transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
  b0 = vaddq_s32(a[0], a[2]);
  b1 = vsubq_s32(a[0], a[2]);
  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
  b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1);
  b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1);
  b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1);
  b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1);
  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
  a[0] = vaddq_s32(b0, b3);
  a[1] = vaddq_s32(b1, b2);
  a[2] = vsubq_s32(b1, b2);
  a[3] = vsubq_s32(b0, b3);
}

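// Same butterfly as the bd10 kernel, but the multiplies are widened to
// 64 bits with vmull_lane_s32 so 12-bit input cannot overflow, combined
// with 64-bit adds/subtracts, then narrowed back with rounding.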
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
                                          int32x4_t *const a) {
  int32x4_t b0, b1, b2, b3;
  int64x2_t c[12];

  transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]);
  b0 = vaddq_s32(a[0], a[2]);
  b1 = vsubq_s32(a[0], a[2]);
  c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
  c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
  c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
  c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
  c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1);
  c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1);
  c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1);
  c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1);
  c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1);
  c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1);
  c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1);
  c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1);
  c[4] = vsubq_s64(c[4], c[8]);
  c[5] = vsubq_s64(c[5], c[9]);
  c[6] = vaddq_s64(c[6], c[10]);
  c[7] = vaddq_s64(c[7], c[11]);
  b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS),
                    vrshrn_n_s64(c[1], DCT_CONST_BITS));
  b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS),
                    vrshrn_n_s64(c[3], DCT_CONST_BITS));
  b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS),
                    vrshrn_n_s64(c[5], DCT_CONST_BITS));
  b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS),
                    vrshrn_n_s64(c[7], DCT_CONST_BITS));
  a[0] = vaddq_s32(b0, b3);
  a[1] = vaddq_s32(b1, b2);
  a[2] = vsubq_s32(b1, b2);
  a[3] = vsubq_s32(b0, b3);
}

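// Adds the 8x8 residual held in a[0..7] to the 8x8 block at |dest|,
// clamping each pixel to [0, (1 << bd) - 1].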
static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest,
                                 const int stride, const int bd) {
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  const uint16_t *dst = dest;
  uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
  uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
  int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;

  d0 = vld1q_u16(dst);
  dst += stride;
  d1 = vld1q_u16(dst);
  dst += stride;
  d2 = vld1q_u16(dst);
  dst += stride;
  d3 = vld1q_u16(dst);
  dst += stride;
  d4 = vld1q_u16(dst);
  dst += stride;
  d5 = vld1q_u16(dst);
  dst += stride;
  d6 = vld1q_u16(dst);
  dst += stride;
  d7 = vld1q_u16(dst);

  d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0));
  d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1));
  d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2));
  d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3));
  d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4));
  d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5));
  d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6));
  d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7));

  d0_s16 = vminq_s16(d0_s16, max);
  d1_s16 = vminq_s16(d1_s16, max);
  d2_s16 = vminq_s16(d2_s16, max);
  d3_s16 = vminq_s16(d3_s16, max);
  d4_s16 = vminq_s16(d4_s16, max);
  d5_s16 = vminq_s16(d5_s16, max);
  d6_s16 = vminq_s16(d6_s16, max);
  d7_s16 = vminq_s16(d7_s16, max);
  d0_u16 = vqshluq_n_s16(d0_s16, 0);
  d1_u16 = vqshluq_n_s16(d1_s16, 0);
  d2_u16 = vqshluq_n_s16(d2_s16, 0);
  d3_u16 = vqshluq_n_s16(d3_s16, 0);
  d4_u16 = vqshluq_n_s16(d4_s16, 0);
  d5_u16 = vqshluq_n_s16(d5_s16, 0);
  d6_u16 = vqshluq_n_s16(d6_s16, 0);
  d7_u16 = vqshluq_n_s16(d7_s16, 0);

  vst1q_u16(dest, d0_u16);
  dest += stride;
  vst1q_u16(dest, d1_u16);
  dest += stride;
  vst1q_u16(dest, d2_u16);
  dest += stride;
  vst1q_u16(dest, d3_u16);
  dest += stride;
  vst1q_u16(dest, d4_u16);
  dest += stride;
  vst1q_u16(dest, d5_u16);
  dest += stride;
  vst1q_u16(dest, d6_u16);
  dest += stride;
  vst1q_u16(dest, d7_u16);
}

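// One dimension (pass) of the 8x8 inverse DCT over an 8x4 half of the
// block: transposes the eight input vectors and runs the four butterfly
// stages with 32-bit arithmetic (sufficient for bit depth 10).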
static INLINE void idct8x8_64_half1d_bd10(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x4_t step1[8], step2[8];

  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);

  step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
  step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
  step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
  step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);

  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);

  // stage 2
  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);

  step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
  step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
  step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
  step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);

  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[0], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[0], step2[3]);

  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

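// Same pass as the bd10 version, but every multiply stage is carried out in
// 64 bits (vmull/vmlal/vmlsl on the low and high halves of each vector) so
// 12-bit input cannot overflow, then narrowed back with rounding.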
static INLINE void idct8x8_64_half1d_bd12(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
      input7h;
  int32x2_t step1l[4], step1h[4];
  int32x4_t step1[8], step2[8];
  int64x2_t t64[8];
  int32x2_t t32[8];

  transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);

  // stage 1
  input1l = vget_low_s32(*io1);
  input1h = vget_high_s32(*io1);
  input3l = vget_low_s32(*io3);
  input3h = vget_high_s32(*io3);
  input5l = vget_low_s32(*io5);
  input5h = vget_high_s32(*io5);
  input7l = vget_low_s32(*io7);
  input7h = vget_high_s32(*io7);
  step1l[0] = vget_low_s32(*io0);
  step1h[0] = vget_high_s32(*io0);
  step1l[1] = vget_low_s32(*io2);
  step1h[1] = vget_high_s32(*io2);
  step1l[2] = vget_low_s32(*io4);
  step1h[2] = vget_high_s32(*io4);
  step1l[3] = vget_low_s32(*io6);
  step1h[3] = vget_high_s32(*io6);

  t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
  t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
  t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
  t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
  t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
  t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
  t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
  t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
  t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0);
  t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0);
  t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1);
  t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1);
  t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0);
  t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0);
  t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1);
  t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1);
  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
  step1[4] = vcombine_s32(t32[0], t32[1]);
  step1[5] = vcombine_s32(t32[2], t32[3]);
  step1[6] = vcombine_s32(t32[4], t32[5]);
  step1[7] = vcombine_s32(t32[6], t32[7]);

  // stage 2
  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
  t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
  t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
  t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
  t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
  t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
  t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
  t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
  t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
  step2[0] = vcombine_s32(t32[0], t32[1]);
  step2[1] = vcombine_s32(t32[2], t32[3]);
  step2[2] = vcombine_s32(t32[4], t32[5]);
  step2[3] = vcombine_s32(t32[6], t32[7]);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[0], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[0], step2[3]);

  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[0] =
      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t64[2] =
      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
  step1[5] = vcombine_s32(t32[0], t32[1]);
  step1[6] = vcombine_s32(t32[2], t32[3]);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

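// Stores the sixteen 8-coefficient rows produced by pass 1 of the 16x16
// idct into the 16-wide intermediate |output| buffer, one row every 16
// entries.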
static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
                                                int32_t *output) {
  // Save the result into output
  vst1q_s32(output + 0, out[0].val[0]);
  vst1q_s32(output + 4, out[0].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[1].val[0]);
  vst1q_s32(output + 4, out[1].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[2].val[0]);
  vst1q_s32(output + 4, out[2].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[3].val[0]);
  vst1q_s32(output + 4, out[3].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[4].val[0]);
  vst1q_s32(output + 4, out[4].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[5].val[0]);
  vst1q_s32(output + 4, out[5].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[6].val[0]);
  vst1q_s32(output + 4, out[6].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[7].val[0]);
  vst1q_s32(output + 4, out[7].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[8].val[0]);
  vst1q_s32(output + 4, out[8].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[9].val[0]);
  vst1q_s32(output + 4, out[9].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[10].val[0]);
  vst1q_s32(output + 4, out[10].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[11].val[0]);
  vst1q_s32(output + 4, out[11].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[12].val[0]);
  vst1q_s32(output + 4, out[12].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[13].val[0]);
  vst1q_s32(output + 4, out[13].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[14].val[0]);
  vst1q_s32(output + 4, out[14].val[1]);
  output += 16;
  vst1q_s32(output + 0, out[15].val[0]);
  vst1q_s32(output + 4, out[15].val[1]);
}

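// Rounds the pass-2 result by 6 bits, adds each of the 16 rows of 8 pixels
// to |dest|, and clamps the sums to [0, (1 << bd) - 1].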
static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
                                              uint16_t *dest, const int stride,
                                              const int bd) {
  // Add the result to dest
  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  int16x8_t o[16];
  o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
                      vrshrn_n_s32(out[0].val[1], 6));
  o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
                      vrshrn_n_s32(out[1].val[1], 6));
  o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
                      vrshrn_n_s32(out[2].val[1], 6));
  o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
                      vrshrn_n_s32(out[3].val[1], 6));
  o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
                      vrshrn_n_s32(out[4].val[1], 6));
  o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
                      vrshrn_n_s32(out[5].val[1], 6));
  o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
                      vrshrn_n_s32(out[6].val[1], 6));
  o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
                      vrshrn_n_s32(out[7].val[1], 6));
  o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
                      vrshrn_n_s32(out[8].val[1], 6));
  o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
                      vrshrn_n_s32(out[9].val[1], 6));
  o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
                       vrshrn_n_s32(out[10].val[1], 6));
  o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
                       vrshrn_n_s32(out[11].val[1], 6));
  o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
                       vrshrn_n_s32(out[12].val[1], 6));
  o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
                       vrshrn_n_s32(out[13].val[1], 6));
  o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
                       vrshrn_n_s32(out[14].val[1], 6));
  o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
                       vrshrn_n_s32(out[15].val[1], 6));
  highbd_idct16x16_add8x1(o[0], max, &dest, stride);
  highbd_idct16x16_add8x1(o[1], max, &dest, stride);
  highbd_idct16x16_add8x1(o[2], max, &dest, stride);
  highbd_idct16x16_add8x1(o[3], max, &dest, stride);
  highbd_idct16x16_add8x1(o[4], max, &dest, stride);
  highbd_idct16x16_add8x1(o[5], max, &dest, stride);
  highbd_idct16x16_add8x1(o[6], max, &dest, stride);
  highbd_idct16x16_add8x1(o[7], max, &dest, stride);
  highbd_idct16x16_add8x1(o[8], max, &dest, stride);
  highbd_idct16x16_add8x1(o[9], max, &dest, stride);
  highbd_idct16x16_add8x1(o[10], max, &dest, stride);
  highbd_idct16x16_add8x1(o[11], max, &dest, stride);
  highbd_idct16x16_add8x1(o[12], max, &dest, stride);
  highbd_idct16x16_add8x1(o[13], max, &dest, stride);
  highbd_idct16x16_add8x1(o[14], max, &dest, stride);
  highbd_idct16x16_add8x1(o[15], max, &dest, stride);
}

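// Half of one 1-D pass of the high-bitdepth 16x16 inverse DCT; declared
// here so other NEON idct implementations can reuse it.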
void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
                                         uint16_t *dest, const int stride,
                                         const int bd);

#endif  // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_