1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vp8_rtcd.h"
14 
// Two-pass 4x4 forward transform on int16_t samples, written with NEON
// intrinsics.
//
//   input  - four rows of four int16_t values; consecutive rows are
//            (pitch >> 1) elements apart.
//   output - receives 16 int16_t results (two 8-lane stores).
//   pitch  - row spacing of input; it is halved before being used as an
//            element offset (presumably a byte stride - confirm with callers).
void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) {
  const int16x4_t k5352 = vdup_n_s16(5352);
  const int16x4_t k2217 = vdup_n_s16(2217);
  const int row_step = pitch >> 1;
  const int16_t *src = input;

  // ---- Pass one ----
  const int16x4_t in0 = vld1_s16(src);
  src += row_step;
  const int16x4_t in1 = vld1_s16(src);
  src += row_step;
  const int16x4_t in2 = vld1_s16(src);
  src += row_step;
  const int16x4_t in3 = vld1_s16(src);

  // 4x4 transpose: afterwards each vector holds one column of the input,
  // so lane i of every vector belongs to input row i.
  const int32x2x2_t in02 =
      vtrn_s32(vreinterpret_s32_s16(in0), vreinterpret_s32_s16(in2));
  const int32x2x2_t in13 =
      vtrn_s32(vreinterpret_s32_s16(in1), vreinterpret_s32_s16(in3));
  const int16x4x2_t col01 = vtrn_s16(vreinterpret_s16_s32(in02.val[0]),
                                     vreinterpret_s16_s32(in13.val[0]));
  const int16x4x2_t col23 = vtrn_s16(vreinterpret_s16_s32(in02.val[1]),
                                     vreinterpret_s16_s32(in13.val[1]));

  // Butterfly terms, each scaled by 8.
  const int16x4_t a1 = vshl_n_s16(vadd_s16(col01.val[0], col23.val[1]), 3);
  const int16x4_t b1 = vshl_n_s16(vadd_s16(col01.val[1], col23.val[0]), 3);
  const int16x4_t c1 = vshl_n_s16(vsub_s16(col01.val[1], col23.val[0]), 3);
  const int16x4_t d1 = vshl_n_s16(vsub_s16(col01.val[0], col23.val[1]), 3);

  const int16x4_t mid0 = vadd_s16(a1, b1);
  const int16x4_t mid2 = vsub_s16(a1, b1);

  // mid1 = (d1 * 5352 + c1 * 2217 + 14500) >> 12
  // mid3 = (d1 * 2217 - c1 * 5352 +  7500) >> 12
  const int32x4_t wide1 =
      vmlal_s16(vmlal_s16(vdupq_n_s32(14500), d1, k5352), c1, k2217);
  const int32x4_t wide3 =
      vmlsl_s16(vmlal_s16(vdupq_n_s32(7500), d1, k2217), c1, k5352);
  const int16x4_t mid1 = vshrn_n_s32(wide1, 12);
  const int16x4_t mid3 = vshrn_n_s32(wide3, 12);

  // ---- Pass two ----
  // Transpose the intermediate block so the second pass runs along the
  // other dimension.
  const int32x2x2_t mid02 =
      vtrn_s32(vreinterpret_s32_s16(mid0), vreinterpret_s32_s16(mid2));
  const int32x2x2_t mid13 =
      vtrn_s32(vreinterpret_s32_s16(mid1), vreinterpret_s32_s16(mid3));
  const int16x4x2_t row01 = vtrn_s16(vreinterpret_s16_s32(mid02.val[0]),
                                     vreinterpret_s16_s32(mid13.val[0]));
  const int16x4x2_t row23 = vtrn_s16(vreinterpret_s16_s32(mid02.val[1]),
                                     vreinterpret_s16_s32(mid13.val[1]));

  // The +7 rounding bias is folded into a2 so it reaches both a2 + b2 and
  // a2 - b2 before the >> 4.
  const int16x4_t a2 =
      vadd_s16(vadd_s16(row01.val[0], row23.val[1]), vdup_n_s16(7));
  const int16x4_t b2 = vadd_s16(row01.val[1], row23.val[0]);
  const int16x4_t c2 = vsub_s16(row01.val[1], row23.val[0]);
  const int16x4_t d2 = vsub_s16(row01.val[0], row23.val[1]);

  const int16x4_t out0 = vshr_n_s16(vadd_s16(a2, b2), 4);
  const int16x4_t out2 = vshr_n_s16(vsub_s16(a2, b2), 4);

  // out1 = ((d2 * 5352 + c2 * 2217 + 12000) >> 16) + (d2 != 0)
  // out3 =  (d2 * 2217 - c2 * 5352 + 51000) >> 16
  const int32x4_t wide1b =
      vmlal_s16(vmlal_s16(vdupq_n_s32(12000), d2, k5352), c2, k2217);
  const int32x4_t wide3b =
      vmlsl_s16(vmlal_s16(vdupq_n_s32(51000), d2, k2217), c2, k5352);

  // All-ones (i.e. -1 as int16) in every lane where d2 is non-zero;
  // subtracting it adds 1 to exactly those lanes.
  const uint16x4_t d2_nonzero = vmvn_u16(vceq_s16(d2, vdup_n_s16(0)));
  const int16x4_t out1 = vsub_s16(vshrn_n_s32(wide1b, 16),
                                  vreinterpret_s16_u16(d2_nonzero));
  const int16x4_t out3 = vshrn_n_s32(wide3b, 16);

  vst1q_s16(output, vcombine_s16(out0, out1));
  vst1q_s16(output + 8, vcombine_s16(out2, out3));
}
112 
// Two-pass forward transform of two horizontally adjacent 4x4 blocks,
// written with NEON intrinsics. Lanes 0-3 of every 8-lane vector carry the
// left block and lanes 4-7 the right block; the two halves never mix.
//
//   input  - four rows of eight int16_t values; consecutive rows are
//            (pitch >> 1) elements apart.
//   output - receives 32 int16_t results: output[0..15] for the left block,
//            output[16..31] for the right block.
//   pitch  - row spacing of input; it is halved before being used as an
//            element offset (presumably a byte stride - confirm with callers).
void vp8_short_fdct8x4_neon(int16_t *input, int16_t *output, int pitch) {
  const int16x4_t k5352 = vdup_n_s16(5352);
  const int16x4_t k2217 = vdup_n_s16(2217);
  const int row_step = pitch >> 1;
  const int16_t *src = input;

  // ---- Pass one ----
  const int16x8_t in0 = vld1q_s16(src);
  src += row_step;
  const int16x8_t in1 = vld1q_s16(src);
  src += row_step;
  const int16x8_t in2 = vld1q_s16(src);
  src += row_step;
  const int16x8_t in3 = vld1q_s16(src);

  // Transpose each 4x4 half independently: afterwards every vector holds one
  // column of the left block followed by the same column of the right block.
  const int32x4x2_t in02 =
      vtrnq_s32(vreinterpretq_s32_s16(in0), vreinterpretq_s32_s16(in2));
  const int32x4x2_t in13 =
      vtrnq_s32(vreinterpretq_s32_s16(in1), vreinterpretq_s32_s16(in3));
  const int16x8x2_t col01 = vtrnq_s16(vreinterpretq_s16_s32(in02.val[0]),
                                      vreinterpretq_s16_s32(in13.val[0]));
  const int16x8x2_t col23 = vtrnq_s16(vreinterpretq_s16_s32(in02.val[1]),
                                      vreinterpretq_s16_s32(in13.val[1]));

  // Butterfly terms, each scaled by 8.
  const int16x8_t a1 = vshlq_n_s16(vaddq_s16(col01.val[0], col23.val[1]), 3);
  const int16x8_t b1 = vshlq_n_s16(vaddq_s16(col01.val[1], col23.val[0]), 3);
  const int16x8_t c1 = vshlq_n_s16(vsubq_s16(col01.val[1], col23.val[0]), 3);
  const int16x8_t d1 = vshlq_n_s16(vsubq_s16(col01.val[0], col23.val[1]), 3);

  const int16x8_t mid0 = vaddq_s16(a1, b1);
  const int16x8_t mid2 = vsubq_s16(a1, b1);

  // The widening multiply-accumulates work on four lanes at a time, so the
  // odd outputs are computed separately for the low and high halves.
  const int16x4_t c1_lo = vget_low_s16(c1);
  const int16x4_t c1_hi = vget_high_s16(c1);
  const int16x4_t d1_lo = vget_low_s16(d1);
  const int16x4_t d1_hi = vget_high_s16(d1);

  // mid1 = (d1 * 5352 + c1 * 2217 + 14500) >> 12
  // mid3 = (d1 * 2217 - c1 * 5352 +  7500) >> 12
  const int32x4_t bias1 = vdupq_n_s32(14500);
  const int32x4_t bias3 = vdupq_n_s32(7500);
  const int32x4_t wide1_lo =
      vmlal_s16(vmlal_s16(bias1, d1_lo, k5352), c1_lo, k2217);
  const int32x4_t wide3_lo =
      vmlsl_s16(vmlal_s16(bias3, d1_lo, k2217), c1_lo, k5352);
  const int32x4_t wide1_hi =
      vmlal_s16(vmlal_s16(bias1, d1_hi, k5352), c1_hi, k2217);
  const int32x4_t wide3_hi =
      vmlsl_s16(vmlal_s16(bias3, d1_hi, k2217), c1_hi, k5352);

  const int16x8_t mid1 =
      vcombine_s16(vshrn_n_s32(wide1_lo, 12), vshrn_n_s32(wide1_hi, 12));
  const int16x8_t mid3 =
      vcombine_s16(vshrn_n_s32(wide3_lo, 12), vshrn_n_s32(wide3_hi, 12));

  // ---- Pass two ----
  // Transpose the intermediate halves so the second pass runs along the
  // other dimension.
  const int32x4x2_t mid02 =
      vtrnq_s32(vreinterpretq_s32_s16(mid0), vreinterpretq_s32_s16(mid2));
  const int32x4x2_t mid13 =
      vtrnq_s32(vreinterpretq_s32_s16(mid1), vreinterpretq_s32_s16(mid3));
  const int16x8x2_t row01 = vtrnq_s16(vreinterpretq_s16_s32(mid02.val[0]),
                                      vreinterpretq_s16_s32(mid13.val[0]));
  const int16x8x2_t row23 = vtrnq_s16(vreinterpretq_s16_s32(mid02.val[1]),
                                      vreinterpretq_s16_s32(mid13.val[1]));

  // The +7 rounding bias is folded into a2 so it reaches both a2 + b2 and
  // a2 - b2 before the >> 4.
  const int16x8_t a2 =
      vaddq_s16(vaddq_s16(row01.val[0], row23.val[1]), vdupq_n_s16(7));
  const int16x8_t b2 = vaddq_s16(row01.val[1], row23.val[0]);
  const int16x8_t c2 = vsubq_s16(row01.val[1], row23.val[0]);
  const int16x8_t d2 = vsubq_s16(row01.val[0], row23.val[1]);

  const int16x8_t out0 = vshrq_n_s16(vaddq_s16(a2, b2), 4);
  const int16x8_t out2 = vshrq_n_s16(vsubq_s16(a2, b2), 4);

  const int16x4_t c2_lo = vget_low_s16(c2);
  const int16x4_t c2_hi = vget_high_s16(c2);
  const int16x4_t d2_lo = vget_low_s16(d2);
  const int16x4_t d2_hi = vget_high_s16(d2);

  // out1 = ((d2 * 5352 + c2 * 2217 + 12000) >> 16) + (d2 != 0)
  // out3 =  (d2 * 2217 - c2 * 5352 + 51000) >> 16
  const int32x4_t bias1b = vdupq_n_s32(12000);
  const int32x4_t bias3b = vdupq_n_s32(51000);
  const int32x4_t wide1b_lo =
      vmlal_s16(vmlal_s16(bias1b, d2_lo, k5352), c2_lo, k2217);
  const int32x4_t wide3b_lo =
      vmlsl_s16(vmlal_s16(bias3b, d2_lo, k2217), c2_lo, k5352);
  const int32x4_t wide1b_hi =
      vmlal_s16(vmlal_s16(bias1b, d2_hi, k5352), c2_hi, k2217);
  const int32x4_t wide3b_hi =
      vmlsl_s16(vmlal_s16(bias3b, d2_hi, k2217), c2_hi, k5352);

  // All-ones (i.e. -1 as int16) in every lane where d2 is non-zero;
  // subtracting it adds 1 to exactly those lanes.
  const uint16x8_t d2_nonzero = vmvnq_u16(vceqq_s16(d2, vdupq_n_s16(0)));

  const int16x4_t out1_lo =
      vsub_s16(vshrn_n_s32(wide1b_lo, 16),
               vreinterpret_s16_u16(vget_low_u16(d2_nonzero)));
  const int16x4_t out1_hi =
      vsub_s16(vshrn_n_s32(wide1b_hi, 16),
               vreinterpret_s16_u16(vget_high_u16(d2_nonzero)));
  const int16x4_t out3_lo = vshrn_n_s32(wide3b_lo, 16);
  const int16x4_t out3_hi = vshrn_n_s32(wide3b_hi, 16);

  // Left block occupies output[0..15], right block output[16..31].
  vst1q_s16(output, vcombine_s16(vget_low_s16(out0), out1_lo));
  vst1q_s16(output + 8, vcombine_s16(vget_low_s16(out2), out3_lo));
  vst1q_s16(output + 16, vcombine_s16(vget_high_s16(out0), out1_hi));
  vst1q_s16(output + 24, vcombine_s16(vget_high_s16(out2), out3_hi));
}
262