/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17 
// Dequantizes eight coefficients and stores them.
//
// Each 16-bit lane of |qcoeff| is multiplied by the matching lane of |dequant|
// with a widening multiply, giving eight 32-bit products. Eight values are
// written starting at |dqcoeff|:
//  - with CONFIG_VP9_HIGHBITDEPTH the full 32-bit products are stored;
//  - otherwise each product is narrowed to its low 16 bits (vmovn, which does
//    not saturate) before being stored.
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                               const int16x8_t dequant,
                                               tran_low_t *dqcoeff) {
  const int32x4_t dqcoeff_0 =
      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  const int32x4_t dqcoeff_1 =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

#if CONFIG_VP9_HIGHBITDEPTH
  vst1q_s32(dqcoeff, dqcoeff_0);
  vst1q_s32(dqcoeff + 4, dqcoeff_1);
#else
  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
33 
// Quantizes the eight coefficients at |coeff_ptr| with the supplied per-lane
// parameters, stores the quantized values to |qcoeff_ptr| and the dequantized
// values to |dqcoeff_ptr|, and returns the quantized vector so the caller can
// derive the end-of-block position from it.
static INLINE int16x8_t quantize_b_8_neon(const tran_low_t *coeff_ptr,
                                          tran_low_t *qcoeff_ptr,
                                          tran_low_t *dqcoeff_ptr,
                                          const int16x8_t zbin,
                                          const int16x8_t round,
                                          const int16x8_t quant,
                                          const int16x8_t quant_shift,
                                          const int16x8_t dequant) {
  // Work on magnitudes; the sign mask is all ones for negative lanes.
  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
  const int16x8_t coeff_abs = vabsq_s16(coeff);

  // All ones in lanes whose magnitude reaches the zero bin threshold.
  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));

  // Saturating add so a large magnitude plus round cannot wrap.
  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);

  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);

  qcoeff = vaddq_s16(qcoeff, rounded);

  // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
  qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);

  // Restore the sign: (x ^ sign) - sign negates x where sign is all ones.
  qcoeff = veorq_s16(qcoeff, coeff_sign);
  qcoeff = vsubq_s16(qcoeff, coeff_sign);

  // Zero every lane that fell below the zero bin threshold.
  qcoeff = vandq_s16(qcoeff, zbin_mask);

  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
  calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);

  return qcoeff;
}

// NEON quantizer for a block of |n_coeffs| transform coefficients.
//
// Coefficients are handled in groups of eight. The first group loads eight
// entries from each of |zbin_ptr|, |round_ptr|, |quant_ptr|, |quant_shift_ptr|
// and |dequant_ptr| (so each table must have eight readable entries); every
// later group uses entry [1] of each table broadcast to all lanes.
//
// The first group is always processed and the do/while loop always runs at
// least once, so at least 16 coefficients are read and written; |n_coeffs| is
// effectively rounded up to a multiple of eight.
//
// Outputs:
//   qcoeff_ptr  - quantized coefficients.
//   dqcoeff_ptr - dequantized coefficients (qcoeff * dequant).
//   eob_ptr     - the largest (iscan[i] + 1) over all positions i whose
//                 quantized value is non-zero, or 0 if every value is zero.
//
// |skip_block| must be zero (asserted); |scan| is unused.
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *zbin_ptr,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  const int16x8_t one = vdupq_n_s16(1);
  const int16x8_t neg_one = vdupq_n_s16(-1);
  uint16x8_t eob_max;
  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  // Process first 8 values which include a dc component.
  {
    // Only the first element of each vector is DC.
    const int16x8_t zbin = vld1q_s16(zbin_ptr);
    const int16x8_t round = vld1q_s16(round_ptr);
    const int16x8_t quant = vld1q_s16(quant_ptr);
    const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
    const int16x8_t dequant = vld1q_s16(dequant_ptr);
    // Add one because the eob does not index from 0.
    const uint16x8_t v_iscan =
        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

    const int16x8_t qcoeff =
        quantize_b_8_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
                          quant, quant_shift, dequant);

    // vtst against -1 yields all ones for non-zero lanes; use that to keep
    // only the scan positions of non-zero coefficients.
    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);

    coeff_ptr += 8;
    iscan += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
  }

  n_coeffs -= 8;

  // Remaining groups use entry [1] of every table in all lanes.
  {
    const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
    const int16x8_t round = vdupq_n_s16(round_ptr[1]);
    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);

    do {
      // Add one because the eob is not its index.
      const uint16x8_t v_iscan =
          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

      const int16x8_t qcoeff =
          quantize_b_8_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
                            quant, quant_shift, dequant);

      // Keep a running per-lane maximum of the non-zero scan positions.
      eob_max =
          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));

      coeff_ptr += 8;
      iscan += 8;
      qcoeff_ptr += 8;
      dqcoeff_ptr += 8;

      n_coeffs -= 8;
    } while (n_coeffs > 0);
  }

  // Reduce the eight per-lane maxima to a single eob value.
#ifdef __aarch64__
  *eob_ptr = vmaxvq_u16(eob_max);
#else
  {
    const uint16x4_t eob_max_0 =
        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
    vst1_lane_u16(eob_ptr, eob_max_2, 0);
  }
#endif  // __aarch64__
}
162 
// Returns a vector holding 1 in every lane where |a| is negative and 0 in
// every other lane. The lanes are reinterpreted as unsigned so the shift by 31
// is logical and moves the sign bit down to bit 0.
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
  const uint32x4_t bits = vreinterpretq_u32_s32(a);
  const uint32x4_t sign = vshrq_n_u32(bits, 31);
  return vreinterpretq_s32_u32(sign);
}
166 
// Dequantizes eight coefficients for the 32x32 quantizer and stores them.
//
// Each lane of |qcoeff| is multiplied by the matching lane of |dequant| with a
// widening multiply, then the 32-bit product is halved. Eight values are
// written starting at |dqcoeff|: 32-bit values with CONFIG_VP9_HIGHBITDEPTH,
// otherwise the halved products narrowed to 16 bits (vshrn, no saturation).
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
                                                     const int16x8_t dequant,
                                                     tran_low_t *dqcoeff) {
  int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  int32x4_t dqcoeff_1 =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

  // Add 1 if negative to round towards zero because the C uses division.
  // An arithmetic shift alone would round negative values towards -infinity.
  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));

#if CONFIG_VP9_HIGHBITDEPTH
  dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
  dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
  vst1q_s32(dqcoeff, dqcoeff_0);
  vst1q_s32(dqcoeff + 4, dqcoeff_1);
#else
  vst1q_s16(dqcoeff,
            vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
188 
// Quantizes the eight coefficients at |coeff_ptr| for the 32x32 quantizer,
// stores the quantized values to |qcoeff_ptr| and the dequantized values to
// |dqcoeff_ptr|, and returns the quantized vector so the caller can derive the
// end-of-block position from it. |zbin| and |round| are expected to have been
// halved by the caller already.
static INLINE int16x8_t quantize_b_32x32_8_neon(const tran_low_t *coeff_ptr,
                                                tran_low_t *qcoeff_ptr,
                                                tran_low_t *dqcoeff_ptr,
                                                const int16x8_t zbin,
                                                const int16x8_t round,
                                                const int16x8_t quant,
                                                const int16x8_t quant_shift,
                                                const int16x8_t dequant) {
  // Work on magnitudes; the sign mask is all ones for negative lanes.
  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
  const int16x8_t coeff_abs = vabsq_s16(coeff);

  // All ones in lanes whose magnitude reaches the zero bin threshold.
  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));

  // Saturating add so a large magnitude plus round cannot wrap.
  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);

  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);

  qcoeff = vaddq_s16(qcoeff, rounded);

  // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
  qcoeff = vqdmulhq_s16(qcoeff, quant_shift);

  // Restore the sign: (x ^ sign) - sign negates x where sign is all ones.
  qcoeff = veorq_s16(qcoeff, coeff_sign);
  qcoeff = vsubq_s16(qcoeff, coeff_sign);

  // Zero every lane that fell below the zero bin threshold.
  qcoeff = vandq_s16(qcoeff, zbin_mask);

  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
  calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);

  return qcoeff;
}

// NEON quantizer for a 32x32 block. Compared with vpx_quantize_b_neon:
//  - zbin and round are halved (with rounding) before use;
//  - the quant_shift product is shifted right by 15 instead of 16;
//  - dqcoeff values are divided by 2, rounding towards zero.
//
// Exactly 32 * 32 coefficients are processed in groups of eight; |n_coeffs|
// is ignored. The first group loads eight entries from each parameter table
// (so each table must have eight readable entries); every later group uses
// entry [1] of each table broadcast to all lanes.
//
// Outputs:
//   qcoeff_ptr  - quantized coefficients.
//   dqcoeff_ptr - dequantized coefficients.
//   eob_ptr     - the largest (iscan[i] + 1) over all positions i whose
//                 quantized value is non-zero, or 0 if every value is zero.
//
// |skip_block| must be zero (asserted); |scan| is unused.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               int skip_block, const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  const int16x8_t one = vdupq_n_s16(1);
  const int16x8_t neg_one = vdupq_n_s16(-1);
  uint16x8_t eob_max;
  int i;
  (void)scan;
  (void)n_coeffs;  // Because we will always calculate 32*32.
  (void)skip_block;
  assert(!skip_block);

  // Process first 8 values which include a dc component.
  {
    // Only the first element of each vector is DC.
    const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
    const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
    const int16x8_t quant = vld1q_s16(quant_ptr);
    const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
    const int16x8_t dequant = vld1q_s16(dequant_ptr);
    // Add one because the eob does not index from 0.
    const uint16x8_t v_iscan =
        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

    const int16x8_t qcoeff =
        quantize_b_32x32_8_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
                                round, quant, quant_shift, dequant);

    // vtst against -1 yields all ones for non-zero lanes; use that to keep
    // only the scan positions of non-zero coefficients.
    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);

    coeff_ptr += 8;
    iscan += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
  }

  // Remaining groups use entry [1] of every table in all lanes.
  {
    const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
    const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);

    // Group 0 was handled above, so start at group 1.
    for (i = 1; i < 32 * 32 / 8; ++i) {
      // Add one because the eob is not its index.
      const uint16x8_t v_iscan =
          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));

      const int16x8_t qcoeff =
          quantize_b_32x32_8_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
                                  round, quant, quant_shift, dequant);

      // Keep a running per-lane maximum of the non-zero scan positions.
      eob_max =
          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));

      coeff_ptr += 8;
      iscan += 8;
      qcoeff_ptr += 8;
      dqcoeff_ptr += 8;
    }
  }

  // Reduce the eight per-lane maxima to a single eob value.
#ifdef __aarch64__
  *eob_ptr = vmaxvq_u16(eob_max);
#else
  {
    const uint16x4_t eob_max_0 =
        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
    vst1_lane_u16(eob_ptr, eob_max_2, 0);
  }
#endif  // __aarch64__
}
318