1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 #include <assert.h>
14 #include <math.h>
15 
16 #include "av1/common/txb_common.h"
17 #include "av1/encoder/encodetxb.h"
18 #include "av1/common/arm/mem_neon.h"
19 
// Fills the padded `levels` buffer with the saturated magnitudes of `coeff`.
//
// Each 32-bit coefficient is narrowed with saturation to 16 bits, made
// non-negative with a saturating absolute value and narrowed with saturation
// once more to 8 bits, so every stored level lies in [0, 127].
//
// coeff:  width * height coefficients, read contiguously row by row.
// width:  coefficients per row. 4 and 8 have dedicated paths; any other
//         width is processed 16 coefficients at a time.
//         NOTE(review): presumably a multiple of 16 in that case - confirm.
// height: number of rows. The width == 4 path consumes two rows per
//         iteration. NOTE(review): presumably always even there - confirm.
// levels: first level of the block inside a padded buffer whose rows are
//         width + TX_PAD_HOR bytes apart. TX_PAD_TOP rows before `levels`
//         and TX_PAD_BOTTOM rows plus TX_PAD_END bytes after the last row
//         are cleared here, so that memory must be writable.
void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
                              const int height, uint8_t *const levels) {
  const int stride = width + TX_PAD_HOR;
  // Clear the padding rows above the block ...
  memset(levels - TX_PAD_TOP * stride, 0,
         sizeof(*levels) * TX_PAD_TOP * stride);
  // ... and the padding rows (plus the trailing bytes) below it.
  memset(levels + stride * height, 0,
         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));

  const int32x4_t zeros = vdupq_n_s32(0);
  int i = 0;
  uint8_t *ls = levels;
  const tran_low_t *cf = coeff;
  if (width == 4) {
    // Two rows per iteration: one 16-byte store holds
    // [row A (4 bytes), 4 zero bytes, row B (4 bytes), 4 zero bytes].
    do {
      const int32x4_t coeffA = vld1q_s32(cf);
      const int32x4_t coeffB = vld1q_s32(cf + width);
      const int16x8_t coeffAB =
          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
      const int16x8_t absAB = vqabsq_s16(coeffAB);
      const int8x8_t absABs = vqmovn_s16(absAB);
#if defined(__aarch64__)
      // 32-bit lanes [A, B, 0, 0] interleaved with zeros give [A, 0, B, 0].
      const int8x16_t absAB8 =
          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
      const uint8x16_t lsAB =
          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
#else
      // Same interleave built from two 64-bit halves: [A, 0] and [B, 0].
      const int32x2x2_t absAB8 =
          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
      const uint8x16_t lsAB =
          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
#endif
      vst1q_u8(ls, lsAB);
      ls += (stride << 1);
      cf += (width << 1);
      i += 2;
    } while (i < height);
  } else if (width == 8) {
    // One row per iteration: 8 levels followed by 8 zero bytes. The zeros
    // clear the horizontal padding; any that reach past it land in the next
    // row, which is written afterwards, or in the already-cleared bottom
    // padding.
    do {
      const int32x4_t coeffA = vld1q_s32(cf);
      const int32x4_t coeffB = vld1q_s32(cf + 4);
      const int16x8_t coeffAB =
          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
      const int16x8_t absAB = vqabsq_s16(coeffAB);
      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
      vst1q_u8(ls, absAB8);
      ls += stride;
      cf += width;
      i += 1;
    } while (i < height);
  } else {
    do {
      int j = 0;
      // 16 coefficients (four 32-bit vectors) become one 16-byte store.
      do {
        const int32x4_t coeffA = vld1q_s32(cf);
        const int32x4_t coeffB = vld1q_s32(cf + 4);
        const int32x4_t coeffC = vld1q_s32(cf + 8);
        const int32x4_t coeffD = vld1q_s32(cf + 12);
        const int16x8_t coeffAB =
            vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
        const int16x8_t coeffCD =
            vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
        const int16x8_t absAB = vqabsq_s16(coeffAB);
        const int16x8_t absCD = vqabsq_s16(coeffCD);
        const uint8x16_t absABCD = vreinterpretq_u8_s8(
            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
        vst1q_u8((ls + j), absABCD);
        j += 16;
        cf += 16;
      } while (j < width);
      // Zero the four bytes that follow the row. memset() is used instead of
      // a store through an int32_t pointer: `levels` is a uint8_t buffer, so
      // such a store would break the effective-type rules and could be
      // misaligned.
      memset(ls + width, 0, sizeof(int32_t));
      ls += stride;
      i += 1;
    } while (i < height);
  }
}
96 
97 // get_4_nz_map_contexts_2d coefficients:
98 static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
99   { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
100   { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
101 };
102 
// Position offsets used by get_4_nz_map_contexts_hor(), packed into one
// 32-bit word: from the lowest byte upwards the bytes are
// SIG_COEF_CONTEXTS_2D + 0, + 5, + 10 and + 10.
/* clang-format off */
#define SIG_COEF_CONTEXTS_2D_X4_051010        \
  (((SIG_COEF_CONTEXTS_2D + 0) << 0) +        \
   ((SIG_COEF_CONTEXTS_2D + 5) << 8) +        \
   ((SIG_COEF_CONTEXTS_2D + 10) << 16) +      \
   ((SIG_COEF_CONTEXTS_2D + 10) << 24))
/* clang-format on */
109 
110 // get_4_nz_map_contexts_ver coefficients:
111 static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
112   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
113   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
114   SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
115   SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
116   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
117   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
118   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
119   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
120 };
121 
122 // get_8_coeff_contexts_2d coefficients:
123 // if (height == 8)
124 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
125   { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
126   { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
127 };
128 // if (height < 8)
129 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
130   { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
131   { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
132 };
133 
134 // if (height > 8)
135 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
136   { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
137   { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
138 };
139 
140 // get_4_nz_map_contexts_ver coefficients:
141 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
142   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
143   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
144   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
145   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
146   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
147   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
148   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
149   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
150 };
151 
152 // get_16n_coeff_contexts_2d coefficients:
153 // real_width == real_height
154 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
155   { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
156   { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
157   { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
158   { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
159 };
160 
161 // real_width > real_height
162 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
163   { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
164   { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
165   { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
166 };
167 
168 // real_width < real_height
169 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
170   { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
171   { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
172   { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
173 };
174 
175 // get_16n_coeff_contexts_hor coefficients:
176 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
177   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
178   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
179   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
180   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
181   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
182   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
183   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
184   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
185 };
186 
187 // end of coefficients declaration area
188 
// Gathers four rows of four bytes, `byte_stride` bytes apart, into a single
// 16-byte vector; row r ends up in 32-bit lane r.
//
// src:         address of the first byte of row 0.
// byte_stride: distance in bytes between the starts of consecutive rows.
static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
                                                const int byte_stride) {
#ifdef __aarch64__
  // The first load fetches 16 bytes starting at src; only lane 0 (row 0)
  // survives, lanes 1-3 are replaced by the following rows.
  // NOTE(review): callers pass addresses such as &src[1], so these pointers
  // are not necessarily 4-byte aligned; the casts rely on the load
  // instructions accepting unaligned addresses - confirm for the target.
  // The casts keep the const qualifier of `src`.
  uint32x4_t v_data = vld1q_u32((const uint32_t *)src);
  v_data =
      vld1q_lane_u32((const uint32_t *)(src + 1 * byte_stride), v_data, 1);
  v_data =
      vld1q_lane_u32((const uint32_t *)(src + 2 * byte_stride), v_data, 2);
  v_data =
      vld1q_lane_u32((const uint32_t *)(src + 3 * byte_stride), v_data, 3);

  return vreinterpretq_u8_u32(v_data);
#else
  // Presumably performs the same 4x4 gather without the pointer casts -
  // see load_unaligned_u8q() in mem_neon.h.
  return load_unaligned_u8q(src, byte_stride);
#endif
}
202 
// Gathers two rows of eight bytes, `byte_stride` bytes apart, into a single
// 16-byte vector: row 0 in the low half, row 1 in the high half.
//
// src:         address of the first byte of row 0.
// byte_stride: distance in bytes between the starts of the two rows.
static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
                                                const int byte_stride) {
#ifdef __aarch64__
  // The first load fetches 16 bytes starting at src; the upper 64-bit lane
  // is then replaced by the second row.
  // NOTE(review): callers pass addresses such as &src[1], so these pointers
  // are not necessarily 8-byte aligned; the casts rely on the load
  // instructions accepting unaligned addresses - confirm for the target.
  // The casts keep the const qualifier of `src`.
  uint64x2_t v_data = vld1q_u64((const uint64_t *)src);
  v_data =
      vld1q_lane_u64((const uint64_t *)(src + 1 * byte_stride), v_data, 1);

  return vreinterpretq_u8_u64(v_data);
#else
  uint8x8_t v_data_low = vld1_u8(src);
  uint8x8_t v_data_high = vld1_u8(src + byte_stride);

  return vcombine_u8(v_data_low, v_data_high);
#endif
}
217 
// Loads one row of 16 bytes starting at `src`. `byte_stride` is accepted
// only so that the signature matches the 4x4 and 8x2 loaders; it is unused.
static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
                                                 const int byte_stride) {
  const uint8x16_t row = vld1q_u8(src);
  (void)byte_stride;
  return row;
}
223 
// Loads the five neighbour vectors of a 4x4 group of positions.
//
// src:     first position of the group.
// stride:  distance in bytes between rows of `src`.
// offsets: three offsets relative to each position, chosen by the caller.
// level:   output; level[0] is the neighbour at +1, level[1] the neighbour
//          at +stride, level[2..4] the neighbours at offsets[0..2].
static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  level[0] = load_8bit_4x4_to_1_reg(src + 1, stride);
  level[1] = load_8bit_4x4_to_1_reg(src + stride, stride);
  for (int k = 0; k < 3; ++k) {
    level[2 + k] = load_8bit_4x4_to_1_reg(src + offsets[k], stride);
  }
}
233 
// Loads the five neighbour vectors of an 8x2 group of positions.
//
// src:     first position of the group.
// stride:  distance in bytes between rows of `src`.
// offsets: three offsets relative to each position, chosen by the caller.
// level:   output; level[0] is the neighbour at +1, level[1] the neighbour
//          at +stride, level[2..4] the neighbours at offsets[0..2].
static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
                                     const ptrdiff_t *const offsets,
                                     uint8x16_t *const level) {
  level[0] = load_8bit_8x2_to_1_reg(src + 1, stride);
  level[1] = load_8bit_8x2_to_1_reg(src + stride, stride);
  for (int k = 0; k < 3; ++k) {
    level[2 + k] = load_8bit_8x2_to_1_reg(src + offsets[k], stride);
  }
}
243 
// Loads the five neighbour vectors of a 16x1 group of positions.
//
// src:     first position of the group.
// stride:  distance in bytes between rows of `src`.
// offsets: three offsets relative to each position, chosen by the caller.
// level:   output; level[0] is the neighbour at +1, level[1] the neighbour
//          at +stride, level[2..4] the neighbours at offsets[0..2].
static INLINE void load_levels_16x1x5(const uint8_t *const src,
                                      const int stride,
                                      const ptrdiff_t *const offsets,
                                      uint8x16_t *const level) {
  level[0] = load_8bit_16x1_to_1_reg(src + 1, stride);
  level[1] = load_8bit_16x1_to_1_reg(src + stride, stride);
  for (int k = 0; k < 3; ++k) {
    level[2 + k] = load_8bit_16x1_to_1_reg(src + offsets[k], stride);
  }
}
254 
// Computes, per byte lane, min(4, (sum + 1) >> 1) where sum is the total of
// the five neighbour levels after each has been clamped to 3.
//
// level: five neighbour vectors. level[1..4] are overwritten with their
//        clamped values; level[0] is left untouched.
// Returns the per-lane result, each lane in [0, 4].
static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
  const uint8x16_t level_cap = vdupq_n_u8(3);
  const uint8x16_t result_cap = vdupq_n_u8(4);

  uint8x16_t sum = vminq_u8(level[0], level_cap);
  for (int k = 1; k < 5; ++k) {
    level[k] = vminq_u8(level[k], level_cap);
    sum = vaddq_u8(sum, level[k]);
  }

  // Rounding halving of the sum (at most 15), then clamp to 4.
  return vminq_u8(vrshrq_n_u8(sum, 1), result_cap);
}
274 
// Writes the TX_CLASS_2D contexts of a 4-wide block, four rows (16
// positions) per iteration.
//
// levels:         first level of the block; rows are 4 + TX_PAD_HOR bytes
//                 apart.
// height:         number of rows; must be a multiple of 4 (asserted).
// offsets:        three neighbour offsets forwarded to load_levels_4x4x5().
// coeff_contexts: receives 4 * height contexts, 16 per store.
//
// The context of position 0 is forced to 0 once all rows are written.
static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *const coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  // Offsets for every group after the first one.
  const uint8x16_t po_rest = vdupq_n_u8(21);
  // Offsets for the first four rows; the table row depends on the height.
  uint8x16_t po = vld1q_u8(c_4_po_2d[(height == 4) ? 0 : 1]);
  uint8x16_t neighbours[5];
  uint8_t *out = coeff_contexts;

  assert(!(height % 4));

  int rows_left = height;
  do {
    load_levels_4x4x5(levels, stride, offsets, neighbours);
    const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
    vst1q_u8(out, ctx);
    po = po_rest;
    levels += 4 * stride;
    out += 16;
    rows_left -= 4;
  } while (rows_left != 0);

  coeff_contexts[0] = 0;
}
305 
// Writes the TX_CLASS_HORIZ contexts of a 4-wide block, four rows (16
// positions) per iteration.
//
// levels:         first level of the block; rows are 4 + TX_PAD_HOR bytes
//                 apart.
// height:         number of rows; must be a multiple of 4 (asserted).
// offsets:        three neighbour offsets forwarded to load_levels_4x4x5().
// coeff_contexts: receives 4 * height contexts, 16 per store.
static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  // The same packed 32-bit word of offsets is replicated into all four
  // lanes, i.e. applied to every row.
  const uint32x4_t po_words = vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010);
  const uint8x16_t po = vreinterpretq_u8_u32(po_words);
  uint8x16_t neighbours[5];

  assert(!(height % 4));

  int rows_left = height;
  do {
    load_levels_4x4x5(levels, stride, offsets, neighbours);
    const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
    vst1q_u8(coeff_contexts, ctx);
    levels += 4 * stride;
    coeff_contexts += 16;
    rows_left -= 4;
  } while (rows_left != 0);
}
331 
// Writes the TX_CLASS_VERT contexts of a 4-wide block, four rows (16
// positions) per iteration.
//
// levels:         first level of the block; rows are 4 + TX_PAD_HOR bytes
//                 apart.
// height:         number of rows; must be a multiple of 4 (asserted).
// offsets:        three neighbour offsets forwarded to load_levels_4x4x5().
// coeff_contexts: receives 4 * height contexts, 16 per store.
static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = 4 + TX_PAD_HOR;
  // Offsets for every group after the first one.
  const uint8x16_t po_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  // Row-dependent offsets for the first four rows.
  uint8x16_t po = vld1q_u8(c_4_po_ver);
  uint8x16_t neighbours[5];

  assert(!(height % 4));

  int rows_left = height;
  do {
    load_levels_4x4x5(levels, stride, offsets, neighbours);
    const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
    vst1q_u8(coeff_contexts, ctx);
    po = po_rest;
    levels += 4 * stride;
    coeff_contexts += 16;
    rows_left -= 4;
  } while (rows_left != 0);
}
358 
// Writes the TX_CLASS_2D contexts of an 8-wide block, two rows (16
// positions) per iteration.
//
// levels:         first level of the block; rows are 8 + TX_PAD_HOR bytes
//                 apart.
// height:         number of rows; must be even (asserted).
// offsets:        three neighbour offsets forwarded to load_levels_8x2x5().
// coeff_contexts: receives 8 * height contexts, 16 per store.
//
// The context of position 0 is forced to 0 once all rows are written.
static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
                                           const int height,
                                           const ptrdiff_t *const offsets,
                                           uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  const uint8_t(*table)[16];
  uint8x16_t neighbours[5];
  uint8_t *out = coeff_contexts;

  assert(!(height % 2));

  // The offsets of the first four rows depend on how the height compares
  // with the width (8).
  if (height == 8) {
    table = c_8_po_2d_8;
  } else if (height < 8) {
    table = c_8_po_2d_l;
  } else {
    table = c_8_po_2d_g;
  }
  uint8x16_t po_current = vld1q_u8(table[0]);  // rows 0-1
  uint8x16_t po_next = vld1q_u8(table[1]);     // rows 2-3
  const uint8x16_t po_rest = vdupq_n_u8(21);   // every later row

  int rows_left = height;
  do {
    load_levels_8x2x5(levels, stride, offsets, neighbours);
    const uint8x16_t ctx =
        vaddq_u8(get_coeff_contexts_kernel(neighbours), po_current);
    vst1q_u8(out, ctx);
    po_current = po_next;
    po_next = po_rest;
    levels += 2 * stride;
    out += 16;
    rows_left -= 2;
  } while (rows_left != 0);

  coeff_contexts[0] = 0;
}
398 
// Writes the TX_CLASS_HORIZ contexts of an 8-wide block, two rows (16
// positions) per iteration.
//
// levels:         first level of the block; rows are 8 + TX_PAD_HOR bytes
//                 apart.
// height:         number of rows; must be even (asserted).
// offsets:        three neighbour offsets forwarded to load_levels_8x2x5().
// coeff_contexts: receives 8 * height contexts, 16 per store.
static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  // Column-dependent offsets, identical for every pair of rows.
  const uint8x16_t po = vld1q_u8(c_8_po_hor);
  uint8x16_t neighbours[5];

  assert(!(height % 2));

  int rows_left = height;
  do {
    load_levels_8x2x5(levels, stride, offsets, neighbours);
    const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
    vst1q_u8(coeff_contexts, ctx);
    levels += 2 * stride;
    coeff_contexts += 16;
    rows_left -= 2;
  } while (rows_left != 0);
}
423 
// Writes the TX_CLASS_VERT contexts of an 8-wide block, two rows (16
// positions) per iteration.
//
// levels:         first level of the block; rows are 8 + TX_PAD_HOR bytes
//                 apart.
// height:         number of rows; must be even (asserted).
// offsets:        three neighbour offsets forwarded to load_levels_8x2x5().
// coeff_contexts: receives 8 * height contexts, 16 per store.
static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
  const int stride = 8 + TX_PAD_HOR;
  // First pair of rows: row 0 gets + 0, row 1 gets + 5.
  const uint8x8_t po_row0 = vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0);
  const uint8x8_t po_row1 = vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5);
  uint8x16_t po = vcombine_u8(po_row0, po_row1);
  // Every later row gets + 10.
  const uint8x16_t po_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  uint8x16_t neighbours[5];

  assert(!(height % 2));

  int rows_left = height;
  do {
    load_levels_8x2x5(levels, stride, offsets, neighbours);
    const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
    vst1q_u8(coeff_contexts, ctx);
    po = po_rest;
    levels += 2 * stride;
    coeff_contexts += 16;
    rows_left -= 2;
  } while (rows_left != 0);
}
451 
// Writes the TX_CLASS_2D contexts of a block whose width is a multiple of
// 16, one 16-position chunk per inner iteration.
//
// levels:         first level of the block; rows are width + TX_PAD_HOR
//                 bytes apart.
// real_width,
// real_height:    only compared with each other, to pick the offset tables.
// width:          processed columns per row; multiple of 16 (asserted).
// height:         processed rows.
// offsets:        three neighbour offsets forwarded to load_levels_16x1x5().
// coeff_contexts: receives width * height contexts, 16 per store.
//
// The context of position 0 is forced to 0 once all rows are written.
static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
                                             const int real_width,
                                             const int real_height,
                                             const int width, const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
  const int stride = width + TX_PAD_HOR;
  const uint8x16_t po_21 = vdupq_n_u8(21);
  // first_chunk[k]: offsets for columns 0-15 of the row k rows ahead.
  // later_chunks[k]: offsets for all remaining columns of that row.
  // Both queues are shifted by one entry after each row; their last entry
  // never changes and therefore serves every remaining row.
  uint8x16_t first_chunk[5];
  uint8x16_t later_chunks[3];
  uint8x16_t neighbours[5];
  uint8_t *out = coeff_contexts;

  assert(!(width % 16));

  later_chunks[2] = po_21;
  if (real_width == real_height) {
    for (int k = 0; k < 4; ++k) first_chunk[k] = vld1q_u8(c_16_po_2d_e[k]);
    first_chunk[4] = po_21;
    later_chunks[0] = po_21;
    later_chunks[1] = po_21;
  } else if (real_width > real_height) {
    const uint8x16_t po_tail = vld1q_u8(c_16_po_2d_g[2]);
    first_chunk[0] = vld1q_u8(c_16_po_2d_g[0]);
    first_chunk[1] = vld1q_u8(c_16_po_2d_g[1]);
    first_chunk[2] = po_tail;
    first_chunk[3] = po_tail;
    first_chunk[4] = po_tail;
    later_chunks[0] = po_21;
    later_chunks[1] = po_21;
  } else {  // real_width < real_height
    const uint8x16_t po_head = vld1q_u8(c_16_po_2d_l[0]);
    const uint8x16_t po_11 = vdupq_n_u8(11);
    first_chunk[0] = po_head;
    first_chunk[1] = po_head;
    first_chunk[2] = vld1q_u8(c_16_po_2d_l[1]);
    first_chunk[3] = vld1q_u8(c_16_po_2d_l[2]);
    first_chunk[4] = po_21;
    later_chunks[0] = po_11;
    later_chunks[1] = po_11;
  }

  int rows_left = height;
  do {
    uint8x16_t po = first_chunk[0];
    int cols_left = width;

    do {
      load_levels_16x1x5(levels, stride, offsets, neighbours);
      const uint8x16_t ctx =
          vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
      vst1q_u8(out, ctx);
      // Every chunk after the first one of the row uses the other offsets.
      po = later_chunks[0];
      levels += 16;
      out += 16;
      cols_left -= 16;
    } while (cols_left != 0);

    // Advance both queues to the next row.
    for (int k = 0; k < 4; ++k) first_chunk[k] = first_chunk[k + 1];
    for (int k = 0; k < 2; ++k) later_chunks[k] = later_chunks[k + 1];
    // Skip the horizontal padding at the end of the row.
    levels += TX_PAD_HOR;
  } while (--rows_left);

  coeff_contexts[0] = 0;
}
515 
// Writes the TX_CLASS_HORIZ contexts of a block whose width is a multiple
// of 16, one 16-position chunk per inner iteration.
//
// levels:         first level of the block; rows are width + TX_PAD_HOR
//                 bytes apart.
// width:          columns per row; multiple of 16 (asserted).
// height:         number of rows.
// offsets:        three neighbour offsets forwarded to load_levels_16x1x5().
// coeff_contexts: receives width * height contexts, 16 per store.
static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
  const int stride = width + TX_PAD_HOR;
  // Offsets for columns 16 and up.
  const uint8x16_t po_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  uint8x16_t neighbours[5];

  assert(!(width % 16));

  int rows_left = height;
  do {
    // Each row starts again with the column-dependent offsets.
    uint8x16_t po = vld1q_u8(c_16_po_hor);
    int cols_left = width;

    do {
      load_levels_16x1x5(levels, stride, offsets, neighbours);
      const uint8x16_t ctx =
          vaddq_u8(get_coeff_contexts_kernel(neighbours), po);
      vst1q_u8(coeff_contexts, ctx);
      po = po_rest;
      levels += 16;
      coeff_contexts += 16;
      cols_left -= 16;
    } while (cols_left != 0);

    // Skip the horizontal padding at the end of the row.
    levels += TX_PAD_HOR;
  } while (--rows_left);
}
548 
// Writes the TX_CLASS_VERT contexts of a block whose width is a multiple of
// 16, one 16-position chunk per inner iteration.
//
// levels:         first level of the block; rows are width + TX_PAD_HOR
//                 bytes apart.
// width:          columns per row; multiple of 16 (asserted).
// height:         number of rows.
// offsets:        three neighbour offsets forwarded to load_levels_16x1x5().
// coeff_contexts: receives width * height contexts, 16 per store.
static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
  const int stride = width + TX_PAD_HOR;
  // Row 0 uses + 0, row 1 uses + 5 and every later row uses + 10.
  uint8x16_t po_row = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
  uint8x16_t po_next_row = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
  const uint8x16_t po_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
  uint8x16_t neighbours[5];

  assert(!(width % 16));

  int rows_left = height;
  do {
    int cols_left = width;
    do {
      load_levels_16x1x5(levels, stride, offsets, neighbours);
      const uint8x16_t ctx =
          vaddq_u8(get_coeff_contexts_kernel(neighbours), po_row);
      vst1q_u8(coeff_contexts, ctx);
      levels += 16;
      coeff_contexts += 16;
      cols_left -= 16;
    } while (cols_left != 0);

    po_row = po_next_row;
    po_next_row = po_rest;
    // Skip the horizontal padding at the end of the row.
    levels += TX_PAD_HOR;
  } while (--rows_left);
}
583 
// Computes the context of every coefficient position of a transform block
// and then overrides the context of the last coefficient (in scan order).
//
// Note: levels[] must be in the range [0, 127], inclusive.
//
// levels:         first level of the block inside its padded buffer; rows
//                 are get_txb_wide(tx_size) + TX_PAD_HOR bytes apart.
// scan:           scan order; only scan[eob - 1] is read.
// eob:            end-of-block count. With eob == 1 only coeff_contexts[0]
//                 is written (set to 0).
//                 NOTE(review): eob == 0 would make last_idx -1 and index
//                 scan[-1]; callers presumably guarantee eob >= 1 - confirm.
// tx_size:        transform size; selects the block dimensions.
// tx_class:       TX_CLASS_2D, TX_CLASS_HORIZ, or anything else, which is
//                 handled as TX_CLASS_VERT.
// coeff_contexts: output, one context per position; must be 16-byte
//                 aligned (asserted).
void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
                                  const int16_t *const scan, const uint16_t eob,
                                  const TX_SIZE tx_size,
                                  const TX_CLASS tx_class,
                                  int8_t *const coeff_contexts) {
  const int last_idx = eob - 1;
  if (last_idx == 0) {
    coeff_contexts[0] = 0;
    return;
  }

  // The helpers store through unsigned byte vectors.
  uint8_t *const out = (uint8_t *)coeff_contexts;

  const int real_width = tx_size_wide[tx_size];
  const int real_height = tx_size_high[tx_size];
  const int width = get_txb_wide(tx_size);
  const int height = get_txb_high(tx_size);
  const int stride = width + TX_PAD_HOR;
  // Offsets of the three class-specific neighbours; the neighbours at +1
  // and +stride are always loaded by the load_levels_*() helpers.
  ptrdiff_t offsets[3];

  /* coeff_contexts must be 16 byte aligned. */
  assert(!((intptr_t)coeff_contexts & 0xf));

  switch (tx_class) {
    case TX_CLASS_2D:
      offsets[0] = 0 * stride + 2;
      offsets[1] = 1 * stride + 1;
      offsets[2] = 2 * stride + 0;
      if (width == 4) {
        get_4_nz_map_contexts_2d(levels, height, offsets, out);
      } else if (width == 8) {
        get_8_coeff_contexts_2d(levels, height, offsets, out);
      } else {
        get_16n_coeff_contexts_2d(levels, real_width, real_height, width,
                                  height, offsets, out);
      }
      break;
    case TX_CLASS_HORIZ:
      offsets[0] = 2;
      offsets[1] = 3;
      offsets[2] = 4;
      if (width == 4) {
        get_4_nz_map_contexts_hor(levels, height, offsets, out);
      } else if (width == 8) {
        get_8_coeff_contexts_hor(levels, height, offsets, out);
      } else {
        get_16n_coeff_contexts_hor(levels, width, height, offsets, out);
      }
      break;
    default:  // TX_CLASS_VERT
      offsets[0] = 2 * stride;
      offsets[1] = 3 * stride;
      offsets[2] = 4 * stride;
      if (width == 4) {
        get_4_nz_map_contexts_ver(levels, height, offsets, out);
      } else if (width == 8) {
        get_8_coeff_contexts_ver(levels, height, offsets, out);
      } else {
        get_16n_coeff_contexts_ver(levels, width, height, offsets, out);
      }
      break;
  }

  // The last coefficient gets context 1, 2 or 3, depending on whether its
  // scan index is within 1/8, within 1/4, or beyond 1/4 of (height << bwl).
  const int span = height << get_txb_bwl(tx_size);
  int8_t last_ctx;
  if (last_idx <= span / 8) {
    last_ctx = 1;
  } else if (last_idx <= span / 4) {
    last_ctx = 2;
  } else {
    last_ctx = 3;
  }
  coeff_contexts[scan[last_idx]] = last_ctx;
}
654