1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10  */
11 
#include <assert.h>
#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <string.h>
#include "EbDefinitions.h"
#include "EbCabacContextModel.h"
#include "EbCommonUtils.h"
#include "EbFullLoop.h"
19 
loadh_epi64(const void * const src,const __m128i s)20 static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
21     return _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
22 }
23 
load_8bit_4x4_to_1_reg_sse2(const void * const src,const int32_t byte_stride)24 static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
25                                                   const int32_t     byte_stride) {
26     return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
27                           *(const int32_t *)((int8_t *)src + 1 * byte_stride),
28                           *(const int32_t *)((int8_t *)src + 2 * byte_stride),
29                           *(const int32_t *)((int8_t *)src + 3 * byte_stride));
30 }
31 
load_8bit_8x2_to_1_reg_sse2(const void * const src,const int32_t byte_stride)32 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
33                                                   const int32_t     byte_stride) {
34     __m128i dst;
35     dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
36     dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
37     return dst;
38 }
39 
load_levels_4x4x5_sse2(const uint8_t * const src,const int32_t stride,const ptrdiff_t * const offsets,__m128i * const level)40 static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, const int32_t stride,
41                                           const ptrdiff_t *const offsets, __m128i *const level) {
42     level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
43     level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
44     level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
45     level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
46     level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
47 }
48 
load_levels_8x2x5_sse2(const uint8_t * const src,const int32_t stride,const ptrdiff_t * const offsets,__m128i * const level)49 static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, const int32_t stride,
50                                           const ptrdiff_t *const offsets, __m128i *const level) {
51     level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
52     level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
53     level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
54     level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
55     level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
56 }
57 
load_levels_16x1x5_sse2(const uint8_t * const src,const int32_t stride,const ptrdiff_t * const offsets,__m128i * const level)58 static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, const int32_t stride,
59                                            const ptrdiff_t *const offsets, __m128i *const level) {
60     level[0] = _mm_loadu_si128((__m128i *)(src + 1));
61     level[1] = _mm_loadu_si128((__m128i *)(src + stride));
62     level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
63     level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
64     level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
65 }
66 
get_coeff_contexts_kernel_sse2(__m128i * const level)67 static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
68     const __m128i const_3 = _mm_set1_epi8(3);
69     const __m128i const_4 = _mm_set1_epi8(4);
70     __m128i       count;
71 
72     count    = _mm_min_epu8(level[0], const_3);
73     level[1] = _mm_min_epu8(level[1], const_3);
74     level[2] = _mm_min_epu8(level[2], const_3);
75     level[3] = _mm_min_epu8(level[3], const_3);
76     level[4] = _mm_min_epu8(level[4], const_3);
77     count    = _mm_add_epi8(count, level[1]);
78     count    = _mm_add_epi8(count, level[2]);
79     count    = _mm_add_epi8(count, level[3]);
80     count    = _mm_add_epi8(count, level[4]);
81     count    = _mm_avg_epu8(count, _mm_setzero_si128());
82     count    = _mm_min_epu8(count, const_4);
83     return count;
84 }
85 
get_4_nz_map_contexts_2d(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * const coeff_contexts)86 static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, const int32_t height,
87                                             const ptrdiff_t *const offsets,
88                                             int8_t *const          coeff_contexts) {
89     const int32_t stride              = 4 + TX_PAD_HOR;
90     const __m128i pos_to_offset_large = _mm_set1_epi8(21);
91     __m128i       pos_to_offset       = (height == 4)
92                     ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
93                     : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21);
94     __m128i       count;
95     __m128i       level[5];
96     int8_t *      cc  = coeff_contexts;
97     int32_t       row = height;
98 
99     assert(!(height % 4));
100 
101     do {
102         load_levels_4x4x5_sse2(levels, stride, offsets, level);
103         count = get_coeff_contexts_kernel_sse2(level);
104         count = _mm_add_epi8(count, pos_to_offset);
105         _mm_storeu_si128((__m128i *)cc, count);
106         pos_to_offset = pos_to_offset_large;
107         levels += 4 * stride;
108         cc += 16;
109         row -= 4;
110     } while (row);
111 
112     coeff_contexts[0] = 0;
113 }
114 
get_8_coeff_contexts_2d(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)115 static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, const int32_t height,
116                                            const ptrdiff_t *const offsets, int8_t *coeff_contexts) {
117     const int32_t stride = 8 + TX_PAD_HOR;
118     int8_t *      cc     = coeff_contexts;
119     int32_t       row    = height;
120     __m128i       count;
121     __m128i       level[5];
122     __m128i       pos_to_offset[3];
123 
124     assert(!(height % 2));
125 
126     if (height == 8) {
127         pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
128         pos_to_offset[1] = _mm_setr_epi8(
129             6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21);
130     } else if (height < 8) {
131         pos_to_offset[0] = _mm_setr_epi8(
132             0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21);
133         pos_to_offset[1] = _mm_setr_epi8(
134             16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21);
135     } else {
136         pos_to_offset[0] = _mm_setr_epi8(
137             0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
138         pos_to_offset[1] = _mm_setr_epi8(
139             6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21);
140     }
141     pos_to_offset[2] = _mm_set1_epi8(21);
142 
143     do {
144         load_levels_8x2x5_sse2(levels, stride, offsets, level);
145         count = get_coeff_contexts_kernel_sse2(level);
146         count = _mm_add_epi8(count, pos_to_offset[0]);
147         _mm_storeu_si128((__m128i *)cc, count);
148         pos_to_offset[0] = pos_to_offset[1];
149         pos_to_offset[1] = pos_to_offset[2];
150         levels += 2 * stride;
151         cc += 16;
152         row -= 2;
153     } while (row);
154 
155     coeff_contexts[0] = 0;
156 }
157 
get_16n_coeff_contexts_2d(const uint8_t * levels,const int32_t real_width,const int32_t real_height,const int32_t width,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)158 static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, const int32_t real_width,
159                                              const int32_t real_height, const int32_t width,
160                                              const int32_t height, const ptrdiff_t *const offsets,
161                                              int8_t *coeff_contexts) {
162     const int32_t stride = width + TX_PAD_HOR;
163     int8_t *      cc     = coeff_contexts;
164     int32_t       row    = height;
165     __m128i       pos_to_offset[5];
166     __m128i       pos_to_offset_large[3];
167     __m128i       count;
168     __m128i       level[5];
169 
170     assert(!(width % 16));
171 
172     pos_to_offset_large[2] = _mm_set1_epi8(21);
173     if (real_width == real_height) {
174         pos_to_offset[0] = _mm_setr_epi8(
175             0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
176         pos_to_offset[1] = _mm_setr_epi8(
177             1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
178         pos_to_offset[2] = _mm_setr_epi8(
179             6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
180         pos_to_offset[3] = _mm_setr_epi8(
181             6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
182         pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
183     } else if (real_width > real_height) {
184         pos_to_offset[0] = _mm_setr_epi8(
185             0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
186         pos_to_offset[1] = _mm_setr_epi8(
187             16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
188         pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
189             16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
190         pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
191     } else { // real_width < real_height
192         pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
193             11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
194         pos_to_offset[2] = _mm_setr_epi8(
195             6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
196         pos_to_offset[3] = _mm_setr_epi8(
197             6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
198         pos_to_offset[4]       = pos_to_offset_large[2];
199         pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
200     }
201 
202     do {
203         int32_t w = width;
204 
205         do {
206             load_levels_16x1x5_sse2(levels, stride, offsets, level);
207             count = get_coeff_contexts_kernel_sse2(level);
208             count = _mm_add_epi8(count, pos_to_offset[0]);
209             _mm_storeu_si128((__m128i *)cc, count);
210             levels += 16;
211             cc += 16;
212             w -= 16;
213             pos_to_offset[0] = pos_to_offset_large[0];
214         } while (w);
215 
216         pos_to_offset[0]       = pos_to_offset[1];
217         pos_to_offset[1]       = pos_to_offset[2];
218         pos_to_offset[2]       = pos_to_offset[3];
219         pos_to_offset[3]       = pos_to_offset[4];
220         pos_to_offset_large[0] = pos_to_offset_large[1];
221         pos_to_offset_large[1] = pos_to_offset_large[2];
222         levels += TX_PAD_HOR;
223     } while (--row);
224 
225     coeff_contexts[0] = 0;
226 }
227 
get_4_nz_map_contexts_hor(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)228 static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, const int32_t height,
229                                              const ptrdiff_t *const offsets,
230                                              int8_t *               coeff_contexts) {
231     const int32_t stride        = 4 + TX_PAD_HOR;
232     const __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
233                                                 SIG_COEF_CONTEXTS_2D + 5,
234                                                 SIG_COEF_CONTEXTS_2D + 10,
235                                                 SIG_COEF_CONTEXTS_2D + 10,
236                                                 SIG_COEF_CONTEXTS_2D + 0,
237                                                 SIG_COEF_CONTEXTS_2D + 5,
238                                                 SIG_COEF_CONTEXTS_2D + 10,
239                                                 SIG_COEF_CONTEXTS_2D + 10,
240                                                 SIG_COEF_CONTEXTS_2D + 0,
241                                                 SIG_COEF_CONTEXTS_2D + 5,
242                                                 SIG_COEF_CONTEXTS_2D + 10,
243                                                 SIG_COEF_CONTEXTS_2D + 10,
244                                                 SIG_COEF_CONTEXTS_2D + 0,
245                                                 SIG_COEF_CONTEXTS_2D + 5,
246                                                 SIG_COEF_CONTEXTS_2D + 10,
247                                                 SIG_COEF_CONTEXTS_2D + 10);
248     __m128i       count;
249     __m128i       level[5];
250     int32_t       row = height;
251 
252     assert(!(height % 4));
253 
254     do {
255         load_levels_4x4x5_sse2(levels, stride, offsets, level);
256         count = get_coeff_contexts_kernel_sse2(level);
257         count = _mm_add_epi8(count, pos_to_offset);
258         _mm_storeu_si128((__m128i *)coeff_contexts, count);
259         levels += 4 * stride;
260         coeff_contexts += 16;
261         row -= 4;
262     } while (row);
263 }
264 
get_4_nz_map_contexts_ver(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)265 static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, const int32_t height,
266                                              const ptrdiff_t *const offsets,
267                                              int8_t *               coeff_contexts) {
268     const int32_t stride              = 4 + TX_PAD_HOR;
269     const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
270     __m128i       pos_to_offset       = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
271                                           SIG_COEF_CONTEXTS_2D + 0,
272                                           SIG_COEF_CONTEXTS_2D + 0,
273                                           SIG_COEF_CONTEXTS_2D + 0,
274                                           SIG_COEF_CONTEXTS_2D + 5,
275                                           SIG_COEF_CONTEXTS_2D + 5,
276                                           SIG_COEF_CONTEXTS_2D + 5,
277                                           SIG_COEF_CONTEXTS_2D + 5,
278                                           SIG_COEF_CONTEXTS_2D + 10,
279                                           SIG_COEF_CONTEXTS_2D + 10,
280                                           SIG_COEF_CONTEXTS_2D + 10,
281                                           SIG_COEF_CONTEXTS_2D + 10,
282                                           SIG_COEF_CONTEXTS_2D + 10,
283                                           SIG_COEF_CONTEXTS_2D + 10,
284                                           SIG_COEF_CONTEXTS_2D + 10,
285                                           SIG_COEF_CONTEXTS_2D + 10);
286     __m128i       count;
287     __m128i       level[5];
288     int32_t       row = height;
289 
290     assert(!(height % 4));
291 
292     do {
293         load_levels_4x4x5_sse2(levels, stride, offsets, level);
294         count = get_coeff_contexts_kernel_sse2(level);
295         count = _mm_add_epi8(count, pos_to_offset);
296         _mm_storeu_si128((__m128i *)coeff_contexts, count);
297         pos_to_offset = pos_to_offset_large;
298         levels += 4 * stride;
299         coeff_contexts += 16;
300         row -= 4;
301     } while (row);
302 }
303 
get_8_coeff_contexts_hor(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)304 static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, const int32_t height,
305                                             const ptrdiff_t *const offsets,
306                                             int8_t *               coeff_contexts) {
307     const int32_t stride        = 8 + TX_PAD_HOR;
308     const __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
309                                                 SIG_COEF_CONTEXTS_2D + 5,
310                                                 SIG_COEF_CONTEXTS_2D + 10,
311                                                 SIG_COEF_CONTEXTS_2D + 10,
312                                                 SIG_COEF_CONTEXTS_2D + 10,
313                                                 SIG_COEF_CONTEXTS_2D + 10,
314                                                 SIG_COEF_CONTEXTS_2D + 10,
315                                                 SIG_COEF_CONTEXTS_2D + 10,
316                                                 SIG_COEF_CONTEXTS_2D + 0,
317                                                 SIG_COEF_CONTEXTS_2D + 5,
318                                                 SIG_COEF_CONTEXTS_2D + 10,
319                                                 SIG_COEF_CONTEXTS_2D + 10,
320                                                 SIG_COEF_CONTEXTS_2D + 10,
321                                                 SIG_COEF_CONTEXTS_2D + 10,
322                                                 SIG_COEF_CONTEXTS_2D + 10,
323                                                 SIG_COEF_CONTEXTS_2D + 10);
324     int32_t       row           = height;
325     __m128i       count;
326     __m128i       level[5];
327 
328     assert(!(height % 2));
329 
330     do {
331         load_levels_8x2x5_sse2(levels, stride, offsets, level);
332         count = get_coeff_contexts_kernel_sse2(level);
333         count = _mm_add_epi8(count, pos_to_offset);
334         _mm_storeu_si128((__m128i *)coeff_contexts, count);
335         levels += 2 * stride;
336         coeff_contexts += 16;
337         row -= 2;
338     } while (row);
339 }
340 
get_8_coeff_contexts_ver(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)341 static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, const int32_t height,
342                                             const ptrdiff_t *const offsets,
343                                             int8_t *               coeff_contexts) {
344     const int32_t stride              = 8 + TX_PAD_HOR;
345     const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
346     __m128i       pos_to_offset       = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
347                                           SIG_COEF_CONTEXTS_2D + 0,
348                                           SIG_COEF_CONTEXTS_2D + 0,
349                                           SIG_COEF_CONTEXTS_2D + 0,
350                                           SIG_COEF_CONTEXTS_2D + 0,
351                                           SIG_COEF_CONTEXTS_2D + 0,
352                                           SIG_COEF_CONTEXTS_2D + 0,
353                                           SIG_COEF_CONTEXTS_2D + 0,
354                                           SIG_COEF_CONTEXTS_2D + 5,
355                                           SIG_COEF_CONTEXTS_2D + 5,
356                                           SIG_COEF_CONTEXTS_2D + 5,
357                                           SIG_COEF_CONTEXTS_2D + 5,
358                                           SIG_COEF_CONTEXTS_2D + 5,
359                                           SIG_COEF_CONTEXTS_2D + 5,
360                                           SIG_COEF_CONTEXTS_2D + 5,
361                                           SIG_COEF_CONTEXTS_2D + 5);
362     int32_t       row                 = height;
363     __m128i       count;
364     __m128i       level[5];
365 
366     assert(!(height % 2));
367 
368     do {
369         load_levels_8x2x5_sse2(levels, stride, offsets, level);
370         count = get_coeff_contexts_kernel_sse2(level);
371         count = _mm_add_epi8(count, pos_to_offset);
372         _mm_storeu_si128((__m128i *)coeff_contexts, count);
373         pos_to_offset = pos_to_offset_large;
374         levels += 2 * stride;
375         coeff_contexts += 16;
376         row -= 2;
377     } while (row);
378 }
379 
get_16n_coeff_contexts_hor(const uint8_t * levels,const int32_t width,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)380 static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, const int32_t width,
381                                               const int32_t height, const ptrdiff_t *const offsets,
382                                               int8_t *coeff_contexts) {
383     const int32_t stride              = width + TX_PAD_HOR;
384     const __m128i pos_to_offset_large = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10,
385                                                       SIG_COEF_CONTEXTS_2D + 10,
386                                                       SIG_COEF_CONTEXTS_2D + 10,
387                                                       SIG_COEF_CONTEXTS_2D + 10,
388                                                       SIG_COEF_CONTEXTS_2D + 10,
389                                                       SIG_COEF_CONTEXTS_2D + 10,
390                                                       SIG_COEF_CONTEXTS_2D + 10,
391                                                       SIG_COEF_CONTEXTS_2D + 10,
392                                                       SIG_COEF_CONTEXTS_2D + 10,
393                                                       SIG_COEF_CONTEXTS_2D + 10,
394                                                       SIG_COEF_CONTEXTS_2D + 10,
395                                                       SIG_COEF_CONTEXTS_2D + 10,
396                                                       SIG_COEF_CONTEXTS_2D + 10,
397                                                       SIG_COEF_CONTEXTS_2D + 10,
398                                                       SIG_COEF_CONTEXTS_2D + 10,
399                                                       SIG_COEF_CONTEXTS_2D + 10);
400     __m128i       count;
401     __m128i       level[5];
402     int32_t       row = height;
403 
404     assert(!(width % 16));
405 
406     do {
407         __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
408                                               SIG_COEF_CONTEXTS_2D + 5,
409                                               SIG_COEF_CONTEXTS_2D + 10,
410                                               SIG_COEF_CONTEXTS_2D + 10,
411                                               SIG_COEF_CONTEXTS_2D + 10,
412                                               SIG_COEF_CONTEXTS_2D + 10,
413                                               SIG_COEF_CONTEXTS_2D + 10,
414                                               SIG_COEF_CONTEXTS_2D + 10,
415                                               SIG_COEF_CONTEXTS_2D + 10,
416                                               SIG_COEF_CONTEXTS_2D + 10,
417                                               SIG_COEF_CONTEXTS_2D + 10,
418                                               SIG_COEF_CONTEXTS_2D + 10,
419                                               SIG_COEF_CONTEXTS_2D + 10,
420                                               SIG_COEF_CONTEXTS_2D + 10,
421                                               SIG_COEF_CONTEXTS_2D + 10,
422                                               SIG_COEF_CONTEXTS_2D + 10);
423         int32_t w             = width;
424 
425         do {
426             load_levels_16x1x5_sse2(levels, stride, offsets, level);
427             count = get_coeff_contexts_kernel_sse2(level);
428             count = _mm_add_epi8(count, pos_to_offset);
429             _mm_storeu_si128((__m128i *)coeff_contexts, count);
430             pos_to_offset = pos_to_offset_large;
431             levels += 16;
432             coeff_contexts += 16;
433             w -= 16;
434         } while (w);
435 
436         levels += TX_PAD_HOR;
437     } while (--row);
438 }
439 
get_16n_coeff_contexts_ver(const uint8_t * levels,const int32_t width,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)440 static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, const int32_t width,
441                                               const int32_t height, const ptrdiff_t *const offsets,
442                                               int8_t *coeff_contexts) {
443     const int32_t stride = width + TX_PAD_HOR;
444     __m128i       pos_to_offset[3];
445     __m128i       count;
446     __m128i       level[5];
447     int32_t       row = height;
448 
449     assert(!(width % 16));
450 
451     pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
452     pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
453     pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
454 
455     do {
456         int32_t w = width;
457 
458         do {
459             load_levels_16x1x5_sse2(levels, stride, offsets, level);
460             count = get_coeff_contexts_kernel_sse2(level);
461             count = _mm_add_epi8(count, pos_to_offset[0]);
462             _mm_storeu_si128((__m128i *)coeff_contexts, count);
463             levels += 16;
464             coeff_contexts += 16;
465             w -= 16;
466         } while (w);
467 
468         pos_to_offset[0] = pos_to_offset[1];
469         pos_to_offset[1] = pos_to_offset[2];
470         levels += TX_PAD_HOR;
471     } while (--row);
472 }
473 
/*
 * SSE2 entry point: fills `coeff_contexts` with one context byte per position
 * of the block described by `tx_size`, derived from the neighbouring bytes in
 * `levels`.
 *
 * levels         : byte plane read with a row pitch of
 *                  get_txb_wide(tx_size) + TX_PAD_HOR.
 * scan           : only scan[eob - 1] is read, to locate the last coefficient.
 * eob            : when eob == 1 only coeff_contexts[0] is written (to 0).
 *                  NOTE(review): eob is presumably always >= 1; eob == 0 would
 *                  make last_idx -1 and read scan[-1] - confirm with callers.
 * tx_size        : selects the block dimensions via tx_size_wide/high,
 *                  get_txb_wide/high and get_txb_bwl_tab.
 * tx_class       : TX_CLASS_2D, TX_CLASS_HORIZ, or anything else (treated as
 *                  the vertical class); selects the three extra neighbour
 *                  offsets and the helper family.
 * coeff_contexts : output buffer; asserted to be 16-byte aligned.
 *
 * After the bulk pass, the entry at scan[eob - 1] is overwritten with 1, 2 or
 * 3 depending on how eob - 1 compares with (height << bwl) / 8 and / 4.
 */
void svt_av1_get_nz_map_contexts_sse2(const uint8_t *const levels, const int16_t *const scan,
                                      const uint16_t eob, TxSize tx_size, const TxClass tx_class,
                                      int8_t *const coeff_contexts) {
    const int32_t last_idx = eob - 1;
    if (last_idx == 0) {
        // A single coefficient: its context is fixed, nothing else to compute.
        coeff_contexts[0] = 0;
        return;
    }

    const int32_t real_width  = tx_size_wide[tx_size];
    const int32_t real_height = tx_size_high[tx_size];
    const int32_t width       = get_txb_wide(tx_size);
    const int32_t height      = get_txb_high(tx_size);
    const int32_t stride      = width + TX_PAD_HOR;
    ptrdiff_t     offsets[3];

    /* coeff_contexts must be 16 byte aligned. */
    // NOTE(review): the helpers in this file store with _mm_storeu_si128, so
    // this check is stricter than those stores require - confirm it is wanted.
    assert(!((intptr_t)coeff_contexts & 0xf));

    switch (tx_class) {
    case TX_CLASS_2D:
        // Extra neighbours: two columns over, one down and one over, two rows down.
        offsets[0] = 0 * stride + 2;
        offsets[1] = 1 * stride + 1;
        offsets[2] = 2 * stride + 0;
        if (width == 4) {
            get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
        } else if (width == 8) {
            get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
        } else {
            get_16n_coeff_contexts_2d(
                levels, real_width, real_height, width, height, offsets, coeff_contexts);
        }
        break;

    case TX_CLASS_HORIZ:
        // Extra neighbours: 2, 3 and 4 columns over in the same row.
        offsets[0] = 2;
        offsets[1] = 3;
        offsets[2] = 4;
        if (width == 4) {
            get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
        } else if (width == 8) {
            get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
        } else {
            get_16n_coeff_contexts_hor(levels, width, height, offsets, coeff_contexts);
        }
        break;

    default: // TX_CLASS_VERT
        // Extra neighbours: 2, 3 and 4 rows down in the same column.
        offsets[0] = 2 * stride;
        offsets[1] = 3 * stride;
        offsets[2] = 4 * stride;
        if (width == 4) {
            get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
        } else if (width == 8) {
            get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
        } else {
            get_16n_coeff_contexts_ver(levels, width, height, offsets, coeff_contexts);
        }
        break;
    }

    // The last coefficient gets its own context, chosen by where it falls
    // relative to one eighth and one quarter of (height << bwl).
    // NOTE(review): presumably bwl is log2(width), making this the block's
    // coefficient count - confirm against get_txb_bwl_tab.
    const int32_t bwl        = get_txb_bwl_tab[tx_size];
    const int32_t pos        = scan[last_idx];
    const int32_t limit_base = height << bwl;
    int8_t        last_ctx;

    if (last_idx <= limit_base / 8) {
        last_ctx = 1;
    } else if (last_idx <= limit_base / 4) {
        last_ctx = 2;
    } else {
        last_ctx = 3;
    }
    coeff_contexts[pos] = last_ctx;
}
540