1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
#include <assert.h>
#include <emmintrin.h> // SSE2
#include <stdint.h>
#include <string.h>
#include "EbDefinitions.h"
#include "EbCabacContextModel.h"
#include "EbCommonUtils.h"
#include "EbFullLoop.h"
19
/*
 * Returns a copy of `s` in which the upper 64 bits have been replaced by the
 * 8 bytes read from `src`. The lower 64 bits of `s` are passed through
 * unchanged.
 */
static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
    // Fetch the 8 bytes into the low half of a scratch register ...
    const __m128i fetched = _mm_loadl_epi64((const __m128i *)src);
    // ... then pair the low half of `s` with the low half of the fetched data.
    return _mm_unpacklo_epi64(s, fetched);
}
23
/*
 * Gathers four rows of four bytes each into one 128-bit register.
 *
 * Row i is read from (const uint8_t *)src + i * byte_stride and ends up in
 * 32-bit lane i of the result (row 0 in the lowest lane).
 *
 * The rows are fetched with memcpy() instead of dereferencing an int32_t
 * pointer: this file calls the function with addresses such as `src + 1`,
 * which are not 4-byte aligned, so a direct int32_t load would be a
 * misaligned (and type-punned) access. memcpy() of a fixed 4-byte size has
 * no alignment requirement and also keeps the const qualifier intact.
 */
static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
                                                  const int32_t     byte_stride) {
    const uint8_t *const base = (const uint8_t *)src;
    int32_t              rows[4];
    int32_t              i;

    for (i = 0; i < 4; ++i) memcpy(&rows[i], base + i * byte_stride, sizeof(rows[i]));

    return _mm_setr_epi32(rows[0], rows[1], rows[2], rows[3]);
}
31
load_8bit_8x2_to_1_reg_sse2(const void * const src,const int32_t byte_stride)32 static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
33 const int32_t byte_stride) {
34 __m128i dst;
35 dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
36 dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
37 return dst;
38 }
39
load_levels_4x4x5_sse2(const uint8_t * const src,const int32_t stride,const ptrdiff_t * const offsets,__m128i * const level)40 static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, const int32_t stride,
41 const ptrdiff_t *const offsets, __m128i *const level) {
42 level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
43 level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
44 level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
45 level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
46 level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
47 }
48
load_levels_8x2x5_sse2(const uint8_t * const src,const int32_t stride,const ptrdiff_t * const offsets,__m128i * const level)49 static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, const int32_t stride,
50 const ptrdiff_t *const offsets, __m128i *const level) {
51 level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
52 level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
53 level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
54 level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
55 level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
56 }
57
/*
 * Fills level[0..4] with five unaligned 16-byte loads from the buffer at
 * `src`. The loads start at:
 *   level[0]: src + 1
 *   level[1]: src + stride
 *   level[2]: src + offsets[0]
 *   level[3]: src + offsets[1]
 *   level[4]: src + offsets[2]
 * `offsets` must provide at least three entries and `level` room for five
 * registers.
 */
static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, const int32_t stride,
                                           const ptrdiff_t *const offsets, __m128i *const level) {
    int32_t k;

    level[0] = _mm_loadu_si128((const __m128i *)(src + 1));
    level[1] = _mm_loadu_si128((const __m128i *)(src + stride));
    for (k = 0; k < 3; ++k) level[2 + k] = _mm_loadu_si128((const __m128i *)(src + offsets[k]));
}
66
/*
 * Combines five registers of unsigned byte levels into one register of
 * per-byte counts. For every byte lane:
 *   1. each of the five inputs is clamped to at most 3,
 *   2. the five clamped values are summed,
 *   3. the sum is halved with rounding up, i.e. (sum + 1) >> 1,
 *   4. the result is clamped to at most 4.
 *
 * Side effect: level[1]..level[4] are overwritten with their clamped values;
 * level[0] is left untouched.
 */
static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
    const __m128i level_cap = _mm_set1_epi8(3);
    const __m128i count_cap = _mm_set1_epi8(4);
    __m128i       sum       = _mm_min_epu8(level[0], level_cap);
    int32_t       i;

    for (i = 1; i < 5; ++i) {
        level[i] = _mm_min_epu8(level[i], level_cap);
        sum      = _mm_add_epi8(sum, level[i]);
    }

    // Averaging against zero yields (sum + 1) >> 1 in every byte lane.
    return _mm_min_epu8(_mm_avg_epu8(sum, _mm_setzero_si128()), count_cap);
}
85
get_4_nz_map_contexts_2d(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * const coeff_contexts)86 static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, const int32_t height,
87 const ptrdiff_t *const offsets,
88 int8_t *const coeff_contexts) {
89 const int32_t stride = 4 + TX_PAD_HOR;
90 const __m128i pos_to_offset_large = _mm_set1_epi8(21);
91 __m128i pos_to_offset = (height == 4)
92 ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
93 : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21);
94 __m128i count;
95 __m128i level[5];
96 int8_t * cc = coeff_contexts;
97 int32_t row = height;
98
99 assert(!(height % 4));
100
101 do {
102 load_levels_4x4x5_sse2(levels, stride, offsets, level);
103 count = get_coeff_contexts_kernel_sse2(level);
104 count = _mm_add_epi8(count, pos_to_offset);
105 _mm_storeu_si128((__m128i *)cc, count);
106 pos_to_offset = pos_to_offset_large;
107 levels += 4 * stride;
108 cc += 16;
109 row -= 4;
110 } while (row);
111
112 coeff_contexts[0] = 0;
113 }
114
get_8_coeff_contexts_2d(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)115 static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, const int32_t height,
116 const ptrdiff_t *const offsets, int8_t *coeff_contexts) {
117 const int32_t stride = 8 + TX_PAD_HOR;
118 int8_t * cc = coeff_contexts;
119 int32_t row = height;
120 __m128i count;
121 __m128i level[5];
122 __m128i pos_to_offset[3];
123
124 assert(!(height % 2));
125
126 if (height == 8) {
127 pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
128 pos_to_offset[1] = _mm_setr_epi8(
129 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21);
130 } else if (height < 8) {
131 pos_to_offset[0] = _mm_setr_epi8(
132 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21);
133 pos_to_offset[1] = _mm_setr_epi8(
134 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21);
135 } else {
136 pos_to_offset[0] = _mm_setr_epi8(
137 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
138 pos_to_offset[1] = _mm_setr_epi8(
139 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21);
140 }
141 pos_to_offset[2] = _mm_set1_epi8(21);
142
143 do {
144 load_levels_8x2x5_sse2(levels, stride, offsets, level);
145 count = get_coeff_contexts_kernel_sse2(level);
146 count = _mm_add_epi8(count, pos_to_offset[0]);
147 _mm_storeu_si128((__m128i *)cc, count);
148 pos_to_offset[0] = pos_to_offset[1];
149 pos_to_offset[1] = pos_to_offset[2];
150 levels += 2 * stride;
151 cc += 16;
152 row -= 2;
153 } while (row);
154
155 coeff_contexts[0] = 0;
156 }
157
get_16n_coeff_contexts_2d(const uint8_t * levels,const int32_t real_width,const int32_t real_height,const int32_t width,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)158 static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, const int32_t real_width,
159 const int32_t real_height, const int32_t width,
160 const int32_t height, const ptrdiff_t *const offsets,
161 int8_t *coeff_contexts) {
162 const int32_t stride = width + TX_PAD_HOR;
163 int8_t * cc = coeff_contexts;
164 int32_t row = height;
165 __m128i pos_to_offset[5];
166 __m128i pos_to_offset_large[3];
167 __m128i count;
168 __m128i level[5];
169
170 assert(!(width % 16));
171
172 pos_to_offset_large[2] = _mm_set1_epi8(21);
173 if (real_width == real_height) {
174 pos_to_offset[0] = _mm_setr_epi8(
175 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
176 pos_to_offset[1] = _mm_setr_epi8(
177 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
178 pos_to_offset[2] = _mm_setr_epi8(
179 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
180 pos_to_offset[3] = _mm_setr_epi8(
181 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
182 pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
183 } else if (real_width > real_height) {
184 pos_to_offset[0] = _mm_setr_epi8(
185 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
186 pos_to_offset[1] = _mm_setr_epi8(
187 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
188 pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
189 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
190 pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
191 } else { // real_width < real_height
192 pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
193 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
194 pos_to_offset[2] = _mm_setr_epi8(
195 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
196 pos_to_offset[3] = _mm_setr_epi8(
197 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
198 pos_to_offset[4] = pos_to_offset_large[2];
199 pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
200 }
201
202 do {
203 int32_t w = width;
204
205 do {
206 load_levels_16x1x5_sse2(levels, stride, offsets, level);
207 count = get_coeff_contexts_kernel_sse2(level);
208 count = _mm_add_epi8(count, pos_to_offset[0]);
209 _mm_storeu_si128((__m128i *)cc, count);
210 levels += 16;
211 cc += 16;
212 w -= 16;
213 pos_to_offset[0] = pos_to_offset_large[0];
214 } while (w);
215
216 pos_to_offset[0] = pos_to_offset[1];
217 pos_to_offset[1] = pos_to_offset[2];
218 pos_to_offset[2] = pos_to_offset[3];
219 pos_to_offset[3] = pos_to_offset[4];
220 pos_to_offset_large[0] = pos_to_offset_large[1];
221 pos_to_offset_large[1] = pos_to_offset_large[2];
222 levels += TX_PAD_HOR;
223 } while (--row);
224
225 coeff_contexts[0] = 0;
226 }
227
get_4_nz_map_contexts_hor(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)228 static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, const int32_t height,
229 const ptrdiff_t *const offsets,
230 int8_t * coeff_contexts) {
231 const int32_t stride = 4 + TX_PAD_HOR;
232 const __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
233 SIG_COEF_CONTEXTS_2D + 5,
234 SIG_COEF_CONTEXTS_2D + 10,
235 SIG_COEF_CONTEXTS_2D + 10,
236 SIG_COEF_CONTEXTS_2D + 0,
237 SIG_COEF_CONTEXTS_2D + 5,
238 SIG_COEF_CONTEXTS_2D + 10,
239 SIG_COEF_CONTEXTS_2D + 10,
240 SIG_COEF_CONTEXTS_2D + 0,
241 SIG_COEF_CONTEXTS_2D + 5,
242 SIG_COEF_CONTEXTS_2D + 10,
243 SIG_COEF_CONTEXTS_2D + 10,
244 SIG_COEF_CONTEXTS_2D + 0,
245 SIG_COEF_CONTEXTS_2D + 5,
246 SIG_COEF_CONTEXTS_2D + 10,
247 SIG_COEF_CONTEXTS_2D + 10);
248 __m128i count;
249 __m128i level[5];
250 int32_t row = height;
251
252 assert(!(height % 4));
253
254 do {
255 load_levels_4x4x5_sse2(levels, stride, offsets, level);
256 count = get_coeff_contexts_kernel_sse2(level);
257 count = _mm_add_epi8(count, pos_to_offset);
258 _mm_storeu_si128((__m128i *)coeff_contexts, count);
259 levels += 4 * stride;
260 coeff_contexts += 16;
261 row -= 4;
262 } while (row);
263 }
264
get_4_nz_map_contexts_ver(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)265 static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, const int32_t height,
266 const ptrdiff_t *const offsets,
267 int8_t * coeff_contexts) {
268 const int32_t stride = 4 + TX_PAD_HOR;
269 const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
270 __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
271 SIG_COEF_CONTEXTS_2D + 0,
272 SIG_COEF_CONTEXTS_2D + 0,
273 SIG_COEF_CONTEXTS_2D + 0,
274 SIG_COEF_CONTEXTS_2D + 5,
275 SIG_COEF_CONTEXTS_2D + 5,
276 SIG_COEF_CONTEXTS_2D + 5,
277 SIG_COEF_CONTEXTS_2D + 5,
278 SIG_COEF_CONTEXTS_2D + 10,
279 SIG_COEF_CONTEXTS_2D + 10,
280 SIG_COEF_CONTEXTS_2D + 10,
281 SIG_COEF_CONTEXTS_2D + 10,
282 SIG_COEF_CONTEXTS_2D + 10,
283 SIG_COEF_CONTEXTS_2D + 10,
284 SIG_COEF_CONTEXTS_2D + 10,
285 SIG_COEF_CONTEXTS_2D + 10);
286 __m128i count;
287 __m128i level[5];
288 int32_t row = height;
289
290 assert(!(height % 4));
291
292 do {
293 load_levels_4x4x5_sse2(levels, stride, offsets, level);
294 count = get_coeff_contexts_kernel_sse2(level);
295 count = _mm_add_epi8(count, pos_to_offset);
296 _mm_storeu_si128((__m128i *)coeff_contexts, count);
297 pos_to_offset = pos_to_offset_large;
298 levels += 4 * stride;
299 coeff_contexts += 16;
300 row -= 4;
301 } while (row);
302 }
303
get_8_coeff_contexts_hor(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)304 static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, const int32_t height,
305 const ptrdiff_t *const offsets,
306 int8_t * coeff_contexts) {
307 const int32_t stride = 8 + TX_PAD_HOR;
308 const __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
309 SIG_COEF_CONTEXTS_2D + 5,
310 SIG_COEF_CONTEXTS_2D + 10,
311 SIG_COEF_CONTEXTS_2D + 10,
312 SIG_COEF_CONTEXTS_2D + 10,
313 SIG_COEF_CONTEXTS_2D + 10,
314 SIG_COEF_CONTEXTS_2D + 10,
315 SIG_COEF_CONTEXTS_2D + 10,
316 SIG_COEF_CONTEXTS_2D + 0,
317 SIG_COEF_CONTEXTS_2D + 5,
318 SIG_COEF_CONTEXTS_2D + 10,
319 SIG_COEF_CONTEXTS_2D + 10,
320 SIG_COEF_CONTEXTS_2D + 10,
321 SIG_COEF_CONTEXTS_2D + 10,
322 SIG_COEF_CONTEXTS_2D + 10,
323 SIG_COEF_CONTEXTS_2D + 10);
324 int32_t row = height;
325 __m128i count;
326 __m128i level[5];
327
328 assert(!(height % 2));
329
330 do {
331 load_levels_8x2x5_sse2(levels, stride, offsets, level);
332 count = get_coeff_contexts_kernel_sse2(level);
333 count = _mm_add_epi8(count, pos_to_offset);
334 _mm_storeu_si128((__m128i *)coeff_contexts, count);
335 levels += 2 * stride;
336 coeff_contexts += 16;
337 row -= 2;
338 } while (row);
339 }
340
get_8_coeff_contexts_ver(const uint8_t * levels,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)341 static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, const int32_t height,
342 const ptrdiff_t *const offsets,
343 int8_t * coeff_contexts) {
344 const int32_t stride = 8 + TX_PAD_HOR;
345 const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
346 __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
347 SIG_COEF_CONTEXTS_2D + 0,
348 SIG_COEF_CONTEXTS_2D + 0,
349 SIG_COEF_CONTEXTS_2D + 0,
350 SIG_COEF_CONTEXTS_2D + 0,
351 SIG_COEF_CONTEXTS_2D + 0,
352 SIG_COEF_CONTEXTS_2D + 0,
353 SIG_COEF_CONTEXTS_2D + 0,
354 SIG_COEF_CONTEXTS_2D + 5,
355 SIG_COEF_CONTEXTS_2D + 5,
356 SIG_COEF_CONTEXTS_2D + 5,
357 SIG_COEF_CONTEXTS_2D + 5,
358 SIG_COEF_CONTEXTS_2D + 5,
359 SIG_COEF_CONTEXTS_2D + 5,
360 SIG_COEF_CONTEXTS_2D + 5,
361 SIG_COEF_CONTEXTS_2D + 5);
362 int32_t row = height;
363 __m128i count;
364 __m128i level[5];
365
366 assert(!(height % 2));
367
368 do {
369 load_levels_8x2x5_sse2(levels, stride, offsets, level);
370 count = get_coeff_contexts_kernel_sse2(level);
371 count = _mm_add_epi8(count, pos_to_offset);
372 _mm_storeu_si128((__m128i *)coeff_contexts, count);
373 pos_to_offset = pos_to_offset_large;
374 levels += 2 * stride;
375 coeff_contexts += 16;
376 row -= 2;
377 } while (row);
378 }
379
get_16n_coeff_contexts_hor(const uint8_t * levels,const int32_t width,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)380 static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, const int32_t width,
381 const int32_t height, const ptrdiff_t *const offsets,
382 int8_t *coeff_contexts) {
383 const int32_t stride = width + TX_PAD_HOR;
384 const __m128i pos_to_offset_large = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10,
385 SIG_COEF_CONTEXTS_2D + 10,
386 SIG_COEF_CONTEXTS_2D + 10,
387 SIG_COEF_CONTEXTS_2D + 10,
388 SIG_COEF_CONTEXTS_2D + 10,
389 SIG_COEF_CONTEXTS_2D + 10,
390 SIG_COEF_CONTEXTS_2D + 10,
391 SIG_COEF_CONTEXTS_2D + 10,
392 SIG_COEF_CONTEXTS_2D + 10,
393 SIG_COEF_CONTEXTS_2D + 10,
394 SIG_COEF_CONTEXTS_2D + 10,
395 SIG_COEF_CONTEXTS_2D + 10,
396 SIG_COEF_CONTEXTS_2D + 10,
397 SIG_COEF_CONTEXTS_2D + 10,
398 SIG_COEF_CONTEXTS_2D + 10,
399 SIG_COEF_CONTEXTS_2D + 10);
400 __m128i count;
401 __m128i level[5];
402 int32_t row = height;
403
404 assert(!(width % 16));
405
406 do {
407 __m128i pos_to_offset = _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0,
408 SIG_COEF_CONTEXTS_2D + 5,
409 SIG_COEF_CONTEXTS_2D + 10,
410 SIG_COEF_CONTEXTS_2D + 10,
411 SIG_COEF_CONTEXTS_2D + 10,
412 SIG_COEF_CONTEXTS_2D + 10,
413 SIG_COEF_CONTEXTS_2D + 10,
414 SIG_COEF_CONTEXTS_2D + 10,
415 SIG_COEF_CONTEXTS_2D + 10,
416 SIG_COEF_CONTEXTS_2D + 10,
417 SIG_COEF_CONTEXTS_2D + 10,
418 SIG_COEF_CONTEXTS_2D + 10,
419 SIG_COEF_CONTEXTS_2D + 10,
420 SIG_COEF_CONTEXTS_2D + 10,
421 SIG_COEF_CONTEXTS_2D + 10,
422 SIG_COEF_CONTEXTS_2D + 10);
423 int32_t w = width;
424
425 do {
426 load_levels_16x1x5_sse2(levels, stride, offsets, level);
427 count = get_coeff_contexts_kernel_sse2(level);
428 count = _mm_add_epi8(count, pos_to_offset);
429 _mm_storeu_si128((__m128i *)coeff_contexts, count);
430 pos_to_offset = pos_to_offset_large;
431 levels += 16;
432 coeff_contexts += 16;
433 w -= 16;
434 } while (w);
435
436 levels += TX_PAD_HOR;
437 } while (--row);
438 }
439
get_16n_coeff_contexts_ver(const uint8_t * levels,const int32_t width,const int32_t height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)440 static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, const int32_t width,
441 const int32_t height, const ptrdiff_t *const offsets,
442 int8_t *coeff_contexts) {
443 const int32_t stride = width + TX_PAD_HOR;
444 __m128i pos_to_offset[3];
445 __m128i count;
446 __m128i level[5];
447 int32_t row = height;
448
449 assert(!(width % 16));
450
451 pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
452 pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
453 pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
454
455 do {
456 int32_t w = width;
457
458 do {
459 load_levels_16x1x5_sse2(levels, stride, offsets, level);
460 count = get_coeff_contexts_kernel_sse2(level);
461 count = _mm_add_epi8(count, pos_to_offset[0]);
462 _mm_storeu_si128((__m128i *)coeff_contexts, count);
463 levels += 16;
464 coeff_contexts += 16;
465 w -= 16;
466 } while (w);
467
468 pos_to_offset[0] = pos_to_offset[1];
469 pos_to_offset[1] = pos_to_offset[2];
470 levels += TX_PAD_HOR;
471 } while (--row);
472 }
473
/*
 * Computes one context byte per coefficient position of a transform block
 * and stores them in `coeff_contexts`.
 *
 * levels         - level buffer; rows are width + TX_PAD_HOR bytes apart,
 *                  where width = get_txb_wide(tx_size)
 * scan           - scan table; only scan[eob - 1] is read, to locate the
 *                  position of the last coefficient
 * eob            - number of coefficients in scan order; 0 means "none"
 * tx_size        - transform size, used to look up the block dimensions
 * tx_class       - TX_CLASS_2D, TX_CLASS_HORIZ, or anything else (treated as
 *                  vertical); selects the neighbour offsets and the helper
 * coeff_contexts - output buffer; asserted to be 16-byte aligned
 *
 * Behaviour:
 *   - eob == 0: nothing is written (there is no last coefficient, and
 *     indexing scan[eob - 1] would read before the start of the table).
 *   - eob == 1: only coeff_contexts[0] is written, with 0.
 *   - otherwise the width-specific helper fills the whole block, after which
 *     the entry at scan[eob - 1] is overwritten with 1, 2 or 3 depending on
 *     how eob - 1 compares with one eighth and one quarter of
 *     (height << bwl).
 */
void svt_av1_get_nz_map_contexts_sse2(const uint8_t *const levels, const int16_t *const scan,
                                      const uint16_t eob, TxSize tx_size, const TxClass tx_class,
                                      int8_t *const coeff_contexts) {
    // Guard against an empty block: last_idx would be -1 and scan[-1] is out of bounds.
    if (eob == 0)
        return;

    const int32_t last_idx = eob - 1;
    if (!last_idx) {
        coeff_contexts[0] = 0;
        return;
    }

    const int32_t real_width  = tx_size_wide[tx_size];
    const int32_t real_height = tx_size_high[tx_size];
    const int32_t width       = get_txb_wide(tx_size);
    const int32_t height      = get_txb_high(tx_size);

    const int32_t stride = width + TX_PAD_HOR;

    ptrdiff_t offsets[3];

    /* coeff_contexts must be 16 byte aligned. */
    assert(!((intptr_t)coeff_contexts & 0xf));

    if (tx_class == TX_CLASS_2D) {
        // Third, fourth and fifth neighbours: two to the right, the diagonal, two below.
        offsets[0] = 0 * stride + 2;
        offsets[1] = 1 * stride + 1;
        offsets[2] = 2 * stride + 0;

        if (width == 4)
            get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
        else if (width == 8)
            get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
        else
            get_16n_coeff_contexts_2d(
                levels, real_width, real_height, width, height, offsets, coeff_contexts);
    } else if (tx_class == TX_CLASS_HORIZ) {
        // Remaining neighbours all lie in the same row: 2, 3 and 4 bytes ahead.
        offsets[0] = 2;
        offsets[1] = 3;
        offsets[2] = 4;
        if (width == 4)
            get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
        else if (width == 8)
            get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
        else {
            get_16n_coeff_contexts_hor(levels, width, height, offsets, coeff_contexts);
        }
    } else { // TX_CLASS_VERT
        // Remaining neighbours all lie in the same column: 2, 3 and 4 rows ahead.
        offsets[0] = 2 * stride;
        offsets[1] = 3 * stride;
        offsets[2] = 4 * stride;
        if (width == 4)
            get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
        else if (width == 8)
            get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
        else {
            get_16n_coeff_contexts_ver(levels, width, height, offsets, coeff_contexts);
        }
    }

    // The last coefficient gets its own context, chosen from its scan index alone.
    const int32_t bwl = get_txb_bwl_tab[tx_size];
    const int32_t pos = scan[last_idx];
    if (last_idx <= (height << bwl) / 8)
        coeff_contexts[pos] = 1;
    else if (last_idx <= (height << bwl) / 4)
        coeff_contexts[pos] = 2;
    else
        coeff_contexts[pos] = 3;
}
540