1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #include "EbDefinitions.h"
13 #include "common_dsp_rtcd.h"
14 #include <tmmintrin.h>
15
// Weights are quadratic from '1' to '1 / BlockSize', scaled by
// 2^sm_weight_log2_scale.
static const int32_t sm_weight_log2_scale = 8;

// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
#define MAX_BLOCK_DIM 64

/* clang-format off */
// Smooth-prediction weight table: the weights for a block dimension `bs`
// start at offset `bs` and contain `bs` entries (so the table is read as
// sm_weight_arrays[bs] .. sm_weight_arrays[2*bs - 1]).
static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
// Unused, because we always offset by bs, which is at least 2.
0, 0,
// bs = 2
255, 128,
// bs = 4
255, 149, 85, 64,
// bs = 8
255, 197, 146, 105, 73, 50, 37, 32,
// bs = 16
255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
// bs = 32
255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
// bs = 64
255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
};
44
45 // -----------------------------------------------------------------------------
46 // PAETH_PRED
47
48 // -----------------------------------------------------------------------------
49 // SMOOTH_PRED
50
51 // pixels[0]: above and below_pred interleave vector
52 // pixels[1]: left vector
53 // pixels[2]: right_pred vector
load_pixel_w4(const uint8_t * above,const uint8_t * left,int32_t height,__m128i * pixels)54 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
55 int32_t height, __m128i *pixels) {
56 __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
57 if (height == 4)
58 pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
59 else if (height == 8)
60 pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
61 else
62 pixels[1] = _mm_loadu_si128(((const __m128i *)left));
63
64 pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
65
66 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
67 const __m128i zero = _mm_setzero_si128();
68 d = _mm_unpacklo_epi8(d, zero);
69 pixels[0] = _mm_unpacklo_epi16(d, bp);
70 }
71
72 // weight_h[0]: weight_h vector
73 // weight_h[1]: scale - weight_h vector
74 // weight_h[2]: same as [0], second half for height = 16 only
75 // weight_h[3]: same as [1], second half for height = 16 only
76 // weight_w[0]: weights_w and scale - weights_w interleave vector
load_weight_w4(const uint8_t * weight_array,int32_t height,__m128i * weight_h,__m128i * weight_w)77 static INLINE void load_weight_w4(const uint8_t *weight_array, int32_t height,
78 __m128i *weight_h, __m128i *weight_w) {
79 const __m128i zero = _mm_setzero_si128();
80 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
81 const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
82 weight_h[0] = _mm_unpacklo_epi8(t, zero);
83 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
84 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
85
86 if (height == 8) {
87 const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
88 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
89 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
90 }
91 else if (height == 16) {
92 const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
93 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
94 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
95 weight_h[2] = _mm_unpackhi_epi8(weight, zero);
96 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
97 }
98 }
99
smooth_pred_4xh(const __m128i * pixel,const __m128i * wh,const __m128i * ww,int32_t h,uint8_t * dst,ptrdiff_t stride,int32_t second_half)100 static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
101 const __m128i *ww, int32_t h, uint8_t *dst,
102 ptrdiff_t stride, int32_t second_half) {
103 const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
104 const __m128i one = _mm_set1_epi16(1);
105 const __m128i inc = _mm_set1_epi16(0x202);
106 const __m128i gat = _mm_set1_epi32(0xc080400);
107 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
108 __m128i d = _mm_set1_epi16(0x100);
109
110 for (int32_t i = 0; i < h; ++i) {
111 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
112 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
113 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
114 __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
115
116 __m128i b = _mm_shuffle_epi8(pixel[1], rep);
117 b = _mm_unpacklo_epi16(b, pixel[2]);
118 __m128i sum = _mm_madd_epi16(b, ww[0]);
119
120 sum = _mm_add_epi32(s, sum);
121 sum = _mm_add_epi32(sum, round);
122 sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
123
124 sum = _mm_shuffle_epi8(sum, gat);
125 *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
126 dst += stride;
127
128 rep = _mm_add_epi16(rep, one);
129 d = _mm_add_epi16(d, inc);
130 }
131 }
132
svt_aom_smooth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)133 void svt_aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
134 const uint8_t *above, const uint8_t *left) {
135 __m128i pixels[3];
136 load_pixel_w4(above, left, 4, pixels);
137
138 __m128i wh[4], ww[2];
139 load_weight_w4(sm_weight_arrays, 4, wh, ww);
140
141 smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
142 }
143
svt_aom_smooth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)144 void svt_aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
145 const uint8_t *above, const uint8_t *left) {
146 __m128i pixels[3];
147 load_pixel_w4(above, left, 8, pixels);
148
149 __m128i wh[4], ww[2];
150 load_weight_w4(sm_weight_arrays, 8, wh, ww);
151
152 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
153 }
154
svt_aom_smooth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)155 void svt_aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
156 const uint8_t *above,
157 const uint8_t *left) {
158 __m128i pixels[3];
159 load_pixel_w4(above, left, 16, pixels);
160
161 __m128i wh[4], ww[2];
162 load_weight_w4(sm_weight_arrays, 16, wh, ww);
163
164 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
165 dst += stride << 3;
166 smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
167 }
168
169 // pixels[0]: above and below_pred interleave vector, first half
170 // pixels[1]: above and below_pred interleave vector, second half
171 // pixels[2]: left vector
172 // pixels[3]: right_pred vector
173 // pixels[4]: above and below_pred interleave vector, first half
174 // pixels[5]: above and below_pred interleave vector, second half
175 // pixels[6]: left vector + 16
176 // pixels[7]: right_pred vector
load_pixel_w8(const uint8_t * above,const uint8_t * left,int32_t height,__m128i * pixels)177 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
178 int32_t height, __m128i *pixels) {
179 const __m128i zero = _mm_setzero_si128();
180 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
181 __m128i d = _mm_loadl_epi64((const __m128i *)above);
182 d = _mm_unpacklo_epi8(d, zero);
183 pixels[0] = _mm_unpacklo_epi16(d, bp);
184 pixels[1] = _mm_unpackhi_epi16(d, bp);
185
186 pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
187
188 if (height == 4)
189 pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
190 else if (height == 8)
191 pixels[2] = _mm_loadl_epi64((const __m128i *)left);
192 else if (height == 16)
193 pixels[2] = _mm_loadu_si128((const __m128i *)left);
194 else {
195 pixels[2] = _mm_loadu_si128((const __m128i *)left);
196 pixels[4] = pixels[0];
197 pixels[5] = pixels[1];
198 pixels[6] = _mm_loadu_si128((const __m128i *)(left + 16));
199 pixels[7] = pixels[3];
200 }
201 }
202
203 // weight_h[0]: weight_h vector
204 // weight_h[1]: scale - weight_h vector
205 // weight_h[2]: same as [0], offset 8
206 // weight_h[3]: same as [1], offset 8
207 // weight_h[4]: same as [0], offset 16
208 // weight_h[5]: same as [1], offset 16
209 // weight_h[6]: same as [0], offset 24
210 // weight_h[7]: same as [1], offset 24
211 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
212 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_w8(const uint8_t * weight_array,int32_t height,__m128i * weight_h,__m128i * weight_w)213 static INLINE void load_weight_w8(const uint8_t *weight_array, int32_t height,
214 __m128i *weight_h, __m128i *weight_w) {
215 const __m128i zero = _mm_setzero_si128();
216 const int32_t we_offset = height < 8 ? 4 : 8;
217 __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
218 weight_h[0] = _mm_unpacklo_epi8(we, zero);
219 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
220 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
221
222 if (height == 4) {
223 we = _mm_srli_si128(we, 4);
224 __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
225 __m128i tmp2 = _mm_sub_epi16(d, tmp1);
226 weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
227 weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
228 }
229 else {
230 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
231 weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
232 }
233
234 if (height == 16) {
235 we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
236 weight_h[0] = _mm_unpacklo_epi8(we, zero);
237 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
238 weight_h[2] = _mm_unpackhi_epi8(we, zero);
239 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
240 }
241 else if (height == 32) {
242 const __m128i weight_lo =
243 _mm_loadu_si128((const __m128i *)&weight_array[32]);
244 weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
245 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
246 weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
247 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
248 const __m128i weight_hi =
249 _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
250 weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
251 weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
252 weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
253 weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
254 }
255 }
256
smooth_pred_8xh(const __m128i * pixels,const __m128i * wh,const __m128i * ww,int32_t h,uint8_t * dst,ptrdiff_t stride,int32_t second_half)257 static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
258 const __m128i *ww, int32_t h, uint8_t *dst,
259 ptrdiff_t stride, int32_t second_half) {
260 const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
261 const __m128i one = _mm_set1_epi16(1);
262 const __m128i inc = _mm_set1_epi16(0x202);
263 const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
264
265 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
266 __m128i d = _mm_set1_epi16(0x100);
267
268 int32_t i;
269 for (i = 0; i < h; ++i) {
270 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
271 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
272 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
273 __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
274 __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
275
276 __m128i b = _mm_shuffle_epi8(pixels[2], rep);
277 b = _mm_unpacklo_epi16(b, pixels[3]);
278 __m128i sum0 = _mm_madd_epi16(b, ww[0]);
279 __m128i sum1 = _mm_madd_epi16(b, ww[1]);
280
281 s0 = _mm_add_epi32(s0, sum0);
282 s0 = _mm_add_epi32(s0, round);
283 s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
284
285 s1 = _mm_add_epi32(s1, sum1);
286 s1 = _mm_add_epi32(s1, round);
287 s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
288
289 sum0 = _mm_packus_epi16(s0, s1);
290 sum0 = _mm_shuffle_epi8(sum0, gat);
291 _mm_storel_epi64((__m128i *)dst, sum0);
292 dst += stride;
293
294 rep = _mm_add_epi16(rep, one);
295 d = _mm_add_epi16(d, inc);
296 }
297 }
298
svt_aom_smooth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)299 void svt_aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
300 const uint8_t *above, const uint8_t *left) {
301 __m128i pixels[4];
302 load_pixel_w8(above, left, 4, pixels);
303
304 __m128i wh[4], ww[2];
305 load_weight_w8(sm_weight_arrays, 4, wh, ww);
306
307 smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
308 }
309
svt_aom_smooth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)310 void svt_aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
311 const uint8_t *above, const uint8_t *left) {
312 __m128i pixels[4];
313 load_pixel_w8(above, left, 8, pixels);
314
315 __m128i wh[4], ww[2];
316 load_weight_w8(sm_weight_arrays, 8, wh, ww);
317
318 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
319 }
320
svt_aom_smooth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)321 void svt_aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
322 const uint8_t *above,
323 const uint8_t *left) {
324 __m128i pixels[4];
325 load_pixel_w8(above, left, 16, pixels);
326
327 __m128i wh[4], ww[2];
328 load_weight_w8(sm_weight_arrays, 16, wh, ww);
329
330 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
331 dst += stride << 3;
332 smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
333 }
334
svt_aom_smooth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)335 void svt_aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
336 const uint8_t *above,
337 const uint8_t *left) {
338 __m128i pixels[8];
339 load_pixel_w8(above, left, 32, pixels);
340
341 __m128i wh[8], ww[2];
342 load_weight_w8(sm_weight_arrays, 32, wh, ww);
343
344 smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
345 dst += stride << 3;
346 smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
347 dst += stride << 3;
348 smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
349 dst += stride << 3;
350 smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
351 }
352
smooth_predictor_wxh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,uint32_t bw,uint32_t bh)353 static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
354 const uint8_t *above,
355 const uint8_t *left, uint32_t bw,
356 uint32_t bh) {
357 const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
358 const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
359 const __m128i zero = _mm_setzero_si128();
360 const __m128i scale_value =
361 _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
362 const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
363 const __m128i dup16 = _mm_set1_epi32(0x01000100);
364 const __m128i top_right =
365 _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
366 const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
367 const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
368
369 for (uint32_t y = 0; y < bh; ++y) {
370 const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
371 const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
372 const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
373 __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
374 const __m128i wl_y =
375 _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
376 pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
377 pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
378
379 for (uint32_t x = 0; x < bw; x += 8) {
380 const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
381 const __m128i weights_x =
382 _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
383 const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
384 const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
385 const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
386
387 __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
388 __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
389
390 const __m128i scale_m_weights_x =
391 _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
392 const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
393 const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
394 const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
395
396 pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
397 pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
398
399 pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
400 pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
401
402 pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
403 pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
404
405 __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
406 pred = _mm_shuffle_epi8(pred, gat);
407 _mm_storel_epi64((__m128i *)(dst + x), pred);
408 }
409 dst += stride;
410 }
411 }
412
// 16x4 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}
418
// 16x8 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}
424
// 16x16 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}
430
// 16x32 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}
436
// 32x8 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}
442
// 32x16 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}
448
// 32x32 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}
454
// 32x64 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}
460
// 64x64 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}
466
// 64x32 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}
472
// 64x16 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}
478
// 16x64 SMOOTH intra predictor (SSSE3).
void svt_aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}
484
485 // -----------------------------------------------------------------------------
486 // SMOOTH_V_PRED
487
488 // pixels[0]: above and below_pred interleave vector
load_pixel_v_w4(const uint8_t * above,const uint8_t * left,int32_t height,__m128i * pixels)489 static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
490 int32_t height, __m128i *pixels) {
491 const __m128i zero = _mm_setzero_si128();
492 __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
493 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
494 d = _mm_unpacklo_epi8(d, zero);
495 pixels[0] = _mm_unpacklo_epi16(d, bp);
496 }
497
498 // weights[0]: weights_h vector
499 // weights[1]: scale - weights_h vector
load_weight_v_w4(const uint8_t * weight_array,int32_t height,__m128i * weights)500 static INLINE void load_weight_v_w4(const uint8_t *weight_array, int32_t height,
501 __m128i *weights) {
502 const __m128i zero = _mm_setzero_si128();
503 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
504
505 if (height == 4) {
506 const __m128i weight =
507 _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
508 weights[0] = _mm_unpacklo_epi8(weight, zero);
509 weights[1] = _mm_sub_epi16(d, weights[0]);
510 }
511 else if (height == 8) {
512 const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
513 weights[0] = _mm_unpacklo_epi8(weight, zero);
514 weights[1] = _mm_sub_epi16(d, weights[0]);
515 }
516 else {
517 const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
518 weights[0] = _mm_unpacklo_epi8(weight, zero);
519 weights[1] = _mm_sub_epi16(d, weights[0]);
520 weights[2] = _mm_unpackhi_epi8(weight, zero);
521 weights[3] = _mm_sub_epi16(d, weights[2]);
522 }
523 }
524
smooth_v_pred_4xh(const __m128i * pixel,const __m128i * weight,int32_t h,uint8_t * dst,ptrdiff_t stride)525 static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
526 const __m128i *weight, int32_t h, uint8_t *dst,
527 ptrdiff_t stride) {
528 const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
529 const __m128i inc = _mm_set1_epi16(0x202);
530 const __m128i gat = _mm_set1_epi32(0xc080400);
531 __m128i d = _mm_set1_epi16(0x100);
532
533 for (int32_t i = 0; i < h; ++i) {
534 const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
535 const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
536 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
537 __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
538 sum = _mm_add_epi32(sum, pred_round);
539 sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
540 sum = _mm_shuffle_epi8(sum, gat);
541 *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
542 dst += stride;
543 d = _mm_add_epi16(d, inc);
544 }
545 }
546
svt_aom_smooth_v_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)547 void svt_aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
548 const uint8_t *above,
549 const uint8_t *left) {
550 __m128i pixels;
551 load_pixel_v_w4(above, left, 4, &pixels);
552
553 __m128i weights[2];
554 load_weight_v_w4(sm_weight_arrays, 4, weights);
555
556 smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
557 }
558
svt_aom_smooth_v_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)559 void svt_aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
560 const uint8_t *above,
561 const uint8_t *left) {
562 __m128i pixels;
563 load_pixel_v_w4(above, left, 8, &pixels);
564
565 __m128i weights[2];
566 load_weight_v_w4(sm_weight_arrays, 8, weights);
567
568 smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
569 }
570
svt_aom_smooth_v_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)571 void svt_aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
572 const uint8_t *above,
573 const uint8_t *left) {
574 __m128i pixels;
575 load_pixel_v_w4(above, left, 16, &pixels);
576
577 __m128i weights[4];
578 load_weight_v_w4(sm_weight_arrays, 16, weights);
579
580 smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
581 dst += stride << 3;
582 smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
583 }
584
585 // pixels[0]: above and below_pred interleave vector, first half
586 // pixels[1]: above and below_pred interleave vector, second half
load_pixel_v_w8(const uint8_t * above,const uint8_t * left,int32_t height,__m128i * pixels)587 static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
588 int32_t height, __m128i *pixels) {
589 const __m128i zero = _mm_setzero_si128();
590 __m128i d = _mm_loadl_epi64((const __m128i *)above);
591 const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
592 d = _mm_unpacklo_epi8(d, zero);
593 pixels[0] = _mm_unpacklo_epi16(d, bp);
594 pixels[1] = _mm_unpackhi_epi16(d, bp);
595 }
596
597 // weight_h[0]: weight_h vector
598 // weight_h[1]: scale - weight_h vector
599 // weight_h[2]: same as [0], offset 8
600 // weight_h[3]: same as [1], offset 8
601 // weight_h[4]: same as [0], offset 16
602 // weight_h[5]: same as [1], offset 16
603 // weight_h[6]: same as [0], offset 24
604 // weight_h[7]: same as [1], offset 24
load_weight_v_w8(const uint8_t * weight_array,int32_t height,__m128i * weight_h)605 static INLINE void load_weight_v_w8(const uint8_t *weight_array, int32_t height,
606 __m128i *weight_h) {
607 const __m128i zero = _mm_setzero_si128();
608 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
609
610 if (height < 16) {
611 const int32_t offset = height < 8 ? 4 : 8;
612 const __m128i weight =
613 _mm_loadu_si128((const __m128i *)&weight_array[offset]);
614 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
615 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
616 }
617 else if (height == 16) {
618 const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
619 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
620 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
621 weight_h[2] = _mm_unpackhi_epi8(weight, zero);
622 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
623 }
624 else {
625 const __m128i weight_lo =
626 _mm_loadu_si128((const __m128i *)&weight_array[32]);
627 weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
628 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
629 weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
630 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
631 const __m128i weight_hi =
632 _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
633 weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
634 weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
635 weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
636 weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
637 }
638 }
639
smooth_v_pred_8xh(const __m128i * pixels,const __m128i * wh,int32_t h,uint8_t * dst,ptrdiff_t stride)640 static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
641 int32_t h, uint8_t *dst, ptrdiff_t stride) {
642 const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
643 const __m128i inc = _mm_set1_epi16(0x202);
644 const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
645 __m128i d = _mm_set1_epi16(0x100);
646
647 for (int32_t i = 0; i < h; ++i) {
648 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
649 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
650 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
651 __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
652 __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
653
654 s0 = _mm_add_epi32(s0, pred_round);
655 s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
656
657 s1 = _mm_add_epi32(s1, pred_round);
658 s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
659
660 __m128i sum01 = _mm_packus_epi16(s0, s1);
661 sum01 = _mm_shuffle_epi8(sum01, gat);
662 _mm_storel_epi64((__m128i *)dst, sum01);
663 dst += stride;
664
665 d = _mm_add_epi16(d, inc);
666 }
667 }
668
svt_aom_smooth_v_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)669 void svt_aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
670 const uint8_t *above,
671 const uint8_t *left) {
672 __m128i pixels[2];
673 load_pixel_v_w8(above, left, 4, pixels);
674
675 __m128i wh[2];
676 load_weight_v_w8(sm_weight_arrays, 4, wh);
677
678 smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
679 }
680
svt_aom_smooth_v_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)681 void svt_aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
682 const uint8_t *above,
683 const uint8_t *left) {
684 __m128i pixels[2];
685 load_pixel_v_w8(above, left, 8, pixels);
686
687 __m128i wh[2];
688 load_weight_v_w8(sm_weight_arrays, 8, wh);
689
690 smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
691 }
692
svt_aom_smooth_v_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)693 void svt_aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
694 const uint8_t *above,
695 const uint8_t *left) {
696 __m128i pixels[2];
697 load_pixel_v_w8(above, left, 16, pixels);
698
699 __m128i wh[4];
700 load_weight_v_w8(sm_weight_arrays, 16, wh);
701
702 smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
703 dst += stride << 3;
704 smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
705 }
706
svt_aom_smooth_v_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)707 void svt_aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
708 const uint8_t *above,
709 const uint8_t *left) {
710 __m128i pixels[2];
711 load_pixel_v_w8(above, left, 32, pixels);
712
713 __m128i wh[8];
714 load_weight_v_w8(sm_weight_arrays, 32, wh);
715
716 smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
717 dst += stride << 3;
718 smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
719 dst += stride << 3;
720 smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
721 dst += stride << 3;
722 smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
723 }
724
// SMOOTH_V prediction for a bw x bh block (bw a multiple of 8): every output
// pixel is a weighted blend of the pixel directly above the block and the
// bottom-left reference pixel, using the quadratic weight table and
// round-to-nearest at sm_weight_log2_scale fractional precision.
static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  // Vertical weight table for this block height (table is offset by bh).
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  // Shuffle mask that replicates bytes 0..1 into every 16-bit lane.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  // Bottom-left reference pixel, broadcast to all 16-bit lanes.
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  // Gather mask: keep the low byte of each 16-bit value after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    // (scale - w[y]) broadcast to all lanes: the bottom-left coefficient.
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    // Pair (w[y], bottom_left) in every 32-bit lane for the madd below.
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16: widen the eight top pixels to 16-bit lanes.
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      // Interleave each top pixel with (scale - w[y]).
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      // Round to nearest and drop the fractional weight bits.
      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      // Pack the eight 32-bit results down to bytes and store 8 pixels.
      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
769
// SMOOTH_V prediction for a 16x4 block.
void svt_aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}
775
// SMOOTH_V prediction for a 16x8 block.
void svt_aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}
781
// SMOOTH_V prediction for a 16x16 block.
void svt_aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}
787
// SMOOTH_V prediction for a 16x32 block.
void svt_aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}
793
// SMOOTH_V prediction for a 32x8 block.
void svt_aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}
799
// SMOOTH_V prediction for a 32x16 block.
void svt_aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}
805
// SMOOTH_V prediction for a 32x32 block.
void svt_aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}
811
// SMOOTH_V prediction for a 32x64 block.
void svt_aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}
817
// SMOOTH_V prediction for a 64x64 block.
void svt_aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}
823
// SMOOTH_V prediction for a 64x32 block.
void svt_aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}
829
// SMOOTH_V prediction for a 64x16 block.
void svt_aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}
835
// SMOOTH_V prediction for a 16x64 block.
void svt_aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}
841
842 // -----------------------------------------------------------------------------
843 // SMOOTH_H_PRED
844
845 // pixels[0]: left vector
846 // pixels[1]: right_pred vector
load_pixel_h_w4(const uint8_t * above,const uint8_t * left,int32_t height,__m128i * pixels)847 static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
848 int32_t height, __m128i *pixels) {
849 if (height == 4)
850 pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
851 else if (height == 8)
852 pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
853 else
854 pixels[0] = _mm_loadu_si128(((const __m128i *)left));
855 pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
856 }
857
858 // weights[0]: weights_w and scale - weights_w interleave vector
load_weight_h_w4(const uint8_t * weight_array,int32_t height,__m128i * weights)859 static INLINE void load_weight_h_w4(const uint8_t *weight_array, int32_t height,
860 __m128i *weights) {
861 (void)height;
862 const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
863 const __m128i zero = _mm_setzero_si128();
864
865 const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
866 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
867 const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
868 weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
869 }
870
smooth_h_pred_4xh(const __m128i * pixel,const __m128i * weight,int32_t h,uint8_t * dst,ptrdiff_t stride)871 static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
872 const __m128i *weight, int32_t h, uint8_t *dst,
873 ptrdiff_t stride) {
874 const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
875 const __m128i one = _mm_set1_epi16(1);
876 const __m128i gat = _mm_set1_epi32(0xc080400);
877 __m128i rep = _mm_set1_epi16((short)0x8000);
878
879 for (int32_t i = 0; i < h; ++i) {
880 __m128i b = _mm_shuffle_epi8(pixel[0], rep);
881 b = _mm_unpacklo_epi16(b, pixel[1]);
882 __m128i sum = _mm_madd_epi16(b, weight[0]);
883
884 sum = _mm_add_epi32(sum, pred_round);
885 sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
886
887 sum = _mm_shuffle_epi8(sum, gat);
888 *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
889 dst += stride;
890
891 rep = _mm_add_epi16(rep, one);
892 }
893 }
894
// SMOOTH_H prediction for a 4x4 block.
void svt_aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}
906
// SMOOTH_H prediction for a 4x8 block.
void svt_aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}
918
// SMOOTH_H prediction for a 4x16 block: run the width-4 kernel on the first
// 8 left pixels, then shift the left vector down by 8 bytes and run it on
// the remaining 8 rows. The weights depend only on the width, so the same
// weight vector serves both halves (the height passed to the weight loader
// is ignored).
void svt_aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;

  // Move left[8..15] into the low lanes for the bottom half.
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}
934
935 // pixels[0]: left vector
936 // pixels[1]: right_pred vector
937 // pixels[2]: left vector + 16
938 // pixels[3]: right_pred vector
load_pixel_h_w8(const uint8_t * above,const uint8_t * left,int32_t height,__m128i * pixels)939 static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
940 int32_t height, __m128i *pixels) {
941 pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
942
943 if (height == 4)
944 pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
945 else if (height == 8)
946 pixels[0] = _mm_loadl_epi64((const __m128i *)left);
947 else if (height == 16)
948 pixels[0] = _mm_loadu_si128((const __m128i *)left);
949 else {
950 pixels[0] = _mm_loadu_si128((const __m128i *)left);
951 pixels[2] = _mm_loadu_si128((const __m128i *)(left + 16));
952 pixels[3] = pixels[1];
953 }
954 }
955
956 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
957 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_h_w8(const uint8_t * weight_array,int32_t height,__m128i * weight_w)958 static INLINE void load_weight_h_w8(const uint8_t *weight_array, int32_t height,
959 __m128i *weight_w) {
960 (void)height;
961 const __m128i zero = _mm_setzero_si128();
962 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
963 const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
964 const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
965 const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
966 weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
967 weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
968 }
969
// Emit h rows of 8-wide SMOOTH_H prediction from preloaded data.
// pixels[0]: left column bytes; pixels[1]: broadcast top-right pixel.
// ww[0]/ww[1]: interleaved (w[x], scale - w[x]) pairs for x = 0..3 / 4..7.
// second_half starts the left-pixel selector at byte 8, i.e. rows 8..15 of
// pixels[0].
static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int32_t h, uint8_t *dst, ptrdiff_t stride,
                                     int32_t second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  // Keep the low byte of each 16-bit value after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  // Byte-shuffle selector broadcasting left[row]; adding 1 per row advances
  // to the next left pixel (lanes with 0x80 shuffle in zero).
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);

  for (int32_t i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    // Each 32-bit lane: left[row] * w[x] + top_right * (scale - w[x]).
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    // Pack the eight 32-bit results down to bytes and store 8 pixels.
    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}
998
// SMOOTH_H prediction for an 8x4 block.
void svt_aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}
1010
// SMOOTH_H prediction for an 8x8 block.
void svt_aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}
1022
// SMOOTH_H prediction for an 8x16 block: the same pixel/weight vectors feed
// both 8-row halves; second_half = 1 selects left[8..15] for the bottom.
void svt_aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}
1036
// SMOOTH_H prediction for an 8x32 block: rows 0..15 use pixels[0..1]
// (left[0..15]), rows 16..31 use pixels[2..3] (left[16..31]); within each
// 16-row group, second_half = 1 selects the upper 8 left pixels.
void svt_aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}
1054
// SMOOTH_H prediction for a bw x bh block (bw a multiple of 8): every output
// pixel blends the left reference pixel of its row with the top-right
// reference pixel, weighted by the horizontal weight table and rounded at
// sm_weight_log2_scale fractional precision.
static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  // Horizontal weight table for this block width (table is offset by bw).
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  // Gather mask: keep the low byte of each 16-bit value after packing.
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    // Pair (top_right, left[y]) in every 32-bit lane for the madd below.
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      // Widen 8 weights to 16-bit and build (scale - w[x], w[x]) pairs.
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      // Each 32-bit lane: top_right * (scale - w[x]) + left[y] * w[x].
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      // Round to nearest and drop the fractional weight bits.
      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      // Pack the eight 32-bit results down to bytes and store 8 pixels.
      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}
1095
// SMOOTH_H prediction for a 16x4 block.
void svt_aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}
1101
// SMOOTH_H prediction for a 16x8 block.
void svt_aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}
1107
// SMOOTH_H prediction for a 16x16 block.
void svt_aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}
1113
// SMOOTH_H prediction for a 16x32 block.
void svt_aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}
1119
// SMOOTH_H prediction for a 16x64 block.
void svt_aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}
1125
// SMOOTH_H prediction for a 32x8 block.
void svt_aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                           const uint8_t *above,
                                           const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}
1131
// SMOOTH_H prediction for a 32x16 block.
void svt_aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}
1137
// SMOOTH_H prediction for a 32x32 block.
void svt_aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}
1143
// SMOOTH_H prediction for a 32x64 block.
void svt_aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}
1149
// SMOOTH_H prediction for a 64x64 block.
void svt_aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}
1155
// SMOOTH_H prediction for a 64x32 block.
void svt_aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}
1161
// SMOOTH_H prediction for a 64x16 block.
void svt_aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                            const uint8_t *above,
                                            const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}
1167