/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * https://www.aomedia.org/license/patent-license.
 */

#include "EbDefinitions.h"
#include "common_dsp_rtcd.h"
#include <tmmintrin.h>

// Weights are quadratic from '1' to '1 / BlockSize', scaled by
// 2^sm_weight_log2_scale.
static const int32_t sm_weight_log2_scale = 8;

// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
#define MAX_BLOCK_DIM 64

/* clang-format off */
static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
    // Unused, because we always offset by bs, which is at least 2.
    0, 0,
    // bs = 2
    255, 128,
    // bs = 4
    255, 149, 85, 64,
    // bs = 8
    255, 197, 146, 105, 73, 50, 37, 32,
    // bs = 16
    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
    // bs = 32
    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
    // bs = 64
    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
    65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
    13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
};
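
// The table is indexed by offsetting with the block dimension itself:
// sm_weight_arrays + bs points at the bs-entry row for that dimension
// (e.g. sm_weight_arrays + 4 is {255, 149, 85, 64}), which is how the
// loaders and the wxh predictors below locate their weight rows.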

// -----------------------------------------------------------------------------
// PAETH_PRED

// -----------------------------------------------------------------------------
// SMOOTH_PRED

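// For reference, a scalar sketch of what the SSSE3 kernels below compute,
// mirroring the libaom C reference (variable names here are illustrative):
//
//   const uint8_t *sm_weights_h = sm_weight_arrays + bh;
//   const uint8_t *sm_weights_w = sm_weight_arrays + bw;
//   const uint8_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
//   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
//   const uint16_t scale = 1 << sm_weight_log2_scale;
//   for (uint32_t r = 0; r < bh; ++r) {
//     for (uint32_t c = 0; c < bw; ++c) {
//       const uint32_t sum = above[c] * sm_weights_h[r] +
//                            below_pred * (scale - sm_weights_h[r]) +
//                            left[r] * sm_weights_w[c] +
//                            right_pred * (scale - sm_weights_w[c]);
//       dst[r * stride + c] =
//           (uint8_t)((sum + scale) >> (1 + sm_weight_log2_scale));
//     }
//   }
//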
// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
    int32_t height, __m128i *pixels) {
    __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
    if (height == 4)
        pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
    else if (height == 8)
        pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
    else
        pixels[1] = _mm_loadu_si128(((const __m128i *)left));

    pixels[2] = _mm_set1_epi16((uint16_t)above[3]);

    const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
    const __m128i zero = _mm_setzero_si128();
    d = _mm_unpacklo_epi8(d, zero);
    pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(const uint8_t *weight_array, int32_t height,
    __m128i *weight_h, __m128i *weight_w) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weight_h[0] = _mm_unpacklo_epi8(t, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

    if (height == 8) {
        const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
        weight_h[0] = _mm_unpacklo_epi8(weight, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    }
    else if (height == 16) {
        const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
        weight_h[0] = _mm_unpacklo_epi8(weight, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
        weight_h[2] = _mm_unpackhi_epi8(weight, zero);
        weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
    const __m128i *ww, int32_t h, uint8_t *dst,
    ptrdiff_t stride, int32_t second_half) {
    const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
    const __m128i one = _mm_set1_epi16(1);
    const __m128i inc = _mm_set1_epi16(0x202);
    const __m128i gat = _mm_set1_epi32(0xc080400);
    // 'rep' is a pshufb control that broadcasts byte i of the left vector into
    // every 16-bit lane (the 0x80 high byte selects zero, so the pixel is
    // zero-extended); adding 'one' each row advances to the next left pixel,
    // and 'second_half' starts at byte 8.
    __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
    // 'd' broadcasts 16-bit element i of the weight vectors; 'inc' advances it.
    __m128i d = _mm_set1_epi16(0x100);

    for (int32_t i = 0; i < h; ++i) {
        const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
        const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
        const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
        // above[x] * w_h[y] + below_pred * (scale - w_h[y])
        __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

        __m128i b = _mm_shuffle_epi8(pixel[1], rep);
        b = _mm_unpacklo_epi16(b, pixel[2]);
        // left[y] * w_w[x] + right_pred * (scale - w_w[x])
        __m128i sum = _mm_madd_epi16(b, ww[0]);

        sum = _mm_add_epi32(s, sum);
        sum = _mm_add_epi32(sum, round);
        sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);

        // 'gat' gathers the low byte of each 32-bit result into 4 bytes.
        sum = _mm_shuffle_epi8(sum, gat);
        *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
        dst += stride;

        rep = _mm_add_epi16(rep, one);
        d = _mm_add_epi16(d, inc);
    }
}

void svt_aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above, const uint8_t *left) {
    __m128i pixels[3];
    load_pixel_w4(above, left, 4, pixels);

    __m128i wh[4], ww[2];
    load_weight_w4(sm_weight_arrays, 4, wh, ww);

    smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void svt_aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above, const uint8_t *left) {
    __m128i pixels[3];
    load_pixel_w4(above, left, 8, pixels);

    __m128i wh[4], ww[2];
    load_weight_w4(sm_weight_arrays, 8, wh, ww);

    smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void svt_aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[3];
    load_pixel_w4(above, left, 16, pixels);

    __m128i wh[4], ww[2];
    load_weight_w4(sm_weight_arrays, 16, wh, ww);

    smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
    int32_t height, __m128i *pixels) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
    __m128i d = _mm_loadl_epi64((const __m128i *)above);
    d = _mm_unpacklo_epi8(d, zero);
    pixels[0] = _mm_unpacklo_epi16(d, bp);
    pixels[1] = _mm_unpackhi_epi16(d, bp);

    pixels[3] = _mm_set1_epi16((uint16_t)above[7]);

    if (height == 4)
        pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
    else if (height == 8)
        pixels[2] = _mm_loadl_epi64((const __m128i *)left);
    else if (height == 16)
        pixels[2] = _mm_loadu_si128((const __m128i *)left);
    else {
        pixels[2] = _mm_loadu_si128((const __m128i *)left);
        pixels[4] = pixels[0];
        pixels[5] = pixels[1];
        pixels[6] = _mm_loadu_si128((const __m128i *)(left + 16));
        pixels[7] = pixels[3];
    }
}
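
// Note: for height == 32, pixels[4..7] mirror pixels[0..3] but carry the
// second 16 left pixels, so the 8x32 predictor can call smooth_pred_8xh() on
// &pixels[4] with the exact same lane layout as &pixels[0].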

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int32_t height,
    __m128i *weight_h, __m128i *weight_w) {
    const __m128i zero = _mm_setzero_si128();
    const int32_t we_offset = height < 8 ? 4 : 8;
    __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

    if (height == 4) {
        we = _mm_srli_si128(we, 4);
        __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
        __m128i tmp2 = _mm_sub_epi16(d, tmp1);
        weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
        weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
    }
    else {
        weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
        weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
    }

    if (height == 16) {
        we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
        weight_h[0] = _mm_unpacklo_epi8(we, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
        weight_h[2] = _mm_unpackhi_epi8(we, zero);
        weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    }
    else if (height == 32) {
        const __m128i weight_lo =
            _mm_loadu_si128((const __m128i *)&weight_array[32]);
        weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
        weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
        weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
        const __m128i weight_hi =
            _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
        weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
        weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
        weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
        weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
    }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
    const __m128i *ww, int32_t h, uint8_t *dst,
    ptrdiff_t stride, int32_t second_half) {
    const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
    const __m128i one = _mm_set1_epi16(1);
    const __m128i inc = _mm_set1_epi16(0x202);
    const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

    __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);
    __m128i d = _mm_set1_epi16(0x100);

    int32_t i;
    for (i = 0; i < h; ++i) {
        const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
        const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
        const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
        __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
        __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

        __m128i b = _mm_shuffle_epi8(pixels[2], rep);
        b = _mm_unpacklo_epi16(b, pixels[3]);
        __m128i sum0 = _mm_madd_epi16(b, ww[0]);
        __m128i sum1 = _mm_madd_epi16(b, ww[1]);

        s0 = _mm_add_epi32(s0, sum0);
        s0 = _mm_add_epi32(s0, round);
        s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

        s1 = _mm_add_epi32(s1, sum1);
        s1 = _mm_add_epi32(s1, round);
        s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

        sum0 = _mm_packus_epi16(s0, s1);
        sum0 = _mm_shuffle_epi8(sum0, gat);
        _mm_storel_epi64((__m128i *)dst, sum0);
        dst += stride;

        rep = _mm_add_epi16(rep, one);
        d = _mm_add_epi16(d, inc);
    }
}

void svt_aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above, const uint8_t *left) {
    __m128i pixels[4];
    load_pixel_w8(above, left, 4, pixels);

    __m128i wh[4], ww[2];
    load_weight_w8(sm_weight_arrays, 4, wh, ww);

    smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void svt_aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above, const uint8_t *left) {
    __m128i pixels[4];
    load_pixel_w8(above, left, 8, pixels);

    __m128i wh[4], ww[2];
    load_weight_w8(sm_weight_arrays, 8, wh, ww);

    smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void svt_aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[4];
    load_pixel_w8(above, left, 16, pixels);

    __m128i wh[4], ww[2];
    load_weight_w8(sm_weight_arrays, 16, wh, ww);

    smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void svt_aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[8];
    load_pixel_w8(above, left, 32, pixels);

    __m128i wh[8], ww[2];
    load_weight_w8(sm_weight_arrays, 32, wh, ww);

    smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
    dst += stride << 3;
    smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left, uint32_t bw,
    uint32_t bh) {
    const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
    const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
    const __m128i zero = _mm_setzero_si128();
    const __m128i scale_value =
        _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
    const __m128i dup16 = _mm_set1_epi32(0x01000100);
    const __m128i top_right =
        _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
    const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
    const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

    for (uint32_t y = 0; y < bh; ++y) {
        const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
        const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
        const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
        // (scale - w_y) * bottom_left, plus the rounding term, broadcast below.
        __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
        const __m128i wl_y =
            _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
        pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
        pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);

        for (uint32_t x = 0; x < bw; x += 8) {
            const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
            const __m128i weights_x =
                _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
            // Interleave top pixels with their weights so that madd against
            // the (w_y, left[y]) pairs in wl_y yields
            // top[x] * w_y + w_x * left[y] per pixel.
            const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
            const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
            const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

            __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
            __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

            // (scale - w_x) * top_right
            const __m128i scale_m_weights_x =
                _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
            const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
            const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
            const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

            pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
            pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

            pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
            pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

            pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
            pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

            __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
            pred = _mm_shuffle_epi8(pred, gat);
            _mm_storel_epi64((__m128i *)(dst + x), pred);
        }
        dst += stride;
    }
}

void svt_aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void svt_aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void svt_aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void svt_aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void svt_aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void svt_aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void svt_aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void svt_aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void svt_aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void svt_aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void svt_aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void svt_aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

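// Scalar sketch of SMOOTH_V (vertical-only blend, following the libaom C
// reference; names are illustrative):
//
//   const uint8_t below_pred = left[bh - 1];
//   const uint16_t scale = 1 << sm_weight_log2_scale;
//   for (uint32_t r = 0; r < bh; ++r)
//     for (uint32_t c = 0; c < bw; ++c)
//       dst[r * stride + c] =
//           (uint8_t)((above[c] * sm_weights_h[r] +
//                      below_pred * (scale - sm_weights_h[r]) +
//                      (scale >> 1)) >> sm_weight_log2_scale);
//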
// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
    int32_t height, __m128i *pixels) {
    const __m128i zero = _mm_setzero_si128();
    __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
    const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
    d = _mm_unpacklo_epi8(d, zero);
    pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
// weights[2]: same as [0], second half for height = 16 only
// weights[3]: same as [1], second half for height = 16 only
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int32_t height,
    __m128i *weights) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

    if (height == 4) {
        const __m128i weight =
            _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
        weights[0] = _mm_unpacklo_epi8(weight, zero);
        weights[1] = _mm_sub_epi16(d, weights[0]);
    }
    else if (height == 8) {
        const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
        weights[0] = _mm_unpacklo_epi8(weight, zero);
        weights[1] = _mm_sub_epi16(d, weights[0]);
    }
    else {
        const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
        weights[0] = _mm_unpacklo_epi8(weight, zero);
        weights[1] = _mm_sub_epi16(d, weights[0]);
        weights[2] = _mm_unpackhi_epi8(weight, zero);
        weights[3] = _mm_sub_epi16(d, weights[2]);
    }
}

static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
    const __m128i *weight, int32_t h, uint8_t *dst,
    ptrdiff_t stride) {
    const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    const __m128i inc = _mm_set1_epi16(0x202);
    const __m128i gat = _mm_set1_epi32(0xc080400);
    __m128i d = _mm_set1_epi16(0x100);

    for (int32_t i = 0; i < h; ++i) {
        const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
        const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
        const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
        __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
        sum = _mm_add_epi32(sum, pred_round);
        sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
        sum = _mm_shuffle_epi8(sum, gat);
        *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
        dst += stride;
        d = _mm_add_epi16(d, inc);
    }
}

void svt_aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels;
    load_pixel_v_w4(above, left, 4, &pixels);

    __m128i weights[2];
    load_weight_v_w4(sm_weight_arrays, 4, weights);

    smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
}

void svt_aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels;
    load_pixel_v_w4(above, left, 8, &pixels);

    __m128i weights[2];
    load_weight_v_w4(sm_weight_arrays, 8, weights);

    smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
}

void svt_aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels;
    load_pixel_v_w4(above, left, 16, &pixels);

    __m128i weights[4];
    load_weight_v_w4(sm_weight_arrays, 16, weights);

    smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
    dst += stride << 3;
    smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
    int32_t height, __m128i *pixels) {
    const __m128i zero = _mm_setzero_si128();
    __m128i d = _mm_loadl_epi64((const __m128i *)above);
    const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
    d = _mm_unpacklo_epi8(d, zero);
    pixels[0] = _mm_unpacklo_epi16(d, bp);
    pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int32_t height,
    __m128i *weight_h) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

    if (height < 16) {
        const int32_t offset = height < 8 ? 4 : 8;
        const __m128i weight =
            _mm_loadu_si128((const __m128i *)&weight_array[offset]);
        weight_h[0] = _mm_unpacklo_epi8(weight, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    }
    else if (height == 16) {
        const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
        weight_h[0] = _mm_unpacklo_epi8(weight, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
        weight_h[2] = _mm_unpackhi_epi8(weight, zero);
        weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    }
    else {
        const __m128i weight_lo =
            _mm_loadu_si128((const __m128i *)&weight_array[32]);
        weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
        weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
        weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
        weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
        const __m128i weight_hi =
            _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
        weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
        weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
        weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
        weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
    }
}

static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
    int32_t h, uint8_t *dst, ptrdiff_t stride) {
    const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    const __m128i inc = _mm_set1_epi16(0x202);
    const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
    __m128i d = _mm_set1_epi16(0x100);

    for (int32_t i = 0; i < h; ++i) {
        const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
        const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
        const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
        __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
        __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

        s0 = _mm_add_epi32(s0, pred_round);
        s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

        s1 = _mm_add_epi32(s1, pred_round);
        s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

        __m128i sum01 = _mm_packus_epi16(s0, s1);
        sum01 = _mm_shuffle_epi8(sum01, gat);
        _mm_storel_epi64((__m128i *)dst, sum01);
        dst += stride;

        d = _mm_add_epi16(d, inc);
    }
}

void svt_aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_v_w8(above, left, 4, pixels);

    __m128i wh[2];
    load_weight_v_w8(sm_weight_arrays, 4, wh);

    smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
}

void svt_aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_v_w8(above, left, 8, pixels);

    __m128i wh[2];
    load_weight_v_w8(sm_weight_arrays, 8, wh);

    smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
}

void svt_aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_v_w8(above, left, 16, pixels);

    __m128i wh[4];
    load_weight_v_w8(sm_weight_arrays, 16, wh);

    smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
    dst += stride << 3;
    smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
}

void svt_aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_v_w8(above, left, 32, pixels);

    __m128i wh[8];
    load_weight_v_w8(sm_weight_arrays, 32, wh);

    smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
    dst += stride << 3;
    smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
    dst += stride << 3;
    smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
    dst += stride << 3;
    smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
}

static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left, uint32_t bw,
    uint32_t bh) {
    const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
    const __m128i zero = _mm_setzero_si128();
    const __m128i scale_value =
        _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    const __m128i dup16 = _mm_set1_epi32(0x01000100);
    const __m128i bottom_left =
        _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
    const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
    const __m128i round =
        _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

    for (uint32_t y = 0; y < bh; ++y) {
        const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
        const __m128i scale_m_weights_y =
            _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
        const __m128i wl_y =
            _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

        for (uint32_t x = 0; x < bw; x += 8) {
            const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
            // widen the 8-bit top pixels to 16 bits
            const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
            const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
            const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
            // top_x * weights_y + scale_m_weights_y * bottom_left
            __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
            __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

            pred_lo = _mm_add_epi32(pred_lo, round);
            pred_hi = _mm_add_epi32(pred_hi, round);
            pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
            pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

            __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
            pred = _mm_shuffle_epi8(pred, gat);
            _mm_storel_epi64((__m128i *)(dst + x), pred);
        }
        dst += stride;
    }
}

void svt_aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void svt_aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void svt_aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void svt_aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void svt_aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void svt_aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void svt_aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void svt_aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void svt_aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void svt_aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void svt_aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void svt_aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED

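// Scalar sketch of SMOOTH_H (horizontal-only blend, following the libaom C
// reference; names are illustrative):
//
//   const uint8_t right_pred = above[bw - 1];
//   const uint16_t scale = 1 << sm_weight_log2_scale;
//   for (uint32_t r = 0; r < bh; ++r)
//     for (uint32_t c = 0; c < bw; ++c)
//       dst[r * stride + c] =
//           (uint8_t)((left[r] * sm_weights_w[c] +
//                      right_pred * (scale - sm_weights_w[c]) +
//                      (scale >> 1)) >> sm_weight_log2_scale);
//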
// pixels[0]: left vector
// pixels[1]: right_pred vector
static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
    int32_t height, __m128i *pixels) {
    if (height == 4)
        pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
    else if (height == 8)
        pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
    else
        pixels[0] = _mm_loadu_si128(((const __m128i *)left));
    pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
}

// weights[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_h_w4(const uint8_t *weight_array, int32_t height,
    __m128i *weights) {
    (void)height;
    const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
    const __m128i zero = _mm_setzero_si128();

    const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
    const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
    weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
}

static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
    const __m128i *weight, int32_t h, uint8_t *dst,
    ptrdiff_t stride) {
    const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    const __m128i one = _mm_set1_epi16(1);
    const __m128i gat = _mm_set1_epi32(0xc080400);
    __m128i rep = _mm_set1_epi16((short)0x8000);

    for (int32_t i = 0; i < h; ++i) {
        __m128i b = _mm_shuffle_epi8(pixel[0], rep);
        b = _mm_unpacklo_epi16(b, pixel[1]);
        __m128i sum = _mm_madd_epi16(b, weight[0]);

        sum = _mm_add_epi32(sum, pred_round);
        sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

        sum = _mm_shuffle_epi8(sum, gat);
        *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
        dst += stride;

        rep = _mm_add_epi16(rep, one);
    }
}

void svt_aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_h_w4(above, left, 4, pixels);

    __m128i weights;
    load_weight_h_w4(sm_weight_arrays, 4, &weights);

    smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}

void svt_aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_h_w4(above, left, 8, pixels);

    __m128i weights;
    load_weight_h_w4(sm_weight_arrays, 8, &weights);

    smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

void svt_aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_h_w4(above, left, 16, pixels);

    __m128i weights;
    load_weight_h_w4(sm_weight_arrays, 8, &weights);

    smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
    dst += stride << 3;

    pixels[0] = _mm_srli_si128(pixels[0], 8);
    smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

// pixels[0]: left vector
// pixels[1]: right_pred vector
// pixels[2]: left vector + 16
// pixels[3]: right_pred vector
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
    int32_t height, __m128i *pixels) {
    pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

    if (height == 4)
        pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
    else if (height == 8)
        pixels[0] = _mm_loadl_epi64((const __m128i *)left);
    else if (height == 16)
        pixels[0] = _mm_loadu_si128((const __m128i *)left);
    else {
        pixels[0] = _mm_loadu_si128((const __m128i *)left);
        pixels[2] = _mm_loadu_si128((const __m128i *)(left + 16));
        pixels[3] = pixels[1];
    }
}

// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_h_w8(const uint8_t *weight_array, int32_t height,
    __m128i *weight_w) {
    (void)height;
    const __m128i zero = _mm_setzero_si128();
    const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
    const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
}

static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
    int32_t h, uint8_t *dst, ptrdiff_t stride,
    int32_t second_half) {
    const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    const __m128i one = _mm_set1_epi16(1);
    const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
    __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) : _mm_set1_epi16((short)0x8000);

    for (int32_t i = 0; i < h; ++i) {
        __m128i b = _mm_shuffle_epi8(pixels[0], rep);
        b = _mm_unpacklo_epi16(b, pixels[1]);
        __m128i sum0 = _mm_madd_epi16(b, ww[0]);
        __m128i sum1 = _mm_madd_epi16(b, ww[1]);

        sum0 = _mm_add_epi32(sum0, pred_round);
        sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

        sum1 = _mm_add_epi32(sum1, pred_round);
        sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

        sum0 = _mm_packus_epi16(sum0, sum1);
        sum0 = _mm_shuffle_epi8(sum0, gat);
        _mm_storel_epi64((__m128i *)dst, sum0);
        dst += stride;

        rep = _mm_add_epi16(rep, one);
    }
}

void svt_aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_h_w8(above, left, 4, pixels);

    __m128i ww[2];
    load_weight_h_w8(sm_weight_arrays, 4, ww);

    smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}

void svt_aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_h_w8(above, left, 8, pixels);

    __m128i ww[2];
    load_weight_h_w8(sm_weight_arrays, 8, ww);

    smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}

void svt_aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[2];
    load_pixel_h_w8(above, left, 16, pixels);

    __m128i ww[2];
    load_weight_h_w8(sm_weight_arrays, 16, ww);

    smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}

void svt_aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    __m128i pixels[4];
    load_pixel_h_w8(above, left, 32, pixels);

    __m128i ww[2];
    load_weight_h_w8(sm_weight_arrays, 32, ww);

    smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
    dst += stride << 3;
    smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
    dst += stride << 3;
    smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}

static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left, uint32_t bw,
    uint32_t bh) {
    const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
    const __m128i zero = _mm_setzero_si128();
    const __m128i scale_value =
        _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
    const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
    const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
    const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

    for (uint32_t y = 0; y < bh; ++y) {
        const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
        const __m128i tr_ly =
            _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

        for (uint32_t x = 0; x < bw; x += 8) {
            const __m128i weights_x =
                _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
            const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
            const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
            // Pair (scale - w_x, w_x) against (top_right, left[y]) so madd
            // yields top_right * (scale - w_x) + left[y] * w_x per pixel.
            const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
            const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
            __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
            __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

            pred_lo = _mm_add_epi32(pred_lo, pred_round);
            pred_hi = _mm_add_epi32(pred_hi, pred_round);

            pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
            pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

            __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
            pred = _mm_shuffle_epi8(pred, gat);
            _mm_storel_epi64((__m128i *)(dst + x), pred);
        }
        dst += stride;
    }
}

void svt_aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void svt_aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void svt_aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void svt_aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void svt_aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void svt_aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void svt_aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void svt_aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void svt_aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void svt_aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void svt_aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}

void svt_aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
    const uint8_t *above,
    const uint8_t *left) {
    smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}