/*
* Copyright(c) 2019 Intel Corporation
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
*/

#include <immintrin.h>
#include "EbHighbdIntraPrediction_SSE2.h"
#include "EbDefinitions.h"
#include "common_dsp_rtcd.h"
#include "EbIntraPrediction_AVX2.h"

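// These kernels are normally reached through the codec's RTCD dispatch. As an
// illustrative (hypothetical) direct call, a 10-bit 16x16 DC prediction would
// look like:
//   svt_aom_highbd_dc_predictor_16x16_avx2(dst, dst_stride, above, left, 10);
// where `above` points at the row of reconstructed pixels above the block and
// `left` at the column to its left, all stored as 16-bit samples.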
// =============================================================================

// DC RELATED PRED

// Handle number of elements: up to 64.
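// With bd <= 12, a 16-bit lane holds at most 16 pixel values at any point in
// this reduction (16 * 4095 = 65520), so the partial sums stay in 16 bits and
// only the final 4-lane reduction is widened to 32 bits.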
static INLINE __m128i dc_sum_large(const __m256i src) {
    const __m128i s_lo = _mm256_extracti128_si256(src, 0);
    const __m128i s_hi = _mm256_extracti128_si256(src, 1);
    __m128i       sum, sum_hi;
    sum    = _mm_add_epi16(s_lo, s_hi);
    sum_hi = _mm_srli_si128(sum, 8);
    sum    = _mm_add_epi16(sum, sum_hi);
    // Unpack to avoid 12-bit overflow.
    sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());

    return dc_sum_4x32bit(sum);
}

// Handle number of elements: 65 to 128.
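// Here a 16-bit lane may already hold 16 pixels once the two 128-bit halves
// are added, so the remaining reduction is widened to 32 bits before any
// further addition.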
static INLINE __m128i dc_sum_larger(const __m256i src) {
    const __m128i s_lo = _mm256_extracti128_si256(src, 0);
    const __m128i s_hi = _mm256_extracti128_si256(src, 1);
    __m128i       sum, sum_hi;
    sum = _mm_add_epi16(s_lo, s_hi);
    // Unpack to avoid 12-bit overflow.
    sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
    sum    = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
    sum    = _mm_add_epi32(sum, sum_hi);

    return dc_sum_4x32bit(sum);
}

static INLINE __m128i dc_sum_16(const uint16_t *const src) {
    const __m256i s    = _mm256_loadu_si256((const __m256i *)src);
    const __m128i s_lo = _mm256_extracti128_si256(s, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s, 1);
    const __m128i sum  = _mm_add_epi16(s_lo, s_hi);
    return dc_sum_8x16bit(sum);
}

static INLINE __m128i dc_sum_32(const uint16_t *const src) {
    const __m256i s0  = _mm256_loadu_si256((const __m256i *)(src + 0x00));
    const __m256i s1  = _mm256_loadu_si256((const __m256i *)(src + 0x10));
    const __m256i sum = _mm256_add_epi16(s0, s1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_64(const uint16_t *const src) {
    const __m256i s0  = _mm256_loadu_si256((const __m256i *)(src + 0x00));
    const __m256i s1  = _mm256_loadu_si256((const __m256i *)(src + 0x10));
    const __m256i s2  = _mm256_loadu_si256((const __m256i *)(src + 0x20));
    const __m256i s3  = _mm256_loadu_si256((const __m256i *)(src + 0x30));
    const __m256i s01 = _mm256_add_epi16(s0, s1);
    const __m256i s23 = _mm256_add_epi16(s2, s3);
    const __m256i sum = _mm256_add_epi16(s01, s23);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_4_16(const uint16_t *const src_4, const uint16_t *const src_16) {
    const __m128i s_4         = _mm_loadl_epi64((const __m128i *)src_4);
    const __m256i s_16        = _mm256_loadu_si256((const __m256i *)src_16);
    const __m128i s_lo        = _mm256_extracti128_si256(s_16, 0);
    const __m128i s_hi        = _mm256_extracti128_si256(s_16, 1);
    const __m128i s_16_sum0   = _mm_add_epi16(s_lo, s_hi);
    const __m128i s_16_sum_hi = _mm_srli_si128(s_16_sum0, 8);
    const __m128i s_16_sum    = _mm_add_epi16(s_16_sum0, s_16_sum_hi);
    const __m128i sum         = _mm_add_epi16(s_16_sum, s_4);
    return dc_sum_4x16bit_large(sum);
}

static INLINE __m128i dc_sum_8_16(const uint16_t *const src_8, const uint16_t *const src_16) {
    const __m128i s_8      = _mm_loadu_si128((const __m128i *)src_8);
    const __m256i s_16     = _mm256_loadu_si256((const __m256i *)src_16);
    const __m128i s_lo     = _mm256_extracti128_si256(s_16, 0);
    const __m128i s_hi     = _mm256_extracti128_si256(s_16, 1);
    const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
    const __m128i sum      = _mm_add_epi16(s_16_sum, s_8);
    return dc_sum_8x16bit_large(sum);
}

static INLINE __m128i dc_sum_8_32(const uint16_t *const src_8, const uint16_t *const src_32) {
    const __m128i s_8      = _mm_loadu_si128((const __m128i *)src_8);
    const __m256i s_32_0   = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1   = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i s_32     = _mm256_add_epi16(s_32_0, s_32_1);
    const __m128i s_lo     = _mm256_extracti128_si256(s_32, 0);
    const __m128i s_hi     = _mm256_extracti128_si256(s_32, 1);
    const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
    const __m128i sum      = _mm_add_epi16(s_8, s_16_sum);
    return dc_sum_8x16bit_large(sum);
}

static INLINE __m128i dc_sum_16_16(const uint16_t *const src0, const uint16_t *const src1) {
    const __m256i s0  = _mm256_loadu_si256((const __m256i *)src0);
    const __m256i s1  = _mm256_loadu_si256((const __m256i *)src1);
    const __m256i sum = _mm256_add_epi16(s0, s1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_16_32(const uint16_t *const src_16, const uint16_t *const src_32) {
    const __m256i s_16   = _mm256_loadu_si256((const __m256i *)src_16);
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i sum0   = _mm256_add_epi16(s_16, s_32_0);
    const __m256i sum    = _mm256_add_epi16(sum0, s_32_1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_32_32(const uint16_t *const src0, const uint16_t *const src1) {
    const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
    const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
    const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
    const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
    const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
    const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
    const __m256i sum  = _mm256_add_epi16(sum0, sum1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_32_64(const uint16_t *const src_32, const uint16_t *const src_64) {
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
    const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
    const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
    const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
    const __m256i sum0   = _mm256_add_epi16(s_32_0, s_64_0);
    const __m256i sum1   = _mm256_add_epi16(s_32_1, s_64_1);
    const __m256i sum2   = _mm256_add_epi16(s_64_2, s_64_3);
    const __m256i sum3   = _mm256_add_epi16(sum0, sum1);
    const __m256i sum    = _mm256_add_epi16(sum2, sum3);
    return dc_sum_larger(sum);
}

static INLINE __m128i dc_sum_64_64(const uint16_t *const src0, const uint16_t *const src1) {
    const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
    const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
    const __m256i s0_2 = _mm256_loadu_si256((const __m256i *)(src0 + 0x20));
    const __m256i s0_3 = _mm256_loadu_si256((const __m256i *)(src0 + 0x30));
    const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
    const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
    const __m256i s1_2 = _mm256_loadu_si256((const __m256i *)(src1 + 0x20));
    const __m256i s1_3 = _mm256_loadu_si256((const __m256i *)(src1 + 0x30));
    const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
    const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
    const __m256i sum2 = _mm256_add_epi16(s0_2, s1_2);
    const __m256i sum3 = _mm256_add_epi16(s0_3, s1_3);
    const __m256i sum4 = _mm256_add_epi16(sum0, sum1);
    const __m256i sum5 = _mm256_add_epi16(sum2, sum3);
    const __m256i sum  = _mm256_add_epi16(sum4, sum5);
    return dc_sum_larger(sum);
}

static INLINE __m128i dc_sum_16_64(const uint16_t *const src_16, const uint16_t *const src_64) {
    const __m256i s_16   = _mm256_loadu_si256((const __m256i *)src_16);
    const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
    const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
    const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
    const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
    const __m256i s0     = _mm256_add_epi16(s_16, s_64_0);
    const __m256i s1     = _mm256_add_epi16(s0, s_64_1);
    const __m256i s2     = _mm256_add_epi16(s_64_2, s_64_3);
    const __m256i sum    = _mm256_add_epi16(s1, s2);
    return dc_sum_larger(sum);
}

static INLINE void dc_common_predictor_16xh_kernel(uint16_t *dst, const ptrdiff_t stride,
                                                   const int32_t h, const __m256i dc) {
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)dst, dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_32xh_kernel(uint16_t *dst, const ptrdiff_t stride,
                                                   const int32_t h, const __m256i dc) {
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_64xh_kernel(uint16_t *dst, const ptrdiff_t stride,
                                                   const int32_t h, const __m256i dc) {
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x20), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x30), dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_16xh(uint16_t *const dst, const ptrdiff_t stride,
                                            const int32_t h, const __m128i dc) {
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_16xh_kernel(dst, stride, h, expected_dc);
}

static INLINE void dc_common_predictor_32xh(uint16_t *const dst, const ptrdiff_t stride,
                                            const int32_t h, const __m128i dc) {
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_32xh_kernel(dst, stride, h, expected_dc);
}

static INLINE void dc_common_predictor_64xh(uint16_t *const dst, const ptrdiff_t stride,
                                            const int32_t h, const __m128i dc) {
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_64xh_kernel(dst, stride, h, expected_dc);
}

// =============================================================================

// DC_128_PRED
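// DC_128 ignores the neighboring pixels and fills the block with the
// mid-range value 1 << (bd - 1) (e.g. 512 for 10-bit input).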

// 16xN

static INLINE void dc_128_predictor_16xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const int32_t h, const int32_t bd) {
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_16xh_kernel(dst, stride, h, dc);
}

void svt_aom_highbd_dc_128_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 4, bd);
}

void svt_aom_highbd_dc_128_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 8, bd);
}

void svt_aom_highbd_dc_128_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 16, bd);
}

void svt_aom_highbd_dc_128_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 32, bd);
}

void svt_aom_highbd_dc_128_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 64, bd);
}

// 32xN

static INLINE void dc_128_predictor_32xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const int32_t h, const int32_t bd) {
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_32xh_kernel(dst, stride, h, dc);
}

void svt_aom_highbd_dc_128_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 8, bd);
}

void svt_aom_highbd_dc_128_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 16, bd);
}

void svt_aom_highbd_dc_128_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 32, bd);
}

void svt_aom_highbd_dc_128_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 64, bd);
}

// 64xN

static INLINE void dc_128_predictor_64xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const int32_t h, const int32_t bd) {
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_64xh_kernel(dst, stride, h, dc);
}

void svt_aom_highbd_dc_128_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 16, bd);
}

void svt_aom_highbd_dc_128_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 32, bd);
}

void svt_aom_highbd_dc_128_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 64, bd);
}

// =============================================================================

// DC_LEFT_PRED
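// DC_LEFT averages only the left column: dc = (sum(left) + h / 2) >> log2(h).
// Sums of up to 16 pixels fit in 16-bit lanes (epi16 round/shift); 32- and
// 64-pixel sums come back reduced in 32 bits, hence the epi32 variants below.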

// 16xN

void svt_aom_highbd_dc_left_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(2);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_4(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 2);
    dc_common_predictor_16xh(dst, stride, 4, sum);
}

void svt_aom_highbd_dc_left_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(4);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_8(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 3);
    dc_common_predictor_16xh(dst, stride, 8, sum);
}

void svt_aom_highbd_dc_left_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_16xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_left_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_16xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_left_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_16xh(dst, stride, 64, sum);
}

// 32xN

void svt_aom_highbd_dc_left_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(4);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_8(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 3);
    dc_common_predictor_32xh(dst, stride, 8, sum);
}

void svt_aom_highbd_dc_left_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_32xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_left_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_32xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_left_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_32xh(dst, stride, 64, sum);
}

// 64xN

void svt_aom_highbd_dc_left_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_64xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_left_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_64xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_left_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i       sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_64xh(dst, stride, 64, sum);
}

// =============================================================================

// DC_TOP_PRED
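// DC_TOP averages only the above row: dc = (sum(above) + w / 2) >> log2(w).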

// 16xN

static INLINE void dc_top_predictor_16xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const uint16_t *const above, const int32_t h,
                                         const int32_t bd) {
    (void)bd;
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i       sum;

    sum = dc_sum_16(above);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_16xh(dst, stride, h, sum);
}

void svt_aom_highbd_dc_top_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 4, bd);
}

void svt_aom_highbd_dc_top_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 8, bd);
}

void svt_aom_highbd_dc_top_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 16, bd);
}

void svt_aom_highbd_dc_top_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 32, bd);
}

void svt_aom_highbd_dc_top_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 64, bd);
}

// 32xN

static INLINE void dc_top_predictor_32xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const uint16_t *const above, const int32_t h,
                                         const int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i       sum;
    (void)bd;

    sum = dc_sum_32(above);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_32xh(dst, stride, h, sum);
}

void svt_aom_highbd_dc_top_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 8, bd);
}

void svt_aom_highbd_dc_top_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 16, bd);
}

void svt_aom_highbd_dc_top_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 32, bd);
}

void svt_aom_highbd_dc_top_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 64, bd);
}

// 64xN

static INLINE void dc_top_predictor_64xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const uint16_t *const above, const int32_t h,
                                         const int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i       sum;
    (void)bd;

    sum = dc_sum_64(above);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_64xh(dst, stride, h, sum);
}

void svt_aom_highbd_dc_top_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 16, bd);
}

void svt_aom_highbd_dc_top_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 32, bd);
}

void svt_aom_highbd_dc_top_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 64, bd);
}

// =============================================================================

// DC_PRED
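// DC averages both the above row and the left column:
//   dc = (sum(above, w) + sum(left, h) + ((w + h) >> 1)) / (w + h)
// Square blocks divide by a power of two, so the rounding is an add + shift;
// rectangular blocks keep the integer division, e.g. 16x4 uses (sum + 10) / 20.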

// 16xN

void svt_aom_highbd_dc_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_4_16(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 10;
    sum32 /= 20;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 4, dc);
}

void svt_aom_highbd_dc_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_8_16(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 12;
    sum32 /= 24;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 8, dc);
}

void svt_aom_highbd_dc_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_16(above, left);
    sum         = _mm_add_epi32(sum, _mm_set1_epi32(16));
    sum         = _mm_srli_epi32(sum, 5);
    dc_common_predictor_16xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_16_32(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 24;
    sum32 /= 48;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 32, dc);
}

void svt_aom_highbd_dc_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_16_64(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 40;
    sum32 /= 80;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 64, dc);
}

// 32xN

void svt_aom_highbd_dc_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_8_32(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 20;
    sum32 /= 40;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 8, dc);
}

void svt_aom_highbd_dc_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_16_32(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 24;
    sum32 /= 48;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 16, dc);
}

void svt_aom_highbd_dc_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_32(above, left);
    sum         = _mm_add_epi32(sum, _mm_set1_epi32(32));
    sum         = _mm_srli_epi32(sum, 6);
    dc_common_predictor_32xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_32_64(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 48;
    sum32 /= 96;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 64, dc);
}

// 64xN

void svt_aom_highbd_dc_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_16_64(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 40;
    sum32 /= 80;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_64xh_kernel(dst, stride, 16, dc);
}

void svt_aom_highbd_dc_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i  sum   = dc_sum_32_64(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 48;
    sum32 /= 96;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_64xh_kernel(dst, stride, 32, dc);
}

void svt_aom_highbd_dc_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_64_64(above, left);
    sum         = _mm_add_epi32(sum, _mm_set1_epi32(64));
    sum         = _mm_srli_epi32(sum, 7);
    dc_common_predictor_64xh(dst, stride, 64, sum);
}

// =============================================================================

// H_PRED
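// H_PRED copies each left pixel across its row. _mm256_broadcastw_epi16()
// replicates the lowest 16-bit lane of its source, so each left pixel is
// shifted down to lane 0 with _mm_srli_si128() before being broadcast.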

// 16xN

static INLINE void h_pred_16(uint16_t **const dst, const ptrdiff_t stride, const __m128i left) {
    // Broadcast the 16-bit left pixel to 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_16x8(uint16_t **dst, const ptrdiff_t stride, const uint16_t *const left) {
    const __m128i left_u16 = _mm_loadu_si128((const __m128i *)left);

    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 14));
}

// 16x4

void svt_aom_highbd_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);

    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 6));
}

// 16x64

void svt_aom_highbd_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 8; i++, left += 8) h_pred_16x8(&dst, stride, left);
}

// -----------------------------------------------------------------------------

// 32xN

static INLINE void h_pred_32(uint16_t **const dst, const ptrdiff_t stride, const __m128i left) {
    // Broadcast the 16-bit left pixel to 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_32x8(uint16_t **dst, const ptrdiff_t stride, const uint16_t *const left) {
    const __m128i left_u16 = _mm_loadu_si128((const __m128i *)left);

    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 14));
}

// 32x8

void svt_aom_highbd_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    h_pred_32x8(&dst, stride, left);
}

// 32x64

void svt_aom_highbd_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 8; i++, left += 8) h_pred_32x8(&dst, stride, left);
}

// -----------------------------------------------------------------------------

// 64xN

static INLINE void h_pred_64(uint16_t **const dst, const ptrdiff_t stride, const __m128i left) {
    // Broadcast the 16-bit left pixel to 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_64x8(uint16_t **dst, const ptrdiff_t stride, const uint16_t *const left) {
    const __m128i left_u16 = _mm_loadu_si128((const __m128i *)left);

    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 14));
}

// 64x16

void svt_aom_highbd_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 2; i++, left += 8) h_pred_64x8(&dst, stride, left);
}

// 64x32

void svt_aom_highbd_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 4; i++, left += 8) h_pred_64x8(&dst, stride, left);
}

// 64x64

void svt_aom_highbd_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 8; i++, left += 8) h_pred_64x8(&dst, stride, left);
}

// =============================================================================

// V_PRED
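// V_PRED loads the above row once and stores it into every row of the block.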
971 
972 // 16xN
973 
v_pred_16(uint16_t ** const dst,const ptrdiff_t stride,const __m256i above0)974 static INLINE void v_pred_16(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0) {
975     _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
976     *dst += stride;
977 }
978 
979 // Process 8 rows.
v_pred_16x8(uint16_t ** const dst,const ptrdiff_t stride,const __m256i above)980 static INLINE void v_pred_16x8(uint16_t **const dst, const ptrdiff_t stride, const __m256i above) {
981     v_pred_16(dst, stride, above);
982     v_pred_16(dst, stride, above);
983     v_pred_16(dst, stride, above);
984     v_pred_16(dst, stride, above);
985     v_pred_16(dst, stride, above);
986     v_pred_16(dst, stride, above);
987     v_pred_16(dst, stride, above);
988     v_pred_16(dst, stride, above);
989 }
990 
991 // 16x4
992 
svt_aom_highbd_v_predictor_16x4_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)993 void svt_aom_highbd_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
994                                           const uint16_t *left, int32_t bd) {
995     // Load all 16 pixels in a row into 256-bit registers.
996     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
997 
998     (void)left;
999     (void)bd;
1000 
1001     v_pred_16(&dst, stride, above0);
1002     v_pred_16(&dst, stride, above0);
1003     v_pred_16(&dst, stride, above0);
1004     v_pred_16(&dst, stride, above0);
1005 }
1006 
1007 // 16x8
1008 
svt_aom_highbd_v_predictor_16x8_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)1009 void svt_aom_highbd_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1010                                           const uint16_t *left, int32_t bd) {
1011     // Load all 16 pixels in a row into 256-bit registers.
1012     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1013 
1014     (void)left;
1015     (void)bd;
1016 
1017     v_pred_16x8(&dst, stride, above0);
1018 }
1019 
1020 // 16x16
1021 
svt_aom_highbd_v_predictor_16x16_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)1022 void svt_aom_highbd_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1023                                            const uint16_t *left, int32_t bd) {
1024     // Load all 16 pixels in a row into 256-bit registers.
1025     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1026 
1027     (void)left;
1028     (void)bd;
1029 
1030     for (int32_t i = 0; i < 2; i++) v_pred_16x8(&dst, stride, above0);
1031 }
1032 
1033 // 16x32
1034 
svt_aom_highbd_v_predictor_16x32_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)1035 void svt_aom_highbd_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1036                                            const uint16_t *left, int32_t bd) {
1037     // Load all 16 pixels in a row into 256-bit registers.
1038     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1039 
1040     (void)left;
1041     (void)bd;
1042 
1043     for (int32_t i = 0; i < 4; i++) v_pred_16x8(&dst, stride, above0);
1044 }
1045 
1046 // 16x64
1047 
svt_aom_highbd_v_predictor_16x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)1048 void svt_aom_highbd_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1049                                            const uint16_t *left, int32_t bd) {
1050     // Load all 16 pixels in a row into 256-bit registers.
1051     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1052 
1053     (void)left;
1054     (void)bd;
1055 
1056     for (int32_t i = 0; i < 8; i++) v_pred_16x8(&dst, stride, above0);
1057 }
1058 
1059 // -----------------------------------------------------------------------------
1060 
1061 // 32xN
1062 
v_pred_32(uint16_t ** const dst,const ptrdiff_t stride,const __m256i above0,const __m256i above1)1063 static INLINE void v_pred_32(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
1064                              const __m256i above1) {
1065     _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
1066     _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
1067     *dst += stride;
1068 }
1069 
1070 // Process 8 rows.
v_pred_32x8(uint16_t ** const dst,const ptrdiff_t stride,const __m256i above0,const __m256i above1)1071 static INLINE void v_pred_32x8(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
1072                                const __m256i above1) {
1073     v_pred_32(dst, stride, above0, above1);
1074     v_pred_32(dst, stride, above0, above1);
1075     v_pred_32(dst, stride, above0, above1);
1076     v_pred_32(dst, stride, above0, above1);
1077     v_pred_32(dst, stride, above0, above1);
1078     v_pred_32(dst, stride, above0, above1);
1079     v_pred_32(dst, stride, above0, above1);
1080     v_pred_32(dst, stride, above0, above1);
1081 }
1082 
1083 // 32x8
1084 
svt_aom_highbd_v_predictor_32x8_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)1085 void svt_aom_highbd_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1086                                           const uint16_t *left, int32_t bd) {
1087     // Load all 32 pixels in a row into 256-bit registers.
1088     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1089     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1090 
1091     (void)left;
1092     (void)bd;
1093 
1094     v_pred_32x8(&dst, stride, above0, above1);
1095 }
1096 
1097 // 32x16
1098 
1099 void svt_aom_highbd_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1100                                            const uint16_t *left, int32_t bd) {
1101     // Load all 32 pixels in a row into 256-bit registers.
1102     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1103     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1104 
1105     (void)left;
1106     (void)bd;
1107 
1108     for (int32_t i = 0; i < 2; i++) v_pred_32x8(&dst, stride, above0, above1);
1109 }
1110 
1111 // 32x32
1112 
1113 void svt_aom_highbd_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1114                                            const uint16_t *left, int32_t bd) {
1115     // Load all 32 pixels in a row into 256-bit registers.
1116     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1117     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1118 
1119     (void)left;
1120     (void)bd;
1121 
1122     for (int32_t i = 0; i < 4; i++) v_pred_32x8(&dst, stride, above0, above1);
1123 }
1124 
1125 // 32x64
1126 
1127 void svt_aom_highbd_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1128                                            const uint16_t *left, int32_t bd) {
1129     // Load all 32 pixels in a row into 256-bit registers.
1130     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1131     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1132 
1133     (void)left;
1134     (void)bd;
1135 
1136     for (int32_t i = 0; i < 8; i++) v_pred_32x8(&dst, stride, above0, above1);
1137 }
1138 
1139 // -----------------------------------------------------------------------------
1140 
1141 // 64xN
1142 
1143 static INLINE void v_pred_64(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
1144                              const __m256i above1, const __m256i above2, const __m256i above3) {
1145     _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
1146     _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
1147     _mm256_storeu_si256((__m256i *)(*dst + 0x20), above2);
1148     _mm256_storeu_si256((__m256i *)(*dst + 0x30), above3);
1149     *dst += stride;
1150 }
1151 
1152 // Process 8 rows.
1153 static INLINE void v_pred_64x8(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
1154                                const __m256i above1, const __m256i above2, const __m256i above3) {
1155     v_pred_64(dst, stride, above0, above1, above2, above3);
1156     v_pred_64(dst, stride, above0, above1, above2, above3);
1157     v_pred_64(dst, stride, above0, above1, above2, above3);
1158     v_pred_64(dst, stride, above0, above1, above2, above3);
1159     v_pred_64(dst, stride, above0, above1, above2, above3);
1160     v_pred_64(dst, stride, above0, above1, above2, above3);
1161     v_pred_64(dst, stride, above0, above1, above2, above3);
1162     v_pred_64(dst, stride, above0, above1, above2, above3);
1163 }
1164 
1165 // 64x16
1166 
1167 void svt_aom_highbd_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1168                                            const uint16_t *left, int32_t bd) {
1169     // Load all 64 pixels in a row into 256-bit registers.
1170     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1171     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1172     const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
1173     const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
1174 
1175     (void)left;
1176     (void)bd;
1177 
1178     for (int32_t i = 0; i < 2; i++) v_pred_64x8(&dst, stride, above0, above1, above2, above3);
1179 }
1180 
1181 // 64x32
1182 
1183 void svt_aom_highbd_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1184                                            const uint16_t *left, int32_t bd) {
1185     // Load all 64 pixels in a row into 256-bit registers.
1186     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1187     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1188     const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
1189     const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
1190 
1191     (void)left;
1192     (void)bd;
1193 
1194     for (int32_t i = 0; i < 4; i++) v_pred_64x8(&dst, stride, above0, above1, above2, above3);
1195 }
1196 
1197 // 64x64
1198 
1199 void svt_aom_highbd_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1200                                            const uint16_t *left, int32_t bd) {
1201     // Load all 64 pixels in a row into 256-bit registers.
1202     const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
1203     const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
1204     const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
1205     const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));
1206 
1207     (void)left;
1208     (void)bd;
1209 
1210     for (int32_t i = 0; i < 8; i++) v_pred_64x8(&dst, stride, above0, above1, above2, above3);
1211 }
1212 
1213 // =============================================================================
1214 
1215 // Repeat for AVX2 optimizations.
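// Each table below stores the smooth weights as interleaved (w, 256 - w)
// pairs (every pair sums to 256, matching the rounding used in the kernels),
// and each group of four pairs is repeated so both 128-bit lanes of a
// 256-bit load carry the same weights.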
1216 
1217 // bs = 4
1218 EB_ALIGN(32)
1219 static const uint16_t sm_weights_d_4[16] = {
1220     255,
1221     1,
1222     149,
1223     107,
1224     85,
1225     171,
1226     64,
1227     192, //  0  1  2  3
1228     255,
1229     1,
1230     149,
1231     107,
1232     85,
1233     171,
1234     64,
1235     192 //  0  1  2  3
1236 };
1237 
1238 // bs = 8
1239 EB_ALIGN(32)
1240 static const uint16_t sm_weights_d_8[32] = {
1241     255, 1,   197, 59,  146, 110, 105, 151, //  0  1  2  3
1242     255, 1,   197, 59,  146, 110, 105, 151, //  0  1  2  3
1243     73,  183, 50,  206, 37,  219, 32,  224, //  4  5  6  7
1244     73,  183, 50,  206, 37,  219, 32,  224 //  4  5  6  7
1245 };
1246 
1247 // bs = 16
1248 EB_ALIGN(32)
1249 static const uint16_t sm_weights_d_16[64] = {
1250     255, 1,   225, 31,  196, 60,  170, 86, //  0  1  2  3
1251     255, 1,   225, 31,  196, 60,  170, 86, //  0  1  2  3
1252     145, 111, 123, 133, 102, 154, 84,  172, //  4  5  6  7
1253     145, 111, 123, 133, 102, 154, 84,  172, //  4  5  6  7
1254     68,  188, 54,  202, 43,  213, 33,  223, //  8  9 10 11
1255     68,  188, 54,  202, 43,  213, 33,  223, //  8  9 10 11
1256     26,  230, 20,  236, 17,  239, 16,  240, // 12 13 14 15
1257     26,  230, 20,  236, 17,  239, 16,  240 // 12 13 14 15
1258 };
1259 
1260 // bs = 32
1261 EB_ALIGN(32)
1262 static const uint16_t sm_weights_d_32[128] = {
1263     255, 1,   240, 16,  225, 31,  210, 46, //  0  1  2  3
1264     255, 1,   240, 16,  225, 31,  210, 46, //  0  1  2  3
1265     196, 60,  182, 74,  169, 87,  157, 99, //  4  5  6  7
1266     196, 60,  182, 74,  169, 87,  157, 99, //  4  5  6  7
1267     145, 111, 133, 123, 122, 134, 111, 145, //  8  9 10 11
1268     145, 111, 133, 123, 122, 134, 111, 145, //  8  9 10 11
1269     101, 155, 92,  164, 83,  173, 74,  182, // 12 13 14 15
1270     101, 155, 92,  164, 83,  173, 74,  182, // 12 13 14 15
1271     66,  190, 59,  197, 52,  204, 45,  211, // 16 17 18 19
1272     66,  190, 59,  197, 52,  204, 45,  211, // 16 17 18 19
1273     39,  217, 34,  222, 29,  227, 25,  231, // 20 21 22 23
1274     39,  217, 34,  222, 29,  227, 25,  231, // 20 21 22 23
1275     21,  235, 17,  239, 14,  242, 12,  244, // 24 25 26 27
1276     21,  235, 17,  239, 14,  242, 12,  244, // 24 25 26 27
1277     10,  246, 9,   247, 8,   248, 8,   248, // 28 29 30 31
1278     10,  246, 9,   247, 8,   248, 8,   248 // 28 29 30 31
1279 };
1280 
1281 // bs = 64
1282 EB_ALIGN(32)
1283 static const uint16_t sm_weights_d_64[256] = {
1284     255, 1,   248, 8,   240, 16,  233, 23, //  0  1  2  3
1285     255, 1,   248, 8,   240, 16,  233, 23, //  0  1  2  3
1286     225, 31,  218, 38,  210, 46,  203, 53, //  4  5  6  7
1287     225, 31,  218, 38,  210, 46,  203, 53, //  4  5  6  7
1288     196, 60,  189, 67,  182, 74,  176, 80, //  8  9 10 11
1289     196, 60,  189, 67,  182, 74,  176, 80, //  8  9 10 11
1290     169, 87,  163, 93,  156, 100, 150, 106, // 12 13 14 15
1291     169, 87,  163, 93,  156, 100, 150, 106, // 12 13 14 15
1292     144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1293     144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1294     121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1295     121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1296     101, 155, 96,  160, 91,  165, 86,  170, // 24 25 26 27
1297     101, 155, 96,  160, 91,  165, 86,  170, // 24 25 26 27
1298     82,  174, 77,  179, 73,  183, 69,  187, // 28 29 30 31
1299     82,  174, 77,  179, 73,  183, 69,  187, // 28 29 30 31
1300     65,  191, 61,  195, 57,  199, 54,  202, // 32 33 34 35
1301     65,  191, 61,  195, 57,  199, 54,  202, // 32 33 34 35
1302     50,  206, 47,  209, 44,  212, 41,  215, // 36 37 38 39
1303     50,  206, 47,  209, 44,  212, 41,  215, // 36 37 38 39
1304     38,  218, 35,  221, 32,  224, 29,  227, // 40 41 42 43
1305     38,  218, 35,  221, 32,  224, 29,  227, // 40 41 42 43
1306     27,  229, 25,  231, 22,  234, 20,  236, // 44 45 46 47
1307     27,  229, 25,  231, 22,  234, 20,  236, // 44 45 46 47
1308     18,  238, 16,  240, 15,  241, 13,  243, // 48 49 50 51
1309     18,  238, 16,  240, 15,  241, 13,  243, // 48 49 50 51
1310     12,  244, 10,  246, 9,   247, 8,   248, // 52 53 54 55
1311     12,  244, 10,  246, 9,   247, 8,   248, // 52 53 54 55
1312     7,   249, 6,   250, 6,   250, 5,   251, // 56 57 58 59
1313     7,   249, 6,   250, 6,   250, 5,   251, // 56 57 58 59
1314     5,   251, 4,   252, 4,   252, 4,   252, // 60 61 62 63
1315     5,   251, 4,   252, 4,   252, 4,   252 // 60 61 62 63
1316 };
1317 
1318 // -----------------------------------------------------------------------------
1319 
1320 // Shuffle for AVX2 optimizations.
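// Same (w, 256 - w) pairs, reordered so that one 256-bit load holds columns
// 0-3 in its low lane and columns 8-11 in its high lane (and so on), which
// matches the unpacklo/unpackhi interleaving done by prepare_ab below.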
1321 
1322 // bs = 16
1323 EB_ALIGN(32)
1324 static const uint16_t sm_weights_16[32] = {
1325     255, 1,   225, 31,  196, 60,  170, 86, //  0  1  2  3
1326     68,  188, 54,  202, 43,  213, 33,  223, //  8  9 10 11
1327     145, 111, 123, 133, 102, 154, 84,  172, //  4  5  6  7
1328     26,  230, 20,  236, 17,  239, 16,  240 // 12 13 14 15
1329 };
1330 
1331 // bs = 32
1332 EB_ALIGN(32)
1333 static const uint16_t sm_weights_32[64] = {
1334     255, 1,   240, 16,  225, 31,  210, 46, //  0  1  2  3
1335     145, 111, 133, 123, 122, 134, 111, 145, //  8  9 10 11
1336     196, 60,  182, 74,  169, 87,  157, 99, //  4  5  6  7
1337     101, 155, 92,  164, 83,  173, 74,  182, // 12 13 14 15
1338     66,  190, 59,  197, 52,  204, 45,  211, // 16 17 18 19
1339     21,  235, 17,  239, 14,  242, 12,  244, // 24 25 26 27
1340     39,  217, 34,  222, 29,  227, 25,  231, // 20 21 22 23
1341     10,  246, 9,   247, 8,   248, 8,   248 // 28 29 30 31
1342 };
1343 
1344 // bs = 64
1345 EB_ALIGN(32)
1346 static const uint16_t sm_weights_64[128] = {
1347     255, 1,   248, 8,   240, 16,  233, 23, //  0  1  2  3
1348     196, 60,  189, 67,  182, 74,  176, 80, //  8  9 10 11
1349     225, 31,  218, 38,  210, 46,  203, 53, //  4  5  6  7
1350     169, 87,  163, 93,  156, 100, 150, 106, // 12 13 14 15
1351     144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1352     101, 155, 96,  160, 91,  165, 86,  170, // 24 25 26 27
1353     121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1354     82,  174, 77,  179, 73,  183, 69,  187, // 28 29 30 31
1355     65,  191, 61,  195, 57,  199, 54,  202, // 32 33 34 35
1356     38,  218, 35,  221, 32,  224, 29,  227, // 40 41 42 43
1357     50,  206, 47,  209, 44,  212, 41,  215, // 36 37 38 39
1358     27,  229, 25,  231, 22,  234, 20,  236, // 44 45 46 47
1359     18,  238, 16,  240, 15,  241, 13,  243, // 48 49 50 51
1360     7,   249, 6,   250, 6,   250, 5,   251, // 56 57 58 59
1361     12,  244, 10,  246, 9,   247, 8,   248, // 52 53 54 55
1362     5,   251, 4,   252, 4,   252, 4,   252 // 60 61 62 63
1363 };
1364 
1365 // SMOOTH_PRED
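// Each output pixel is a four-term blend; in scalar form it is roughly
//   pred(r, c) = (w_h[r] * above[c] + (256 - w_h[r]) * left[H - 1] +
//                 w_w[c] * left[r]  + (256 - w_w[c]) * above[W - 1] +
//                 (1 << sm_weight_log2_scale)) >> (1 + sm_weight_log2_scale)
// The helpers below keep (above[c], left[H - 1]) and (left[r], above[W - 1])
// interleaved, so each pair of terms reduces to one _mm256_madd_epi16.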
1366 
1367 // 8xN
1368 
1369 static INLINE void load_right_weights_8(const uint16_t *const above, __m256i *const r,
1370                                         __m256i *const weights) {
1371     *r = _mm256_set1_epi16((uint16_t)above[7]);
1372 
1373     // 0 1 2 3  0 1 2 3
1374     weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_d_8 + 0x00));
1375     // 4 5 6 7  4 5 6 7
1376     weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_d_8 + 0x10));
1377 }
1378 
1379 static INLINE __m256i load_left_4(const uint16_t *const left, const __m256i r) {
1380     const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
1381     // 0 1 2 3 x x x x  0 1 2 3 x x x x
1382     const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
1383     return _mm256_unpacklo_epi16(l, r); // 0 1 2 3  0 1 2 3
1384 }
1385 
1386 static INLINE void load_left_8(const uint16_t *const left, const __m256i r, __m256i *const lr) {
1387     const __m128i l0 = _mm_loadu_si128((const __m128i *)left);
1388     // 0 1 2 3 4 5 6 7  0 1 2 3 4 5 6 7
1389     const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
1390     lr[0]           = _mm256_unpacklo_epi16(l, r); // 0 1 2 3  0 1 2 3
1391     lr[1]           = _mm256_unpackhi_epi16(l, r); // 4 5 6 7  4 5 6 7
1392 }
1393 
1394 static INLINE void init_8(const uint16_t *const above, const uint16_t *const left, const int32_t h,
1395                           __m256i *const ab, __m256i *const r, __m256i *const weights_w,
1396                           __m256i *const rep) {
1397     const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
1398     const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
1399     const __m256i b  = _mm256_set1_epi16((uint16_t)left[h - 1]);
1400     __m256i       a[2];
1401     a[0]  = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
1402     a[1]  = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
1403     ab[0] = _mm256_unpacklo_epi16(a[0], b);
1404     ab[1] = _mm256_unpacklo_epi16(a[1], b);
1405     load_right_weights_8(above, r, weights_w);
1406 
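    // shuffle_epi8 masks: within each 128-bit lane, rep[i] broadcasts one
    // 32-bit (left, right) pair of lr to every dword, selecting a different
    // row per lane so one kernel call produces two 8-wide rows.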
1407     const __m128i rep0 = _mm_set1_epi32(0x03020100);
1408     const __m128i rep1 = _mm_set1_epi32(0x07060504);
1409     const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
1410     const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
1411     rep[0]             = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
1412     rep[1]             = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
1413 }
1414 
1415 static INLINE __m256i smooth_pred_kernel(const __m256i *const weights_w, const __m256i weights_h,
1416                                          const __m256i rep, const __m256i *const ab,
1417                                          const __m256i lr) {
1418     const __m256i round = _mm256_set1_epi32((1 << sm_weight_log2_scale));
1419     __m256i       s[2], sum[2];
1420     // 0 0 0 0  1 1 1 1
1421     const __m256i w = _mm256_shuffle_epi8(weights_h, rep);
1422     const __m256i t = _mm256_shuffle_epi8(lr, rep);
1423     s[0]            = _mm256_madd_epi16(ab[0], w);
1424     s[1]            = _mm256_madd_epi16(ab[1], w);
1425     // width 8: 00 01 02 03  10 11 12 13
1426     // width 16: 0 1 2 3  8 9 A B
1427     sum[0] = _mm256_madd_epi16(t, weights_w[0]);
1428     // width 8: 04 05 06 07  14 15 16 17
1429     // width 16: 4 5 6 7  C D E F
1430     sum[1] = _mm256_madd_epi16(t, weights_w[1]);
1431     sum[0] = _mm256_add_epi32(sum[0], s[0]);
1432     sum[1] = _mm256_add_epi32(sum[1], s[1]);
1433     sum[0] = _mm256_add_epi32(sum[0], round);
1434     sum[1] = _mm256_add_epi32(sum[1], round);
1435     sum[0] = _mm256_srai_epi32(sum[0], 1 + sm_weight_log2_scale);
1436     sum[1] = _mm256_srai_epi32(sum[1], 1 + sm_weight_log2_scale);
1437     // width 8: 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
1438     // width 16: 0 1 2 3 4 5 6 7  8 9 A B C D E F
1439     return _mm256_packs_epi32(sum[0], sum[1]);
1440 }
1441 
1442 static INLINE void smooth_pred_8x2(const __m256i *const weights_w, const __m256i weights_h,
1443                                    const __m256i rep, const __m256i *const ab, const __m256i lr,
1444                                    uint16_t **const dst, const ptrdiff_t stride) {
1445     // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
1446     const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
1447     _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
1448     *dst += stride;
1449     _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
1450     *dst += stride;
1451 }
1452 
1453 static INLINE void smooth_pred_8x4(const __m256i *const  weights_w,
1454                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
1455                                    const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1456                                    const ptrdiff_t stride) {
1457     const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
1458     smooth_pred_8x2(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1459     smooth_pred_8x2(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1460 }
1461 
1462 static INLINE void smooth_pred_8x8(const uint16_t *const left, const __m256i *const weights_w,
1463                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
1464                                    const __m256i *const ab, const __m256i r, uint16_t **const dst,
1465                                    const ptrdiff_t stride) {
1466     __m256i lr[2];
1467     load_left_8(left, r, lr);
1468 
1469     smooth_pred_8x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1470     smooth_pred_8x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1471 }
1472 
1473 // 8x4
1474 
1475 void svt_aom_highbd_smooth_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
1476                                               const uint16_t *above, const uint16_t *left,
1477                                               int32_t bd) {
1478     __m256i ab[2], r, lr, weights_w[2], rep[2];
1479     (void)bd;
1480 
1481     init_8(above, left, 4, ab, &r, weights_w, rep);
1482     lr = load_left_4(left, r);
1483     smooth_pred_8x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
1484 }
1485 
1486 // 8x8
1487 
1488 void svt_aom_highbd_smooth_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
1489                                               const uint16_t *above, const uint16_t *left,
1490                                               int32_t bd) {
1491     __m256i ab[2], r, weights_w[2], rep[2];
1492     (void)bd;
1493 
1494     init_8(above, left, 8, ab, &r, weights_w, rep);
1495 
1496     smooth_pred_8x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
1497 }
1498 
1499 // 8x16
1500 
1501 void svt_aom_highbd_smooth_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
1502                                                const uint16_t *above, const uint16_t *left,
1503                                                int32_t bd) {
1504     __m256i ab[2], r, weights_w[2], rep[2];
1505     (void)bd;
1506 
1507     init_8(above, left, 16, ab, &r, weights_w, rep);
1508 
1509     for (int32_t i = 0; i < 2; i++) {
1510         smooth_pred_8x8(
1511             left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
1512     }
1513 }
1514 
1515 // 8x32
1516 
1517 void svt_aom_highbd_smooth_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
1518                                                const uint16_t *above, const uint16_t *left,
1519                                                int32_t bd) {
1520     __m256i ab[2], r, weights_w[2], rep[2];
1521     (void)bd;
1522 
1523     init_8(above, left, 32, ab, &r, weights_w, rep);
1524 
1525     for (int32_t i = 0; i < 4; i++) {
1526         smooth_pred_8x8(
1527             left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
1528     }
1529 }
1530 
1531 // -----------------------------------------------------------------------------
1532 // 16xN
1533 
1534 static INLINE void load_right_weights_16(const uint16_t *const above, __m256i *const r,
1535                                          __m256i *const weights) {
1536     *r = _mm256_set1_epi16((uint16_t)above[15]);
1537 
1538     //  0  1  2  3   8  9 10 11
1539     weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_16 + 0x00));
1540     //  4  5  6  7  12 13 14 15
1541     weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_16 + 0x10));
1542 }
1543 
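// Interleave the above row with the bottom-left estimate b, so a single
// _mm256_madd_epi16 later yields w_h * above[c] + (256 - w_h) * b per pixel.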
1544 static INLINE void prepare_ab(const uint16_t *const above, const __m256i b, __m256i *const ab) {
1545     const __m256i a = _mm256_loadu_si256((const __m256i *)above);
1546     ab[0]           = _mm256_unpacklo_epi16(a, b);
1547     ab[1]           = _mm256_unpackhi_epi16(a, b);
1548 }
1549 
1550 static INLINE void init_16(const uint16_t *const above, const uint16_t *const left, const int32_t h,
1551                            __m256i *const ab, __m256i *const r, __m256i *const weights_w,
1552                            __m256i *const rep) {
1553     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1554     prepare_ab(above, b, ab);
1555     load_right_weights_16(above, r, weights_w);
1556 
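    // For widths of 16 and up both lanes work on the same row, so every
    // rep[i] broadcasts the i-th (left, right) pair of lr to the whole
    // register.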
1557     rep[0] = _mm256_set1_epi32(0x03020100);
1558     rep[1] = _mm256_set1_epi32(0x07060504);
1559     rep[2] = _mm256_set1_epi32(0x0B0A0908);
1560     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
1561 }
1562 
1563 static INLINE void smooth_pred_16(const __m256i *const weights_w, const __m256i weights_h,
1564                                   const __m256i rep, const __m256i *const ab, const __m256i lr,
1565                                   uint16_t **const dst, const ptrdiff_t stride) {
1566     const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
1567     _mm256_storeu_si256((__m256i *)*dst, d);
1568     *dst += stride;
1569 }
1570 
1571 static INLINE void smooth_pred_16x4(const __m256i *const  weights_w,
1572                                     const uint16_t *const sm_weights_h, const __m256i *const rep,
1573                                     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1574                                     const ptrdiff_t stride) {
1575     const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
1576     smooth_pred_16(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1577     smooth_pred_16(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1578     smooth_pred_16(weights_w, weights_h, rep[2], ab, lr, dst, stride);
1579     smooth_pred_16(weights_w, weights_h, rep[3], ab, lr, dst, stride);
1580 }
1581 
1582 static INLINE void smooth_pred_16x8(const uint16_t *const left, const __m256i *const weights_w,
1583                                     const uint16_t *const sm_weights_h, const __m256i *const rep,
1584                                     const __m256i *const ab, const __m256i r, uint16_t **const dst,
1585                                     const ptrdiff_t stride) {
1586     __m256i lr[2];
1587     load_left_8(left, r, lr);
1588 
1589     smooth_pred_16x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1590     smooth_pred_16x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1591 }
1592 
1593 // 16x4
1594 
1595 void svt_aom_highbd_smooth_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
1596                                                const uint16_t *above, const uint16_t *left,
1597                                                int32_t bd) {
1598     __m256i ab[2], r, lr, weights_w[2], rep[4];
1599     (void)bd;
1600 
1601     init_16(above, left, 4, ab, &r, weights_w, rep);
1602     lr = load_left_4(left, r);
1603     smooth_pred_16x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
1604 }
1605 
1606 // 16x8
1607 
1608 void svt_aom_highbd_smooth_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
1609                                                const uint16_t *above, const uint16_t *left,
1610                                                int32_t bd) {
1611     __m256i ab[2], r, weights_w[2], rep[4];
1612     (void)bd;
1613 
1614     init_16(above, left, 8, ab, &r, weights_w, rep);
1615 
1616     smooth_pred_16x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
1617 }
1618 
1619 // 16x16
1620 
1621 void svt_aom_highbd_smooth_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
1622                                                 const uint16_t *above, const uint16_t *left,
1623                                                 int32_t bd) {
1624     __m256i ab[2], r, weights_w[2], rep[4];
1625     (void)bd;
1626 
1627     init_16(above, left, 16, ab, &r, weights_w, rep);
1628 
1629     for (int32_t i = 0; i < 2; i++) {
1630         smooth_pred_16x8(
1631             left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
1632     }
1633 }
1634 
1635 // 16x32
1636 
1637 void svt_aom_highbd_smooth_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
1638                                                 const uint16_t *above, const uint16_t *left,
1639                                                 int32_t bd) {
1640     __m256i ab[2], r, weights_w[2], rep[4];
1641     (void)bd;
1642 
1643     init_16(above, left, 32, ab, &r, weights_w, rep);
1644 
1645     for (int32_t i = 0; i < 4; i++) {
1646         smooth_pred_16x8(
1647             left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
1648     }
1649 }
1650 
1651 // 16x64
1652 
1653 void svt_aom_highbd_smooth_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
1654                                                 const uint16_t *above, const uint16_t *left,
1655                                                 int32_t bd) {
1656     __m256i ab[2], r, weights_w[2], rep[4];
1657     (void)bd;
1658 
1659     init_16(above, left, 64, ab, &r, weights_w, rep);
1660 
1661     for (int32_t i = 0; i < 8; i++) {
1662         smooth_pred_16x8(
1663             left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep, ab, r, &dst, stride);
1664     }
1665 }
1666 
1667 // -----------------------------------------------------------------------------
1668 // 32xN
1669 
1670 static INLINE void load_right_weights_32(const uint16_t *const above, __m256i *const r,
1671                                          __m256i *const weights) {
1672     *r = _mm256_set1_epi16((uint16_t)above[31]);
1673 
1674     //  0  1  2  3   8  9 10 11
1675     weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x00));
1676     //  4  5  6  7  12 13 14 15
1677     weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x10));
1678     // 16 17 18 19  24 25 26 27
1679     weights[2] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x20));
1680     // 20 21 22 23  28 29 30 31
1681     weights[3] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x30));
1682 }
1683 
1684 static INLINE void init_32(const uint16_t *const above, const uint16_t *const left, const int32_t h,
1685                            __m256i *const ab, __m256i *const r, __m256i *const weights_w,
1686                            __m256i *const rep) {
1687     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1688     prepare_ab(above + 0x00, b, ab + 0);
1689     prepare_ab(above + 0x10, b, ab + 2);
1690     load_right_weights_32(above, r, weights_w);
1691 
1692     rep[0] = _mm256_set1_epi32(0x03020100);
1693     rep[1] = _mm256_set1_epi32(0x07060504);
1694     rep[2] = _mm256_set1_epi32(0x0B0A0908);
1695     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
1696 }
1697 
1698 static INLINE void smooth_pred_32(const __m256i *const weights_w, const __m256i weights_h,
1699                                   const __m256i rep, const __m256i *const ab, const __m256i lr,
1700                                   uint16_t **const dst, const ptrdiff_t stride) {
1701     __m256i d;
1702 
1703     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
1704     d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
1705     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
1706 
1707     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
1708     d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
1709     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
1710     *dst += stride;
1711 }
1712 
1713 static INLINE void smooth_pred_32x4(const __m256i *const  weights_w,
1714                                     const uint16_t *const sm_weights_h, const __m256i *const rep,
1715                                     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1716                                     const ptrdiff_t stride) {
1717     const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
1718     smooth_pred_32(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1719     smooth_pred_32(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1720     smooth_pred_32(weights_w, weights_h, rep[2], ab, lr, dst, stride);
1721     smooth_pred_32(weights_w, weights_h, rep[3], ab, lr, dst, stride);
1722 }
1723 
1724 static INLINE void smooth_pred_32x8(const uint16_t *const left, const __m256i *const weights_w,
1725                                     const uint16_t *const sm_weights_h, const __m256i *const rep,
1726                                     const __m256i *const ab, const __m256i r, uint16_t **const dst,
1727                                     const ptrdiff_t stride) {
1728     __m256i lr[2];
1729     load_left_8(left, r, lr);
1730 
1731     smooth_pred_32x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1732     smooth_pred_32x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1733 }
1734 
1735 // 32x8
1736 
1737 void svt_aom_highbd_smooth_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
1738                                                const uint16_t *above, const uint16_t *left,
1739                                                int32_t bd) {
1740     __m256i ab[4], r, weights_w[4], rep[4];
1741     (void)bd;
1742 
1743     init_32(above, left, 8, ab, &r, weights_w, rep);
1744 
1745     smooth_pred_32x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
1746 }
1747 
1748 // 32x16
1749 
1750 void svt_aom_highbd_smooth_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
1751                                                 const uint16_t *above, const uint16_t *left,
1752                                                 int32_t bd) {
1753     __m256i ab[4], r, weights_w[4], rep[4];
1754     (void)bd;
1755 
1756     init_32(above, left, 16, ab, &r, weights_w, rep);
1757 
1758     for (int32_t i = 0; i < 2; i++) {
1759         smooth_pred_32x8(
1760             left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
1761     }
1762 }
1763 
1764 // 32x32
1765 
1766 void svt_aom_highbd_smooth_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
1767                                                 const uint16_t *above, const uint16_t *left,
1768                                                 int32_t bd) {
1769     __m256i ab[4], r, weights_w[4], rep[4];
1770     (void)bd;
1771 
1772     init_32(above, left, 32, ab, &r, weights_w, rep);
1773 
1774     for (int32_t i = 0; i < 4; i++) {
1775         smooth_pred_32x8(
1776             left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
1777     }
1778 }
1779 
1780 // 32x64
1781 
1782 void svt_aom_highbd_smooth_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
1783                                                 const uint16_t *above, const uint16_t *left,
1784                                                 int32_t bd) {
1785     __m256i ab[4], r, weights_w[4], rep[4];
1786     (void)bd;
1787 
1788     init_32(above, left, 64, ab, &r, weights_w, rep);
1789 
1790     for (int32_t i = 0; i < 8; i++) {
1791         smooth_pred_32x8(
1792             left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep, ab, r, &dst, stride);
1793     }
1794 }
1795 
1796 // -----------------------------------------------------------------------------
1797 // 64xN
1798 
1799 static INLINE void load_right_weights_64(const uint16_t *const above, __m256i *const r,
1800                                          __m256i *const weights) {
1801     *r = _mm256_set1_epi16((uint16_t)above[63]);
1802 
1803     //  0  1  2  3   8  9 10 11
1804     weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x00));
1805     //  4  5  6  7  12 13 14 15
1806     weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x10));
1807     // 16 17 18 19  24 25 26 27
1808     weights[2] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x20));
1809     // 20 21 22 23  28 29 30 31
1810     weights[3] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x30));
1811     // 32 33 34 35  40 41 42 43
1812     weights[4] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x40));
1813     // 36 37 38 39  44 45 46 47
1814     weights[5] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x50));
1815     // 48 49 50 51  56 57 58 59
1816     weights[6] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x60));
1817     // 52 53 54 55  60 61 62 63
1818     weights[7] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x70));
1819 }
1820 
1821 static INLINE void init_64(const uint16_t *const above, const uint16_t *const left, const int32_t h,
1822                            __m256i *const ab, __m256i *const r, __m256i *const weights_w,
1823                            __m256i *const rep) {
1824     const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
1825     prepare_ab(above + 0x00, b, ab + 0);
1826     prepare_ab(above + 0x10, b, ab + 2);
1827     prepare_ab(above + 0x20, b, ab + 4);
1828     prepare_ab(above + 0x30, b, ab + 6);
1829     load_right_weights_64(above, r, weights_w);
1830 
1831     rep[0] = _mm256_set1_epi32(0x03020100);
1832     rep[1] = _mm256_set1_epi32(0x07060504);
1833     rep[2] = _mm256_set1_epi32(0x0B0A0908);
1834     rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
1835 }
1836 
1837 static INLINE void smooth_pred_64(const __m256i *const weights_w, const __m256i weights_h,
1838                                   const __m256i rep, const __m256i *const ab, const __m256i lr,
1839                                   uint16_t **const dst, const ptrdiff_t stride) {
1840     __m256i d;
1841 
1842     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
1843     d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
1844     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
1845 
1846     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
1847     d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
1848     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
1849 
1850     // 32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47
1851     d = smooth_pred_kernel(weights_w + 4, weights_h, rep, ab + 4, lr);
1852     _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
1853 
1854     // 48 49 50 51 52 53 54 55  56 57 58 59 60 61 62 63
1855     d = smooth_pred_kernel(weights_w + 6, weights_h, rep, ab + 6, lr);
1856     _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
1857     *dst += stride;
1858 }
1859 
1860 static INLINE void smooth_pred_64x4(const __m256i *const  weights_w,
1861                                     const uint16_t *const sm_weights_h, const __m256i *const rep,
1862                                     const __m256i *const ab, const __m256i lr, uint16_t **const dst,
1863                                     const ptrdiff_t stride) {
1864     const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
1865     smooth_pred_64(weights_w, weights_h, rep[0], ab, lr, dst, stride);
1866     smooth_pred_64(weights_w, weights_h, rep[1], ab, lr, dst, stride);
1867     smooth_pred_64(weights_w, weights_h, rep[2], ab, lr, dst, stride);
1868     smooth_pred_64(weights_w, weights_h, rep[3], ab, lr, dst, stride);
1869 }
1870 
1871 static INLINE void smooth_pred_64x8(const uint16_t *const left, const __m256i *const weights_w,
1872                                     const uint16_t *const sm_weights_h, const __m256i *const rep,
1873                                     const __m256i *const ab, const __m256i r, uint16_t **const dst,
1874                                     const ptrdiff_t stride) {
1875     __m256i lr[2];
1876     load_left_8(left, r, lr);
1877 
1878     smooth_pred_64x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
1879     smooth_pred_64x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
1880 }
1881 
1882 // 64x16
1883 
1884 void svt_aom_highbd_smooth_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
1885                                                 const uint16_t *above, const uint16_t *left,
1886                                                 int32_t bd) {
1887     __m256i ab[8], r, weights_w[8], rep[4];
1888     (void)bd;
1889 
1890     init_64(above, left, 16, ab, &r, weights_w, rep);
1891 
1892     for (int32_t i = 0; i < 2; i++) {
1893         smooth_pred_64x8(
1894             left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
1895     }
1896 }
1897 
1898 // 64x32
1899 
1900 void svt_aom_highbd_smooth_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
1901                                                 const uint16_t *above, const uint16_t *left,
1902                                                 int32_t bd) {
1903     __m256i ab[8], r, weights_w[8], rep[4];
1904     (void)bd;
1905 
1906     init_64(above, left, 32, ab, &r, weights_w, rep);
1907 
1908     for (int32_t i = 0; i < 4; i++) {
1909         smooth_pred_64x8(
1910             left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
1911     }
1912 }
1913 
1914 // 64x64
1915 
1916 void svt_aom_highbd_smooth_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
1917                                                 const uint16_t *above, const uint16_t *left,
1918                                                 int32_t bd) {
1919     __m256i ab[8], r, weights_w[8], rep[4];
1920     (void)bd;
1921 
1922     init_64(above, left, 64, ab, &r, weights_w, rep);
1923 
1924     for (int32_t i = 0; i < 8; i++) {
1925         smooth_pred_64x8(
1926             left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep, ab, r, &dst, stride);
1927     }
1928 }
1929 
1930 // =============================================================================
1931 
1932 // SMOOTH_H_PRED
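// SMOOTH_H drops the vertical terms: each pixel blends only the left sample
// of its row with the top-right sample; in scalar form it is roughly
//   pred(r, c) = (w_w[c] * left[r] + (256 - w_w[c]) * above[W - 1] +
//                 (1 << (sm_weight_log2_scale - 1))) >> sm_weight_log2_scale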
1933 
1934 // 8xN
1935 
1936 static INLINE __m256i smooth_h_pred_kernel(const __m256i *const weights, const __m256i lr) {
1937     const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1938     __m256i       sum[2];
1939     // width 8: 00 01 02 03  10 11 12 13
1940     // width 16: 0 1 2 3  8 9 A B
1941     sum[0] = _mm256_madd_epi16(lr, weights[0]);
1942     // width 8: 04 05 06 07  14 15 16 17
1943     // width 16: 4 5 6 7  C D E F
1944     sum[1] = _mm256_madd_epi16(lr, weights[1]);
1945     sum[0] = _mm256_add_epi32(sum[0], round);
1946     sum[1] = _mm256_add_epi32(sum[1], round);
1947     sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
1948     sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
1949     // width 8: 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
1950     // width 16: 0 1 2 3 4 5 6 7  8 9 A B C D E F
1951     return _mm256_packs_epi32(sum[0], sum[1]);
1952 }
1953 
1954 static INLINE void smooth_h_pred_8x2(const __m256i *const weights, __m256i *const lr,
1955                                      uint16_t **const dst, const ptrdiff_t stride) {
1956     const __m256i rep = _mm256_set1_epi32(0x03020100);
1957     // lr: 0 1 2 3  1 2 3 4
1958     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  1 1 1 1
1959     // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
1960     const __m256i d = smooth_h_pred_kernel(weights, t);
1961     _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
1962     *dst += stride;
1963     _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
1964     *dst += stride;
1965     *lr = _mm256_srli_si256(*lr, 8); // 2 3 x x  3 4 x x
1966 }
1967 
1968 static INLINE void smooth_h_pred_8x4(const __m256i *const weights, __m256i *const lr,
1969                                      uint16_t **const dst, const ptrdiff_t stride) {
1970     smooth_h_pred_8x2(weights, lr, dst, stride);
1971     smooth_h_pred_8x2(weights, lr, dst, stride);
1972 }
1973 
1974 static INLINE void smooth_h_pred_8x8(const uint16_t *const left, const __m256i r,
1975                                      const __m256i *const weights, uint16_t **const dst,
1976                                      const ptrdiff_t stride) {
1977     const __m128i l0 = _mm_loadu_si128((const __m128i *)left);
1978     const __m128i l1 = _mm_srli_si128(l0, 2);
1979     // 0 1 2 3 4 5 6 7  1 2 3 4 5 6 7 x
1980     const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
1981     __m256i       lr[2];
1982     lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3  1 2 3 4
1983     lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7  5 6 7 x
1984     smooth_h_pred_8x4(weights, &lr[0], dst, stride);
1985     smooth_h_pred_8x4(weights, &lr[1], dst, stride);
1986 }
1987 
1988 // 8x4
1989 
1990 void svt_aom_highbd_smooth_h_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
1991                                                 const uint16_t *above, const uint16_t *left,
1992                                                 int32_t bd) {
1993     const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
1994     const __m128i l1 = _mm_srli_si128(l0, 2);
1995     // 0 1 2 3 x x x x  1 2 3 4 x x x x
1996     const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
1997     __m256i       r, weights[2];
1998     (void)bd;
1999 
2000     load_right_weights_8(above, &r, weights);
2001     __m256i lr = _mm256_unpacklo_epi16(l, r); // 0 1 2 3  1 2 3 4
2002     smooth_h_pred_8x4(weights, &lr, &dst, stride);
2003 }
2004 
2005 // 8x8
2006 
2007 void svt_aom_highbd_smooth_h_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2008                                                 const uint16_t *above, const uint16_t *left,
2009                                                 int32_t bd) {
2010     __m256i r, weights[2];
2011     (void)bd;
2012 
2013     load_right_weights_8(above, &r, weights);
2014     smooth_h_pred_8x8(left, r, weights, &dst, stride);
2015 }
2016 
2017 // 8x16
2018 
2019 void svt_aom_highbd_smooth_h_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2020                                                  const uint16_t *above, const uint16_t *left,
2021                                                  int32_t bd) {
2022     __m256i r, weights[2];
2023     (void)bd;
2024 
2025     load_right_weights_8(above, &r, weights);
2026     smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
2027     smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
2028 }
2029 
2030 // 8x32
2031 
2032 void svt_aom_highbd_smooth_h_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
2033                                                  const uint16_t *above, const uint16_t *left,
2034                                                  int32_t bd) {
2035     __m256i r, weights[2];
2036     (void)bd;
2037 
2038     load_right_weights_8(above, &r, weights);
2039 
2040     for (int32_t i = 0; i < 2; i++) {
2041         smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
2042         smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
2043         left += 16;
2044     }
2045 }
2046 
2047 // -----------------------------------------------------------------------------
2048 // 16xN
2049 
2050 static INLINE void smooth_h_pred_16(const __m256i *const weights, __m256i *const lr,
2051                                     uint16_t **const dst, const ptrdiff_t stride) {
2052     const __m256i rep = _mm256_set1_epi32(0x03020100);
2053     // lr: 0 1 2 3  0 1 2 3
2054     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  0 0 0 0
2055     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
2056     const __m256i d = smooth_h_pred_kernel(weights, t);
2057     _mm256_storeu_si256((__m256i *)*dst, d);
2058     *dst += stride;
2059     *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x  1 2 3 x
2060 }
2061 
2062 static INLINE void smooth_h_pred_16x4(const __m256i *const weights, __m256i *const lr,
2063                                       uint16_t **const dst, const ptrdiff_t stride) {
2064     smooth_h_pred_16(weights, lr, dst, stride);
2065     smooth_h_pred_16(weights, lr, dst, stride);
2066     smooth_h_pred_16(weights, lr, dst, stride);
2067     smooth_h_pred_16(weights, lr, dst, stride);
2068 }
2069 
2070 static INLINE void smooth_h_pred_16x8(const uint16_t *const left, const __m256i r,
2071                                       const __m256i *const weights, uint16_t **const dst,
2072                                       const ptrdiff_t stride) {
2073     __m256i lr[2];
2074     load_left_8(left, r, lr);
2075     smooth_h_pred_16x4(weights, &lr[0], dst, stride);
2076     smooth_h_pred_16x4(weights, &lr[1], dst, stride);
2077 }
2078 
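// Processes n blocks of 16 rows; shared by the 16x16, 16x32 and 16x64
// smooth-h predictors below.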
2079 static INLINE void smooth_h_predictor_16x16(uint16_t *dst, const ptrdiff_t stride,
2080                                             const uint16_t *const above, const uint16_t *left,
2081                                             const int32_t n) {
2082     __m256i r, weights[2];
2083 
2084     load_right_weights_16(above, &r, weights);
2085 
2086     for (int32_t i = 0; i < n; i++) {
2087         smooth_h_pred_16x8(left + 0, r, weights, &dst, stride);
2088         smooth_h_pred_16x8(left + 8, r, weights, &dst, stride);
2089         left += 16;
2090     }
2091 }
2092 
2093 // 16x4
2094 
2095 void svt_aom_highbd_smooth_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
2096                                                  const uint16_t *above, const uint16_t *left,
2097                                                  int32_t bd) {
2098     __m256i r, lr, weights[2];
2099     (void)bd;
2100 
2101     load_right_weights_16(above, &r, weights);
2102     lr = load_left_4(left, r);
2103     smooth_h_pred_16x4(weights, &lr, &dst, stride);
2104 }
2105 
2106 // 16x8
2107 
2108 void svt_aom_highbd_smooth_h_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
2109                                                  const uint16_t *above, const uint16_t *left,
2110                                                  int32_t bd) {
2111     __m256i r, weights[2];
2112     (void)bd;
2113 
2114     load_right_weights_16(above, &r, weights);
2115     smooth_h_pred_16x8(left, r, weights, &dst, stride);
2116 }
2117 
2118 // 16x16
2119 
2120 void svt_aom_highbd_smooth_h_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
2121                                                   const uint16_t *above, const uint16_t *left,
2122                                                   int32_t bd) {
2123     (void)bd;
2124     smooth_h_predictor_16x16(dst, stride, above, left, 1);
2125 }
2126 
2127 // 16x32
2128 
svt_aom_highbd_smooth_h_predictor_16x32_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2129 void svt_aom_highbd_smooth_h_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
2130                                                   const uint16_t *above, const uint16_t *left,
2131                                                   int32_t bd) {
2132     (void)bd;
2133     smooth_h_predictor_16x16(dst, stride, above, left, 2);
2134 }
2135 
2136 // 16x64
2137 
svt_aom_highbd_smooth_h_predictor_16x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2138 void svt_aom_highbd_smooth_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
2139                                                   const uint16_t *above, const uint16_t *left,
2140                                                   int32_t bd) {
2141     (void)bd;
2142     smooth_h_predictor_16x16(dst, stride, above, left, 4);
2143 }
2144 
2145 // -----------------------------------------------------------------------------
2146 // 32xN
2147 
smooth_h_pred_32(const __m256i * const weights,__m256i * const lr,uint16_t ** const dst,const ptrdiff_t stride)2148 static INLINE void smooth_h_pred_32(const __m256i *const weights, __m256i *const lr,
2149                                     uint16_t **const dst, const ptrdiff_t stride) {
2150     const __m256i rep = _mm256_set1_epi32(0x03020100);
2151     // lr: 0 1 2 3  0 1 2 3
2152     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  0 0 0 0
2153     __m256i       d;
2154 
2155     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
2156     d = smooth_h_pred_kernel(weights + 0, t);
2157     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
2158 
2159     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
2160     d = smooth_h_pred_kernel(weights + 2, t);
2161     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
2162     *dst += stride;
2163     *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x  1 2 3 x
2164 }
2165 
smooth_h_pred_32x4(const __m256i * const weights,__m256i * const lr,uint16_t ** const dst,const ptrdiff_t stride)2166 static INLINE void smooth_h_pred_32x4(const __m256i *const weights, __m256i *const lr,
2167                                       uint16_t **const dst, const ptrdiff_t stride) {
2168     smooth_h_pred_32(weights, lr, dst, stride);
2169     smooth_h_pred_32(weights, lr, dst, stride);
2170     smooth_h_pred_32(weights, lr, dst, stride);
2171     smooth_h_pred_32(weights, lr, dst, stride);
2172 }
2173 
smooth_h_pred_32x8(uint16_t * dst,const ptrdiff_t stride,const uint16_t * const above,const uint16_t * left,const int32_t n)2174 static INLINE void smooth_h_pred_32x8(uint16_t *dst, const ptrdiff_t stride,
2175                                       const uint16_t *const above, const uint16_t *left,
2176                                       const int32_t n) {
2177     __m256i r, lr[2], weights[4];
2178 
2179     load_right_weights_32(above, &r, weights);
2180 
2181     for (int32_t i = 0; i < n; i++) {
2182         load_left_8(left, r, lr);
2183         smooth_h_pred_32x4(weights, &lr[0], &dst, stride);
2184         smooth_h_pred_32x4(weights, &lr[1], &dst, stride);
2185         left += 8;
2186     }
2187 }
2188 
2189 // 32x8
2190 
svt_aom_highbd_smooth_h_predictor_32x8_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2191 void svt_aom_highbd_smooth_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
2192                                                  const uint16_t *above, const uint16_t *left,
2193                                                  int32_t bd) {
2194     (void)bd;
2195     smooth_h_pred_32x8(dst, stride, above, left, 1);
2196 }
2197 
2198 // 32x16
2199 
svt_aom_highbd_smooth_h_predictor_32x16_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2200 void svt_aom_highbd_smooth_h_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
2201                                                   const uint16_t *above, const uint16_t *left,
2202                                                   int32_t bd) {
2203     (void)bd;
2204     smooth_h_pred_32x8(dst, stride, above, left, 2);
2205 }
2206 
2207 // 32x32
2208 
svt_aom_highbd_smooth_h_predictor_32x32_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2209 void svt_aom_highbd_smooth_h_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
2210                                                   const uint16_t *above, const uint16_t *left,
2211                                                   int32_t bd) {
2212     (void)bd;
2213     smooth_h_pred_32x8(dst, stride, above, left, 4);
2214 }
2215 
2216 // 32x64
2217 
svt_aom_highbd_smooth_h_predictor_32x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2218 void svt_aom_highbd_smooth_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
2219                                                   const uint16_t *above, const uint16_t *left,
2220                                                   int32_t bd) {
2221     (void)bd;
2222     smooth_h_pred_32x8(dst, stride, above, left, 8);
2223 }
2224 
2225 // -----------------------------------------------------------------------------
2226 // 64xN
2227 
smooth_h_pred_64(const __m256i * const weights,__m256i * const lr,uint16_t ** const dst,const ptrdiff_t stride)2228 static INLINE void smooth_h_pred_64(const __m256i *const weights, __m256i *const lr,
2229                                     uint16_t **const dst, const ptrdiff_t stride) {
2230     const __m256i rep = _mm256_set1_epi32(0x03020100);
2231     // lr: 0 1 2 3  0 1 2 3
2232     const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0  0 0 0 0
2233     __m256i       d;
2234 
2235     //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
2236     d = smooth_h_pred_kernel(weights + 0, t);
2237     _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);
2238 
2239     // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
2240     d = smooth_h_pred_kernel(weights + 2, t);
2241     _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
2242 
2243     // 32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47
2244     d = smooth_h_pred_kernel(weights + 4, t);
2245     _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);
2246 
2247     // 48 49 50 51 52 53 54 55  56 57 58 59 60 61 62 63
2248     d = smooth_h_pred_kernel(weights + 6, t);
2249     _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
2250     *dst += stride;
2251     *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x  1 2 3 x
2252 }
2253 
smooth_h_pred_64x4(const __m256i * const weights,__m256i * const lr,uint16_t ** const dst,const ptrdiff_t stride)2254 static INLINE void smooth_h_pred_64x4(const __m256i *const weights, __m256i *const lr,
2255                                       uint16_t **const dst, const ptrdiff_t stride) {
2256     smooth_h_pred_64(weights, lr, dst, stride);
2257     smooth_h_pred_64(weights, lr, dst, stride);
2258     smooth_h_pred_64(weights, lr, dst, stride);
2259     smooth_h_pred_64(weights, lr, dst, stride);
2260 }
2261 
smooth_h_pred_64x8(uint16_t * dst,const ptrdiff_t stride,const uint16_t * const above,const uint16_t * left,const int32_t n)2262 static INLINE void smooth_h_pred_64x8(uint16_t *dst, const ptrdiff_t stride,
2263                                       const uint16_t *const above, const uint16_t *left,
2264                                       const int32_t n) {
2265     __m256i r, lr[2], weights[8];
2266 
2267     load_right_weights_64(above, &r, weights);
2268 
2269     for (int32_t i = 0; i < n; i++) {
2270         load_left_8(left, r, lr);
2271         smooth_h_pred_64x4(weights, &lr[0], &dst, stride);
2272         smooth_h_pred_64x4(weights, &lr[1], &dst, stride);
2273         left += 8;
2274     }
2275 }
2276 
2277 // 64x16
2278 
svt_aom_highbd_smooth_h_predictor_64x16_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2279 void svt_aom_highbd_smooth_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
2280                                                   const uint16_t *above, const uint16_t *left,
2281                                                   int32_t bd) {
2282     (void)bd;
2283     smooth_h_pred_64x8(dst, stride, above, left, 2);
2284 }
2285 
2286 // 64x32
2287 
svt_aom_highbd_smooth_h_predictor_64x32_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2288 void svt_aom_highbd_smooth_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
2289                                                   const uint16_t *above, const uint16_t *left,
2290                                                   int32_t bd) {
2291     (void)bd;
2292     smooth_h_pred_64x8(dst, stride, above, left, 4);
2293 }
2294 
2295 // 64x64
2296 
svt_aom_highbd_smooth_h_predictor_64x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int32_t bd)2297 void svt_aom_highbd_smooth_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
2298                                                   const uint16_t *above, const uint16_t *left,
2299                                                   int32_t bd) {
2300     (void)bd;
2301     smooth_h_pred_64x8(dst, stride, above, left, 8);
2302 }
2303 
2304 // =============================================================================
2305 
2306 // SMOOTH_V_PRED
2307 
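/*
 * Reference sketch (added; non-normative and not part of the original
 * sources): the scalar SMOOTH_V predictor vectorized below is, following the
 * libaom C code,
 *
 *     for (int32_t r = 0; r < bh; r++)
 *         for (int32_t c = 0; c < bw; c++)
 *             dst[r * stride + c] = (above[c] * w[r] +
 *                                    left[bh - 1] * (256 - w[r]) + 128) >> 8;
 *
 * where w[] are the AV1 smooth weights for the block height and
 * sm_weight_log2_scale == 8. The AVX2 code interleaves above[] with the
 * replicated bottom-left sample (smooth_v_init_* / smooth_v_prepare_ab) and
 * multiplies against weight tables assumed to hold (w, 256 - w) pairs, so
 * each _mm256_madd_epi16 evaluates both terms of the blend at once.
 */
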
// 8xN

static INLINE void smooth_v_init_8(const uint16_t *const above, const uint16_t *const left,
                                   const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
    const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
    const __m256i b  = _mm256_set1_epi16((uint16_t)left[h - 1]);
    __m256i       a[2];
    a[0]  = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
    a[1]  = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
    ab[0] = _mm256_unpacklo_epi16(a[0], b);
    ab[1] = _mm256_unpacklo_epi16(a[1], b);

    const __m128i rep0 = _mm_set1_epi32(0x03020100);
    const __m128i rep1 = _mm_set1_epi32(0x07060504);
    const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
    const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
    rep[0]             = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
    rep[1]             = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
}

static INLINE __m256i smooth_v_pred_kernel(const __m256i weights, const __m256i rep,
                                           const __m256i *const ab) {
    const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    __m256i       sum[2];
    // 0 0 0 0  1 1 1 1
    const __m256i w = _mm256_shuffle_epi8(weights, rep);
    sum[0]          = _mm256_madd_epi16(ab[0], w);
    sum[1]          = _mm256_madd_epi16(ab[1], w);
    sum[0]          = _mm256_add_epi32(sum[0], round);
    sum[1]          = _mm256_add_epi32(sum[1], round);
    sum[0]          = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
    sum[1]          = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
    // width 8: 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    // width 16: 0 1 2 3 4 5 6 7  8 9 A B C D E F
    return _mm256_packs_epi32(sum[0], sum[1]);
}
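
/*
 * Added note: rep selects the current row's 32-bit weight entry within each
 * 128-bit lane of `weights` (assuming the sm_weights_d_* tables store
 * interleaved (w, 256 - w) pairs), so _mm256_madd_epi16 forms
 * above[c] * w[r] + left[h - 1] * (256 - w[r]) per 32-bit element before the
 * rounded shift by sm_weight_log2_scale and the repack to 16-bit samples.
 * Because smooth_v_init_8() builds a different selector per lane, the 8-wide
 * path produces two rows from a single kernel call.
 */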

static INLINE void smooth_v_pred_8x2(const __m256i weights, const __m256i rep,
                                     const __m256i *const ab, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    *dst += stride;
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    *dst += stride;
}

static INLINE void smooth_v_pred_8x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                     const __m256i *const ab, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_8x2(weights, rep[0], ab, dst, stride);
    smooth_v_pred_8x2(weights, rep[1], ab, dst, stride);
}

static INLINE void smooth_v_pred_8x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                     const __m256i *const ab, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    smooth_v_pred_8x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_8x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 8x4

void svt_aom_highbd_smooth_v_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 4, ab, rep);
    smooth_v_pred_8x4(sm_weights_d_4, rep, ab, &dst, stride);
}

// 8x8

void svt_aom_highbd_smooth_v_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 8, ab, rep);

    smooth_v_pred_8x8(sm_weights_d_8, rep, ab, &dst, stride);
}

// 8x16

void svt_aom_highbd_smooth_v_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_8x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 8x32

void svt_aom_highbd_smooth_v_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_8x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// -----------------------------------------------------------------------------
// 16xN

static INLINE void smooth_v_prepare_ab(const uint16_t *const above, const __m256i b,
                                       __m256i *const ab) {
    const __m256i a = _mm256_loadu_si256((const __m256i *)above);
    ab[0]           = _mm256_unpacklo_epi16(a, b);
    ab[1]           = _mm256_unpackhi_epi16(a, b);
}

static INLINE void smooth_v_init_16(const uint16_t *const above, const uint16_t *const left,
                                    const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    smooth_v_prepare_ab(above, b, ab);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_v_pred_16(const __m256i weights, const __m256i rep,
                                    const __m256i *const ab, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
    _mm256_storeu_si256((__m256i *)*dst, d);
    *dst += stride;
}

static INLINE void smooth_v_pred_16x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_16(weights, rep[0], ab, dst, stride);
    smooth_v_pred_16(weights, rep[1], ab, dst, stride);
    smooth_v_pred_16(weights, rep[2], ab, dst, stride);
    smooth_v_pred_16(weights, rep[3], ab, dst, stride);
}

static INLINE void smooth_v_pred_16x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    smooth_v_pred_16x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_16x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 16x4

void svt_aom_highbd_smooth_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 4, ab, rep);
    smooth_v_pred_16x4(sm_weights_d_4, rep, ab, &dst, stride);
}

// 16x8

void svt_aom_highbd_smooth_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 8, ab, rep);

    smooth_v_pred_16x8(sm_weights_d_8, rep, ab, &dst, stride);
}

// 16x16

void svt_aom_highbd_smooth_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_16x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 16x32

void svt_aom_highbd_smooth_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_16x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// 16x64

void svt_aom_highbd_smooth_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 64, ab, rep);

    for (int32_t i = 0; i < 8; i++)
        smooth_v_pred_16x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
}

// -----------------------------------------------------------------------------
// 32xN

static INLINE void smooth_v_init_32(const uint16_t *const above, const uint16_t *const left,
                                    const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    smooth_v_prepare_ab(above + 0x00, b, ab + 0);
    smooth_v_prepare_ab(above + 0x10, b, ab + 2);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_v_pred_32(const __m256i weights, const __m256i rep,
                                    const __m256i *const ab, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i d;

    //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    d = smooth_v_pred_kernel(weights, rep, ab + 0);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    d = smooth_v_pred_kernel(weights, rep, ab + 2);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    *dst += stride;
}

static INLINE void smooth_v_pred_32x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_32(weights, rep[0], ab, dst, stride);
    smooth_v_pred_32(weights, rep[1], ab, dst, stride);
    smooth_v_pred_32(weights, rep[2], ab, dst, stride);
    smooth_v_pred_32(weights, rep[3], ab, dst, stride);
}

static INLINE void smooth_v_pred_32x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    smooth_v_pred_32x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_32x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 32x8

void svt_aom_highbd_smooth_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 8, ab, rep);

    smooth_v_pred_32x8(sm_weights_d_8, rep, ab, &dst, stride);
}

// 32x16

void svt_aom_highbd_smooth_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_32x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 32x32

void svt_aom_highbd_smooth_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_32x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// 32x64

void svt_aom_highbd_smooth_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 64, ab, rep);

    for (int32_t i = 0; i < 8; i++)
        smooth_v_pred_32x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
}

// -----------------------------------------------------------------------------
// 64xN

static INLINE void smooth_v_init_64(const uint16_t *const above, const uint16_t *const left,
                                    const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    smooth_v_prepare_ab(above + 0x00, b, ab + 0);
    smooth_v_prepare_ab(above + 0x10, b, ab + 2);
    smooth_v_prepare_ab(above + 0x20, b, ab + 4);
    smooth_v_prepare_ab(above + 0x30, b, ab + 6);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_v_pred_64(const __m256i weights, const __m256i rep,
                                    const __m256i *const ab, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i d;

    //  0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
    d = smooth_v_pred_kernel(weights, rep, ab + 0);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31
    d = smooth_v_pred_kernel(weights, rep, ab + 2);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);

    // 32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47
    d = smooth_v_pred_kernel(weights, rep, ab + 4);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);

    // 48 49 50 51 52 53 54 55  56 57 58 59 60 61 62 63
    d = smooth_v_pred_kernel(weights, rep, ab + 6);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    *dst += stride;
}

static INLINE void smooth_v_pred_64x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_64(weights, rep[0], ab, dst, stride);
    smooth_v_pred_64(weights, rep[1], ab, dst, stride);
    smooth_v_pred_64(weights, rep[2], ab, dst, stride);
    smooth_v_pred_64(weights, rep[3], ab, dst, stride);
}

static INLINE void smooth_v_pred_64x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    smooth_v_pred_64x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_64x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 64x16

void svt_aom_highbd_smooth_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[8], rep[4];
    (void)bd;

    smooth_v_init_64(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_64x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 64x32

void svt_aom_highbd_smooth_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[8], rep[4];
    (void)bd;

    smooth_v_init_64(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_64x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// 64x64

void svt_aom_highbd_smooth_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[8], rep[4];
    (void)bd;

    smooth_v_init_64(above, left, 64, ab, rep);

    for (int32_t i = 0; i < 8; i++)
        smooth_v_pred_64x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
}