/*
* Copyright(c) 2019 Intel Corporation
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
*/

#include <immintrin.h>
#include "EbHighbdIntraPrediction_SSE2.h"
#include "EbDefinitions.h"
#include "common_dsp_rtcd.h"
#include "EbIntraPrediction_AVX2.h"
// =============================================================================

// DC RELATED PRED

// Handle number of elements: up to 64.
static INLINE __m128i dc_sum_large(const __m256i src) {
    const __m128i s_lo = _mm256_extracti128_si256(src, 0);
    const __m128i s_hi = _mm256_extracti128_si256(src, 1);
    __m128i sum, sum_hi;
    sum = _mm_add_epi16(s_lo, s_hi);
    sum_hi = _mm_srli_si128(sum, 8);
    sum = _mm_add_epi16(sum, sum_hi);
    // Widen to 32 bits before the final reduction: with 12-bit input, a
    // 16-bit lane overflows once it accumulates more than 16 samples.
    sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());

    return dc_sum_4x32bit(sum);
}

// Handle number of elements: 65 to 128.
static INLINE __m128i dc_sum_larger(const __m256i src) {
    const __m128i s_lo = _mm256_extracti128_si256(src, 0);
    const __m128i s_hi = _mm256_extracti128_si256(src, 1);
    __m128i sum, sum_hi;
    sum = _mm_add_epi16(s_lo, s_hi);
    // Widen to 32 bits one step earlier than dc_sum_large: with more than 64
    // samples, another 16-bit fold could overflow for 12-bit input.
    sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
    sum = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
    sum = _mm_add_epi32(sum, sum_hi);

    return dc_sum_4x32bit(sum);
}

static INLINE __m128i dc_sum_16(const uint16_t *const src) {
    const __m256i s = _mm256_loadu_si256((const __m256i *)src);
    const __m128i s_lo = _mm256_extracti128_si256(s, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s, 1);
    const __m128i sum = _mm_add_epi16(s_lo, s_hi);
    return dc_sum_8x16bit(sum);
}

static INLINE __m128i dc_sum_32(const uint16_t *const src) {
    const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0x00));
    const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 0x10));
    const __m256i sum = _mm256_add_epi16(s0, s1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_64(const uint16_t *const src) {
    const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src + 0x00));
    const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + 0x10));
    const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 0x20));
    const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 0x30));
    const __m256i s01 = _mm256_add_epi16(s0, s1);
    const __m256i s23 = _mm256_add_epi16(s2, s3);
    const __m256i sum = _mm256_add_epi16(s01, s23);
    return dc_sum_large(sum);
}

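// For reference, each dc_sum_N above computes the plain scalar reduction
// below (a sketch for clarity, not part of the build):
//
//     uint32_t dc_sum_ref(const uint16_t *src, int n) {
//         uint32_t sum = 0;
//         for (int i = 0; i < n; i++) sum += src[i];
//         return sum;
//     }
//
// The SIMD versions only differ in how late they widen from 16 to 32 bits.
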
static INLINE __m128i dc_sum_4_16(const uint16_t *const src_4, const uint16_t *const src_16) {
    const __m128i s_4 = _mm_loadl_epi64((const __m128i *)src_4);
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m128i s_lo = _mm256_extracti128_si256(s_16, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s_16, 1);
    const __m128i s_16_sum0 = _mm_add_epi16(s_lo, s_hi);
    const __m128i s_16_sum_hi = _mm_srli_si128(s_16_sum0, 8);
    const __m128i s_16_sum = _mm_add_epi16(s_16_sum0, s_16_sum_hi);
    const __m128i sum = _mm_add_epi16(s_16_sum, s_4);
    return dc_sum_4x16bit_large(sum);
}

static INLINE __m128i dc_sum_8_16(const uint16_t *const src_8, const uint16_t *const src_16) {
    const __m128i s_8 = _mm_loadu_si128((const __m128i *)src_8);
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m128i s_lo = _mm256_extracti128_si256(s_16, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s_16, 1);
    const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
    const __m128i sum = _mm_add_epi16(s_16_sum, s_8);
    return dc_sum_8x16bit_large(sum);
}

static INLINE __m128i dc_sum_8_32(const uint16_t *const src_8, const uint16_t *const src_32) {
    const __m128i s_8 = _mm_loadu_si128((const __m128i *)src_8);
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i s_32 = _mm256_add_epi16(s_32_0, s_32_1);
    const __m128i s_lo = _mm256_extracti128_si256(s_32, 0);
    const __m128i s_hi = _mm256_extracti128_si256(s_32, 1);
    const __m128i s_16_sum = _mm_add_epi16(s_lo, s_hi);
    const __m128i sum = _mm_add_epi16(s_8, s_16_sum);
    return dc_sum_8x16bit_large(sum);
}

static INLINE __m128i dc_sum_16_16(const uint16_t *const src0, const uint16_t *const src1) {
    const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
    const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
    const __m256i sum = _mm256_add_epi16(s0, s1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_16_32(const uint16_t *const src_16, const uint16_t *const src_32) {
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i sum0 = _mm256_add_epi16(s_16, s_32_0);
    const __m256i sum = _mm256_add_epi16(sum0, s_32_1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_32_32(const uint16_t *const src0, const uint16_t *const src1) {
    const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
    const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
    const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
    const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
    const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
    const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
    const __m256i sum = _mm256_add_epi16(sum0, sum1);
    return dc_sum_large(sum);
}

static INLINE __m128i dc_sum_32_64(const uint16_t *const src_32, const uint16_t *const src_64) {
    const __m256i s_32_0 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x00));
    const __m256i s_32_1 = _mm256_loadu_si256((const __m256i *)(src_32 + 0x10));
    const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
    const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
    const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
    const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
    const __m256i sum0 = _mm256_add_epi16(s_32_0, s_64_0);
    const __m256i sum1 = _mm256_add_epi16(s_32_1, s_64_1);
    const __m256i sum2 = _mm256_add_epi16(s_64_2, s_64_3);
    const __m256i sum3 = _mm256_add_epi16(sum0, sum1);
    const __m256i sum = _mm256_add_epi16(sum2, sum3);
    return dc_sum_larger(sum);
}

static INLINE __m128i dc_sum_64_64(const uint16_t *const src0, const uint16_t *const src1) {
    const __m256i s0_0 = _mm256_loadu_si256((const __m256i *)(src0 + 0x00));
    const __m256i s0_1 = _mm256_loadu_si256((const __m256i *)(src0 + 0x10));
    const __m256i s0_2 = _mm256_loadu_si256((const __m256i *)(src0 + 0x20));
    const __m256i s0_3 = _mm256_loadu_si256((const __m256i *)(src0 + 0x30));
    const __m256i s1_0 = _mm256_loadu_si256((const __m256i *)(src1 + 0x00));
    const __m256i s1_1 = _mm256_loadu_si256((const __m256i *)(src1 + 0x10));
    const __m256i s1_2 = _mm256_loadu_si256((const __m256i *)(src1 + 0x20));
    const __m256i s1_3 = _mm256_loadu_si256((const __m256i *)(src1 + 0x30));
    const __m256i sum0 = _mm256_add_epi16(s0_0, s1_0);
    const __m256i sum1 = _mm256_add_epi16(s0_1, s1_1);
    const __m256i sum2 = _mm256_add_epi16(s0_2, s1_2);
    const __m256i sum3 = _mm256_add_epi16(s0_3, s1_3);
    const __m256i sum4 = _mm256_add_epi16(sum0, sum1);
    const __m256i sum5 = _mm256_add_epi16(sum2, sum3);
    const __m256i sum = _mm256_add_epi16(sum4, sum5);
    return dc_sum_larger(sum);
}

static INLINE __m128i dc_sum_16_64(const uint16_t *const src_16, const uint16_t *const src_64) {
    const __m256i s_16 = _mm256_loadu_si256((const __m256i *)src_16);
    const __m256i s_64_0 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x00));
    const __m256i s_64_1 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x10));
    const __m256i s_64_2 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x20));
    const __m256i s_64_3 = _mm256_loadu_si256((const __m256i *)(src_64 + 0x30));
    const __m256i s0 = _mm256_add_epi16(s_16, s_64_0);
    const __m256i s1 = _mm256_add_epi16(s0, s_64_1);
    const __m256i s2 = _mm256_add_epi16(s_64_2, s_64_3);
    const __m256i sum = _mm256_add_epi16(s1, s2);
    return dc_sum_larger(sum);
}

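// Each dc_sum_W_H helper above returns sum(src_W[0..W-1]) + sum(src_H[0..H-1])
// in the low lane of the returned __m128i; the DC_PRED functions below turn
// that into round(sum / (W + H)).
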
static INLINE void dc_common_predictor_16xh_kernel(uint16_t *dst, const ptrdiff_t stride,
                                                   const int32_t h, const __m256i dc) {
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)dst, dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_32xh_kernel(uint16_t *dst, const ptrdiff_t stride,
                                                   const int32_t h, const __m256i dc) {
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_64xh_kernel(uint16_t *dst, const ptrdiff_t stride,
                                                   const int32_t h, const __m256i dc) {
    for (int32_t i = 0; i < h; i++) {
        _mm256_storeu_si256((__m256i *)(dst + 0x00), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x10), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x20), dc);
        _mm256_storeu_si256((__m256i *)(dst + 0x30), dc);
        dst += stride;
    }
}

static INLINE void dc_common_predictor_16xh(uint16_t *const dst, const ptrdiff_t stride,
                                            const int32_t h, const __m128i dc) {
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_16xh_kernel(dst, stride, h, expected_dc);
}

static INLINE void dc_common_predictor_32xh(uint16_t *const dst, const ptrdiff_t stride,
                                            const int32_t h, const __m128i dc) {
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_32xh_kernel(dst, stride, h, expected_dc);
}

static INLINE void dc_common_predictor_64xh(uint16_t *const dst, const ptrdiff_t stride,
                                            const int32_t h, const __m128i dc) {
    const __m256i expected_dc = _mm256_broadcastw_epi16(dc);
    dc_common_predictor_64xh_kernel(dst, stride, h, expected_dc);
}

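// The dc_common_predictor_Wxh wrappers expect the DC value in the lowest
// 16-bit lane of `dc`; _mm256_broadcastw_epi16 replicates that lane across
// all 16 lanes of the row register before the store kernels run.
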
// =============================================================================

// DC_128_PRED

// 16xN

static INLINE void dc_128_predictor_16xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const int32_t h, const int32_t bd) {
    // DC_128 ignores the neighbors and fills the block with the mid-range
    // value 1 << (bd - 1), e.g. 512 for 10-bit content.
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_16xh_kernel(dst, stride, h, dc);
}

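// The per-size entry points below only pin the block height; all of the work
// happens in the shared helpers above.
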
void svt_aom_highbd_dc_128_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 4, bd);
}

void svt_aom_highbd_dc_128_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 8, bd);
}

void svt_aom_highbd_dc_128_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 16, bd);
}

void svt_aom_highbd_dc_128_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 32, bd);
}

void svt_aom_highbd_dc_128_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_16xh(dst, stride, 64, bd);
}

// 32xN

static INLINE void dc_128_predictor_32xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const int32_t h, const int32_t bd) {
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_32xh_kernel(dst, stride, h, dc);
}

void svt_aom_highbd_dc_128_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 8, bd);
}

void svt_aom_highbd_dc_128_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 16, bd);
}

void svt_aom_highbd_dc_128_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 32, bd);
}

void svt_aom_highbd_dc_128_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_32xh(dst, stride, 64, bd);
}

// 64xN

static INLINE void dc_128_predictor_64xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const int32_t h, const int32_t bd) {
    const __m256i dc = _mm256_set1_epi16(1 << (bd - 1));
    dc_common_predictor_64xh_kernel(dst, stride, h, dc);
}

void svt_aom_highbd_dc_128_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 16, bd);
}

void svt_aom_highbd_dc_128_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 32, bd);
}

void svt_aom_highbd_dc_128_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)above;
    (void)left;
    dc_128_predictor_64xh(dst, stride, 64, bd);
}

// =============================================================================

// DC_LEFT_PRED

// 16xN

void svt_aom_highbd_dc_left_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(2);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_4(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 2);
    dc_common_predictor_16xh(dst, stride, 4, sum);
}

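// For DC_LEFT the divisor is the block height h, always a power of two here,
// so round(sum / h) becomes (sum + h / 2) >> log2(h). The 16-bit add/shift is
// safe up to h = 16; the h = 32 and h = 64 variants use 32-bit lanes instead.
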
void svt_aom_highbd_dc_left_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(4);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_8(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 3);
    dc_common_predictor_16xh(dst, stride, 8, sum);
}

void svt_aom_highbd_dc_left_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_16xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_left_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_16xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_left_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_16xh(dst, stride, 64, sum);
}

// 32xN

void svt_aom_highbd_dc_left_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(4);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_8(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 3);
    dc_common_predictor_32xh(dst, stride, 8, sum);
}

void svt_aom_highbd_dc_left_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_32xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_left_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_32xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_left_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_32xh(dst, stride, 64, sum);
}

// 64xN

void svt_aom_highbd_dc_left_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_16(left);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_64xh(dst, stride, 16, sum);
}

void svt_aom_highbd_dc_left_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_32(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_64xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_left_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)above;
    (void)bd;

    sum = dc_sum_64(left);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_64xh(dst, stride, 64, sum);
}

// =============================================================================

// DC_TOP_PRED

// 16xN

static INLINE void dc_top_predictor_16xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const uint16_t *const above, const int32_t h,
                                         const int32_t bd) {
    (void)bd;
    const __m128i round = _mm_cvtsi32_si128(8);
    __m128i sum;

    sum = dc_sum_16(above);
    sum = _mm_add_epi16(sum, round);
    sum = _mm_srli_epi16(sum, 4);
    dc_common_predictor_16xh(dst, stride, h, sum);
}

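// DC_TOP mirrors DC_LEFT with the roles swapped: the divisor is the block
// width, so each width helper hard-codes its round constant and shift.
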
void svt_aom_highbd_dc_top_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 4, bd);
}

void svt_aom_highbd_dc_top_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 8, bd);
}

void svt_aom_highbd_dc_top_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 16, bd);
}

void svt_aom_highbd_dc_top_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 32, bd);
}

void svt_aom_highbd_dc_top_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_16xh(dst, stride, above, 64, bd);
}

// 32xN

static INLINE void dc_top_predictor_32xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const uint16_t *const above, const int32_t h,
                                         const int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(16);
    __m128i sum;
    (void)bd;

    sum = dc_sum_32(above);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_32xh(dst, stride, h, sum);
}

void svt_aom_highbd_dc_top_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 8, bd);
}

void svt_aom_highbd_dc_top_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 16, bd);
}

void svt_aom_highbd_dc_top_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 32, bd);
}

void svt_aom_highbd_dc_top_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_32xh(dst, stride, above, 64, bd);
}

// 64xN

static INLINE void dc_top_predictor_64xh(uint16_t *const dst, const ptrdiff_t stride,
                                         const uint16_t *const above, const int32_t h,
                                         const int32_t bd) {
    const __m128i round = _mm_cvtsi32_si128(32);
    __m128i sum;
    (void)bd;

    sum = dc_sum_64(above);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_64xh(dst, stride, h, sum);
}

void svt_aom_highbd_dc_top_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 16, bd);
}

void svt_aom_highbd_dc_top_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 32, bd);
}

void svt_aom_highbd_dc_top_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    (void)left;
    dc_top_predictor_64xh(dst, stride, above, 64, bd);
}

// =============================================================================

// DC_PRED

// 16xN

void svt_aom_highbd_dc_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_4_16(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 10;
    sum32 /= 20;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 4, dc);
}

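// Rectangular blocks use the identity round(sum / (w + h)) =
// (sum + (w + h) / 2) / (w + h); e.g. 16x4 has 20 samples, hence += 10, /= 20.
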
void svt_aom_highbd_dc_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_8_16(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 12;
    sum32 /= 24;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 8, dc);
}

void svt_aom_highbd_dc_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_16(above, left);
    sum = _mm_add_epi32(sum, _mm_set1_epi32(16));
    sum = _mm_srli_epi32(sum, 5);
    dc_common_predictor_16xh(dst, stride, 16, sum);
}
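
// Square sizes divide by a power of two (here 2 * 16 = 32), so the rounded
// divide stays in SIMD as a vector add and shift instead of a scalar divide.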

void svt_aom_highbd_dc_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_32(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 24;
    sum32 /= 48;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 32, dc);
}

void svt_aom_highbd_dc_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_64(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 40;
    sum32 /= 80;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_16xh_kernel(dst, stride, 64, dc);
}

// 32xN

void svt_aom_highbd_dc_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_8_32(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 20;
    sum32 /= 40;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 8, dc);
}

void svt_aom_highbd_dc_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_32(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 24;
    sum32 /= 48;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 16, dc);
}

void svt_aom_highbd_dc_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_32(above, left);
    sum = _mm_add_epi32(sum, _mm_set1_epi32(32));
    sum = _mm_srli_epi32(sum, 6);
    dc_common_predictor_32xh(dst, stride, 32, sum);
}

void svt_aom_highbd_dc_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_64(above, left);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 48;
    sum32 /= 96;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_32xh_kernel(dst, stride, 64, dc);
}

// 64xN

void svt_aom_highbd_dc_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_16_64(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 40;
    sum32 /= 80;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_64xh_kernel(dst, stride, 16, dc);
}

void svt_aom_highbd_dc_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_32_64(left, above);
    uint32_t sum32 = _mm_cvtsi128_si32(sum);
    sum32 += 48;
    sum32 /= 96;
    const __m256i dc = _mm256_set1_epi16((int16_t)sum32);

    dc_common_predictor_64xh_kernel(dst, stride, 32, dc);
}

void svt_aom_highbd_dc_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                            const uint16_t *left, int32_t bd) {
    (void)bd;
    __m128i sum = dc_sum_64_64(above, left);
    sum = _mm_add_epi32(sum, _mm_set1_epi32(64));
    sum = _mm_srli_epi32(sum, 7);
    dc_common_predictor_64xh(dst, stride, 64, sum);
}

// =============================================================================

// H_PRED

// 16xN

static INLINE void h_pred_16(uint16_t **const dst, const ptrdiff_t stride, const __m128i left) {
    // Broadcast the 16-bit left pixel to a 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_16x8(uint16_t **dst, const ptrdiff_t stride, const uint16_t *const left) {
    const __m128i left_u16 = _mm_loadu_si128((const __m128i *)left);

    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_16(dst, stride, _mm_srli_si128(left_u16, 14));
}

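// _mm256_broadcastw_epi16 always replicates lane 0, so each
// _mm_srli_si128(left_u16, 2 * i) above shifts left[i] down into lane 0
// before the broadcast.
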
// 16x4

void svt_aom_highbd_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;
    const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);

    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_16(&dst, stride, _mm_srli_si128(left_u16, 6));
}

// 16x64

void svt_aom_highbd_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 8; i++, left += 8) h_pred_16x8(&dst, stride, left);
}

// -----------------------------------------------------------------------------

// 32xN

static INLINE void h_pred_32(uint16_t **const dst, const ptrdiff_t stride, const __m128i left) {
    // Broadcast the 16-bit left pixel to a 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_32x8(uint16_t **dst, const ptrdiff_t stride, const uint16_t *const left) {
    const __m128i left_u16 = _mm_loadu_si128((const __m128i *)left);

    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_32(dst, stride, _mm_srli_si128(left_u16, 14));
}

// 32x8

void svt_aom_highbd_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    h_pred_32x8(&dst, stride, left);
}

// 32x64

void svt_aom_highbd_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 8; i++, left += 8) h_pred_32x8(&dst, stride, left);
}

// -----------------------------------------------------------------------------

// 64xN

static INLINE void h_pred_64(uint16_t **const dst, const ptrdiff_t stride, const __m128i left) {
    // Broadcast the 16-bit left pixel to a 256-bit register.
    const __m256i row = _mm256_broadcastw_epi16(left);

    _mm256_storeu_si256((__m256i *)(*dst + 0x00), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), row);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), row);
    *dst += stride;
}

// Process 8 rows.
static INLINE void h_pred_64x8(uint16_t **dst, const ptrdiff_t stride, const uint16_t *const left) {
    const __m128i left_u16 = _mm_loadu_si128((const __m128i *)left);

    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 0));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 2));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 4));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 6));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 8));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 10));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 12));
    h_pred_64(dst, stride, _mm_srli_si128(left_u16, 14));
}

// 64x16

void svt_aom_highbd_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 2; i++, left += 8) h_pred_64x8(&dst, stride, left);
}

// 64x32

void svt_aom_highbd_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 4; i++, left += 8) h_pred_64x8(&dst, stride, left);
}

// 64x64

void svt_aom_highbd_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    (void)above;
    (void)bd;

    for (int32_t i = 0; i < 8; i++, left += 8) h_pred_64x8(&dst, stride, left);
}

// =============================================================================

// V_PRED

// 16xN

static INLINE void v_pred_16(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0) {
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
    *dst += stride;
}

// Process 8 rows.
static INLINE void v_pred_16x8(uint16_t **const dst, const ptrdiff_t stride, const __m256i above) {
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
    v_pred_16(dst, stride, above);
}

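// V_PRED copies the `above` row unchanged into every row of the block; the
// v_pred_Wx8 helpers simply unroll the store loop eight rows at a time.
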
// 16x4

void svt_aom_highbd_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    // Load all 16 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));

    (void)left;
    (void)bd;

    v_pred_16(&dst, stride, above0);
    v_pred_16(&dst, stride, above0);
    v_pred_16(&dst, stride, above0);
    v_pred_16(&dst, stride, above0);
}

// 16x8

void svt_aom_highbd_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    // Load all 16 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));

    (void)left;
    (void)bd;

    v_pred_16x8(&dst, stride, above0);
}

// 16x16

void svt_aom_highbd_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 16 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 2; i++) v_pred_16x8(&dst, stride, above0);
}

// 16x32

void svt_aom_highbd_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 16 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 4; i++) v_pred_16x8(&dst, stride, above0);
}

// 16x64

void svt_aom_highbd_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 16 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 8; i++) v_pred_16x8(&dst, stride, above0);
}

// -----------------------------------------------------------------------------

// 32xN

static INLINE void v_pred_32(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
                             const __m256i above1) {
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
    *dst += stride;
}

// Process 8 rows.
static INLINE void v_pred_32x8(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
                               const __m256i above1) {
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
    v_pred_32(dst, stride, above0, above1);
}

// 32x8

void svt_aom_highbd_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                          const uint16_t *left, int32_t bd) {
    // Load all 32 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));

    (void)left;
    (void)bd;

    v_pred_32x8(&dst, stride, above0, above1);
}

// 32x16

void svt_aom_highbd_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 32 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 2; i++) v_pred_32x8(&dst, stride, above0, above1);
}

// 32x32

void svt_aom_highbd_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 32 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 4; i++) v_pred_32x8(&dst, stride, above0, above1);
}

// 32x64

void svt_aom_highbd_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 32 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 8; i++) v_pred_32x8(&dst, stride, above0, above1);
}

// -----------------------------------------------------------------------------

// 64xN

static INLINE void v_pred_64(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
                             const __m256i above1, const __m256i above2, const __m256i above3) {
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), above0);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), above1);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), above2);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), above3);
    *dst += stride;
}

// Process 8 rows.
static INLINE void v_pred_64x8(uint16_t **const dst, const ptrdiff_t stride, const __m256i above0,
                               const __m256i above1, const __m256i above2, const __m256i above3) {
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
    v_pred_64(dst, stride, above0, above1, above2, above3);
}

// 64x16

void svt_aom_highbd_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 64 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
    const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 2; i++) v_pred_64x8(&dst, stride, above0, above1, above2, above3);
}

// 64x32

void svt_aom_highbd_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 64 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
    const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 4; i++) v_pred_64x8(&dst, stride, above0, above1, above2, above3);
}

// 64x64

void svt_aom_highbd_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
                                           const uint16_t *left, int32_t bd) {
    // Load all 64 pixels in a row into 256-bit registers.
    const __m256i above0 = _mm256_loadu_si256((const __m256i *)(above + 0x00));
    const __m256i above1 = _mm256_loadu_si256((const __m256i *)(above + 0x10));
    const __m256i above2 = _mm256_loadu_si256((const __m256i *)(above + 0x20));
    const __m256i above3 = _mm256_loadu_si256((const __m256i *)(above + 0x30));

    (void)left;
    (void)bd;

    for (int32_t i = 0; i < 8; i++) v_pred_64x8(&dst, stride, above0, above1, above2, above3);
}

// =============================================================================

// Smooth prediction weights, with each 128-bit row repeated in both lanes for
// AVX2.

// bs = 4
EB_ALIGN(32)
static const uint16_t sm_weights_d_4[16] = {
    255, 1, 149, 107, 85, 171, 64, 192, // 0 1 2 3
    255, 1, 149, 107, 85, 171, 64, 192 // 0 1 2 3
};
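
// Each pair in these tables is (w, 256 - w) for a smooth weight w, so a
// single _mm256_madd_epi16 against interleaved (near, far) pixel pairs can
// yield w * near + (256 - w) * far per 32-bit lane (a sketch of the intended
// use; the consumers follow later in the file).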
1237
1238 // bs = 8
1239 EB_ALIGN(32)
1240 static const uint16_t sm_weights_d_8[32] = {
1241 255, 1, 197, 59, 146, 110, 105, 151, // 0 1 2 3
1242 255, 1, 197, 59, 146, 110, 105, 151, // 0 1 2 3
1243 73, 183, 50, 206, 37, 219, 32, 224, // 4 5 6 7
1244 73, 183, 50, 206, 37, 219, 32, 224 // 4 5 6 7
1245 };
1246
1247 // bs = 16
1248 EB_ALIGN(32)
1249 static const uint16_t sm_weights_d_16[64] = {
1250 255, 1, 225, 31, 196, 60, 170, 86, // 0 1 2 3
1251 255, 1, 225, 31, 196, 60, 170, 86, // 0 1 2 3
1252 145, 111, 123, 133, 102, 154, 84, 172, // 4 5 6 7
1253 145, 111, 123, 133, 102, 154, 84, 172, // 4 5 6 7
1254 68, 188, 54, 202, 43, 213, 33, 223, // 8 9 10 11
1255 68, 188, 54, 202, 43, 213, 33, 223, // 8 9 10 11
1256 26, 230, 20, 236, 17, 239, 16, 240, // 12 13 14 15
1257 26, 230, 20, 236, 17, 239, 16, 240 // 12 13 14 15
1258 };
1259
1260 // bs = 32
1261 EB_ALIGN(32)
1262 static const uint16_t sm_weights_d_32[128] = {
1263 255, 1, 240, 16, 225, 31, 210, 46, // 0 1 2 3
1264 255, 1, 240, 16, 225, 31, 210, 46, // 0 1 2 3
1265 196, 60, 182, 74, 169, 87, 157, 99, // 4 5 6 7
1266 196, 60, 182, 74, 169, 87, 157, 99, // 4 5 6 7
1267 145, 111, 133, 123, 122, 134, 111, 145, // 8 9 10 11
1268 145, 111, 133, 123, 122, 134, 111, 145, // 8 9 10 11
1269 101, 155, 92, 164, 83, 173, 74, 182, // 12 13 14 15
1270 101, 155, 92, 164, 83, 173, 74, 182, // 12 13 14 15
1271 66, 190, 59, 197, 52, 204, 45, 211, // 16 17 18 19
1272 66, 190, 59, 197, 52, 204, 45, 211, // 16 17 18 19
1273 39, 217, 34, 222, 29, 227, 25, 231, // 20 21 22 23
1274 39, 217, 34, 222, 29, 227, 25, 231, // 20 21 22 23
1275 21, 235, 17, 239, 14, 242, 12, 244, // 24 25 26 27
1276 21, 235, 17, 239, 14, 242, 12, 244, // 24 25 26 27
1277 10, 246, 9, 247, 8, 248, 8, 248, // 28 29 30 31
1278 10, 246, 9, 247, 8, 248, 8, 248 // 28 29 30 31
1279 };
1280
1281 // bs = 64
1282 EB_ALIGN(32)
1283 static const uint16_t sm_weights_d_64[256] = {
1284 255, 1, 248, 8, 240, 16, 233, 23, // 0 1 2 3
1285 255, 1, 248, 8, 240, 16, 233, 23, // 0 1 2 3
1286 225, 31, 218, 38, 210, 46, 203, 53, // 4 5 6 7
1287 225, 31, 218, 38, 210, 46, 203, 53, // 4 5 6 7
1288 196, 60, 189, 67, 182, 74, 176, 80, // 8 9 10 11
1289 196, 60, 189, 67, 182, 74, 176, 80, // 8 9 10 11
1290 169, 87, 163, 93, 156, 100, 150, 106, // 12 13 14 15
1291 169, 87, 163, 93, 156, 100, 150, 106, // 12 13 14 15
1292 144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1293 144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
1294 121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1295 121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
1296 101, 155, 96, 160, 91, 165, 86, 170, // 24 25 26 27
1297 101, 155, 96, 160, 91, 165, 86, 170, // 24 25 26 27
1298 82, 174, 77, 179, 73, 183, 69, 187, // 28 29 30 31
1299 82, 174, 77, 179, 73, 183, 69, 187, // 28 29 30 31
1300 65, 191, 61, 195, 57, 199, 54, 202, // 32 33 34 35
1301 65, 191, 61, 195, 57, 199, 54, 202, // 32 33 34 35
1302 50, 206, 47, 209, 44, 212, 41, 215, // 36 37 38 39
1303 50, 206, 47, 209, 44, 212, 41, 215, // 36 37 38 39
1304 38, 218, 35, 221, 32, 224, 29, 227, // 40 41 42 43
1305 38, 218, 35, 221, 32, 224, 29, 227, // 40 41 42 43
1306 27, 229, 25, 231, 22, 234, 20, 236, // 44 45 46 47
1307 27, 229, 25, 231, 22, 234, 20, 236, // 44 45 46 47
1308 18, 238, 16, 240, 15, 241, 13, 243, // 48 49 50 51
1309 18, 238, 16, 240, 15, 241, 13, 243, // 48 49 50 51
1310 12, 244, 10, 246, 9, 247, 8, 248, // 52 53 54 55
1311 12, 244, 10, 246, 9, 247, 8, 248, // 52 53 54 55
1312 7, 249, 6, 250, 6, 250, 5, 251, // 56 57 58 59
1313 7, 249, 6, 250, 6, 250, 5, 251, // 56 57 58 59
1314 5, 251, 4, 252, 4, 252, 4, 252, // 60 61 62 63
1315 5, 251, 4, 252, 4, 252, 4, 252 // 60 61 62 63
1316 };

// -----------------------------------------------------------------------------

// Weight tables with the row groups reordered ("shuffled") to match the lane
// layout produced by _mm256_unpacklo_epi16/_mm256_unpackhi_epi16.

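// For example, sm_weights_16 below packs columns 0-3 and 8-11 into its first
// 256-bit vector and columns 4-7 and 12-15 into its second, exactly how the
// unpack instructions split a 16-pixel row between the two 128-bit lanes.
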
// bs = 16
EB_ALIGN(32)
static const uint16_t sm_weights_16[32] = {
    255, 1, 225, 31, 196, 60, 170, 86, // 0 1 2 3
    68, 188, 54, 202, 43, 213, 33, 223, // 8 9 10 11
    145, 111, 123, 133, 102, 154, 84, 172, // 4 5 6 7
    26, 230, 20, 236, 17, 239, 16, 240 // 12 13 14 15
};

// bs = 32
EB_ALIGN(32)
static const uint16_t sm_weights_32[64] = {
    255, 1, 240, 16, 225, 31, 210, 46, // 0 1 2 3
    145, 111, 133, 123, 122, 134, 111, 145, // 8 9 10 11
    196, 60, 182, 74, 169, 87, 157, 99, // 4 5 6 7
    101, 155, 92, 164, 83, 173, 74, 182, // 12 13 14 15
    66, 190, 59, 197, 52, 204, 45, 211, // 16 17 18 19
    21, 235, 17, 239, 14, 242, 12, 244, // 24 25 26 27
    39, 217, 34, 222, 29, 227, 25, 231, // 20 21 22 23
    10, 246, 9, 247, 8, 248, 8, 248 // 28 29 30 31
};

// bs = 64
EB_ALIGN(32)
static const uint16_t sm_weights_64[128] = {
    255, 1, 248, 8, 240, 16, 233, 23, // 0 1 2 3
    196, 60, 189, 67, 182, 74, 176, 80, // 8 9 10 11
    225, 31, 218, 38, 210, 46, 203, 53, // 4 5 6 7
    169, 87, 163, 93, 156, 100, 150, 106, // 12 13 14 15
    144, 112, 138, 118, 133, 123, 127, 129, // 16 17 18 19
    101, 155, 96, 160, 91, 165, 86, 170, // 24 25 26 27
    121, 135, 116, 140, 111, 145, 106, 150, // 20 21 22 23
    82, 174, 77, 179, 73, 183, 69, 187, // 28 29 30 31
    65, 191, 61, 195, 57, 199, 54, 202, // 32 33 34 35
    38, 218, 35, 221, 32, 224, 29, 227, // 40 41 42 43
    50, 206, 47, 209, 44, 212, 41, 215, // 36 37 38 39
    27, 229, 25, 231, 22, 234, 20, 236, // 44 45 46 47
    18, 238, 16, 240, 15, 241, 13, 243, // 48 49 50 51
    7, 249, 6, 250, 6, 250, 5, 251, // 56 57 58 59
    12, 244, 10, 246, 9, 247, 8, 248, // 52 53 54 55
    5, 251, 4, 252, 4, 252, 4, 252 // 60 61 62 63
};

// SMOOTH_PRED

// 8xN

static INLINE void load_right_weights_8(const uint16_t *const above, __m256i *const r,
                                        __m256i *const weights) {
    *r = _mm256_set1_epi16((uint16_t)above[7]);

    // 0 1 2 3 0 1 2 3
    weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_d_8 + 0x00));
    // 4 5 6 7 4 5 6 7
    weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_d_8 + 0x10));
}

static INLINE __m256i load_left_4(const uint16_t *const left, const __m256i r) {
    const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
    // 0 1 2 3 x x x x 0 1 2 3 x x x x
    const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
    return _mm256_unpacklo_epi16(l, r); // 0 1 2 3 0 1 2 3
}

static INLINE void load_left_8(const uint16_t *const left, const __m256i r, __m256i *const lr) {
    const __m128i l0 = _mm_loadu_si128((const __m128i *)left);
    // 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
    const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l0, 1);
    lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3 0 1 2 3
    lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7 4 5 6 7
}

static INLINE void init_8(const uint16_t *const above, const uint16_t *const left, const int32_t h,
                          __m256i *const ab, __m256i *const r, __m256i *const weights_w,
                          __m256i *const rep) {
    const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
    const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    __m256i a[2];
    a[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
    a[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
    ab[0] = _mm256_unpacklo_epi16(a[0], b);
    ab[1] = _mm256_unpacklo_epi16(a[1], b);
    load_right_weights_8(above, r, weights_w);

    const __m128i rep0 = _mm_set1_epi32(0x03020100);
    const __m128i rep1 = _mm_set1_epi32(0x07060504);
    const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
    const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
    rep[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
    rep[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
}
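
// Note on the rep masks: with _mm256_shuffle_epi8, 0x03020100 broadcasts
// bytes 0-3 of each 128-bit lane, i.e. the first interleaved uint16_t pair,
// to every 32-bit slot of that lane. Each rep constant thus picks one row's
// (weight, complement) and (left, right) pair per lane for the madd inputs.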

static INLINE __m256i smooth_pred_kernel(const __m256i *const weights_w, const __m256i weights_h,
                                         const __m256i rep, const __m256i *const ab,
                                         const __m256i lr) {
    const __m256i round = _mm256_set1_epi32((1 << sm_weight_log2_scale));
    __m256i s[2], sum[2];
    // 0 0 0 0 1 1 1 1
    const __m256i w = _mm256_shuffle_epi8(weights_h, rep);
    const __m256i t = _mm256_shuffle_epi8(lr, rep);
    s[0] = _mm256_madd_epi16(ab[0], w);
    s[1] = _mm256_madd_epi16(ab[1], w);
    // width 8: 00 01 02 03 10 11 12 13
    // width 16: 0 1 2 3 8 9 A B
    sum[0] = _mm256_madd_epi16(t, weights_w[0]);
    // width 8: 04 05 06 07 14 15 16 17
    // width 16: 4 5 6 7 C D E F
    sum[1] = _mm256_madd_epi16(t, weights_w[1]);
    sum[0] = _mm256_add_epi32(sum[0], s[0]);
    sum[1] = _mm256_add_epi32(sum[1], s[1]);
    sum[0] = _mm256_add_epi32(sum[0], round);
    sum[1] = _mm256_add_epi32(sum[1], round);
    sum[0] = _mm256_srai_epi32(sum[0], 1 + sm_weight_log2_scale);
    sum[1] = _mm256_srai_epi32(sum[1], 1 + sm_weight_log2_scale);
    // width 8: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
    // width 16: 0 1 2 3 4 5 6 7 8 9 A B C D E F
    return _mm256_packs_epi32(sum[0], sum[1]);
}
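
// A scalar sketch of what the kernel computes per pixel, assuming the AV1
// SMOOTH_PRED definition with sm_weight_log2_scale == 8 (weight pairs sum to
// 256):
//
//     pred[r][c] = (w_row[r] * above[c] + (256 - w_row[r]) * left[h - 1] +
//                   w_col[c] * left[r]  + (256 - w_col[c]) * above[w - 1] +
//                   256) >> 9;
//
// ab[] interleaves above[] with the bottom-left pixel and lr interleaves
// left[] with the top-right pixel, so two madds plus one round/shift cover
// the whole expression.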

static INLINE void smooth_pred_8x2(const __m256i *const weights_w, const __m256i weights_h,
                                   const __m256i rep, const __m256i *const ab, const __m256i lr,
                                   uint16_t **const dst, const ptrdiff_t stride) {
    // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
    const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    *dst += stride;
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    *dst += stride;
}

static INLINE void smooth_pred_8x4(const __m256i *const weights_w,
                                   const uint16_t *const sm_weights_h, const __m256i *const rep,
                                   const __m256i *const ab, const __m256i lr, uint16_t **const dst,
                                   const ptrdiff_t stride) {
    const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_pred_8x2(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    smooth_pred_8x2(weights_w, weights_h, rep[1], ab, lr, dst, stride);
}

static INLINE void smooth_pred_8x8(const uint16_t *const left, const __m256i *const weights_w,
                                   const uint16_t *const sm_weights_h, const __m256i *const rep,
                                   const __m256i *const ab, const __m256i r, uint16_t **const dst,
                                   const ptrdiff_t stride) {
    __m256i lr[2];
    load_left_8(left, r, lr);

    smooth_pred_8x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    smooth_pred_8x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
}

// 8x4

void svt_aom_highbd_smooth_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above, const uint16_t *left,
                                              int32_t bd) {
    __m256i ab[2], r, lr, weights_w[2], rep[2];
    (void)bd;

    init_8(above, left, 4, ab, &r, weights_w, rep);
    lr = load_left_4(left, r);
    smooth_pred_8x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
}

// 8x8

void svt_aom_highbd_smooth_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *above, const uint16_t *left,
                                              int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[2];
    (void)bd;

    init_8(above, left, 8, ab, &r, weights_w, rep);

    smooth_pred_8x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
}

// 8x16

void svt_aom_highbd_smooth_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[2];
    (void)bd;

    init_8(above, left, 16, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 2; i++) {
        smooth_pred_8x8(
            left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 8x32

void svt_aom_highbd_smooth_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[2];
    (void)bd;

    init_8(above, left, 32, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 4; i++) {
        smooth_pred_8x8(
            left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// -----------------------------------------------------------------------------
// 16xN

static INLINE void load_right_weights_16(const uint16_t *const above, __m256i *const r,
                                         __m256i *const weights) {
    *r = _mm256_set1_epi16((uint16_t)above[15]);

    // 0 1 2 3 8 9 10 11
    weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_16 + 0x00));
    // 4 5 6 7 12 13 14 15
    weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_16 + 0x10));
}

static INLINE void prepare_ab(const uint16_t *const above, const __m256i b, __m256i *const ab) {
    const __m256i a = _mm256_loadu_si256((const __m256i *)above);
    ab[0] = _mm256_unpacklo_epi16(a, b);
    ab[1] = _mm256_unpackhi_epi16(a, b);
}
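
// prepare_ab() interleaves each above[] pixel with the bottom-left pixel b
// into (above[c], b) pairs; together with the (w, 256 - w) weight pairs, one
// _mm256_madd_epi16 then yields w * above[c] + (256 - w) * b per pixel.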

static INLINE void init_16(const uint16_t *const above, const uint16_t *const left, const int32_t h,
                           __m256i *const ab, __m256i *const r, __m256i *const weights_w,
                           __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    prepare_ab(above, b, ab);
    load_right_weights_16(above, r, weights_w);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_pred_16(const __m256i *const weights_w, const __m256i weights_h,
                                  const __m256i rep, const __m256i *const ab, const __m256i lr,
                                  uint16_t **const dst, const ptrdiff_t stride) {
    const __m256i d = smooth_pred_kernel(weights_w, weights_h, rep, ab, lr);
    _mm256_storeu_si256((__m256i *)*dst, d);
    *dst += stride;
}

static INLINE void smooth_pred_16x4(const __m256i *const weights_w,
                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
                                    const __m256i *const ab, const __m256i lr, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_pred_16(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    smooth_pred_16(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    smooth_pred_16(weights_w, weights_h, rep[2], ab, lr, dst, stride);
    smooth_pred_16(weights_w, weights_h, rep[3], ab, lr, dst, stride);
}

static INLINE void smooth_pred_16x8(const uint16_t *const left, const __m256i *const weights_w,
                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
                                    const __m256i *const ab, const __m256i r, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i lr[2];
    load_left_8(left, r, lr);

    smooth_pred_16x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    smooth_pred_16x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
}

// 16x4

void svt_aom_highbd_smooth_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    __m256i ab[2], r, lr, weights_w[2], rep[4];
    (void)bd;

    init_16(above, left, 4, ab, &r, weights_w, rep);
    lr = load_left_4(left, r);
    smooth_pred_16x4(weights_w, sm_weights_d_4, rep, ab, lr, &dst, stride);
}

// 16x8

void svt_aom_highbd_smooth_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[4];
    (void)bd;

    init_16(above, left, 8, ab, &r, weights_w, rep);

    smooth_pred_16x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
}

// 16x16

void svt_aom_highbd_smooth_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[4];
    (void)bd;

    init_16(above, left, 16, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 2; i++) {
        smooth_pred_16x8(
            left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 16x32

void svt_aom_highbd_smooth_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[4];
    (void)bd;

    init_16(above, left, 32, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 4; i++) {
        smooth_pred_16x8(
            left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 16x64

void svt_aom_highbd_smooth_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], r, weights_w[2], rep[4];
    (void)bd;

    init_16(above, left, 64, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 8; i++) {
        smooth_pred_16x8(
            left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// -----------------------------------------------------------------------------
// 32xN

static INLINE void load_right_weights_32(const uint16_t *const above, __m256i *const r,
                                         __m256i *const weights) {
    *r = _mm256_set1_epi16((uint16_t)above[31]);

    // 0 1 2 3 8 9 10 11
    weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x00));
    // 4 5 6 7 12 13 14 15
    weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x10));
    // 16 17 18 19 24 25 26 27
    weights[2] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x20));
    // 20 21 22 23 28 29 30 31
    weights[3] = _mm256_loadu_si256((const __m256i *)(sm_weights_32 + 0x30));
}

static INLINE void init_32(const uint16_t *const above, const uint16_t *const left, const int32_t h,
                           __m256i *const ab, __m256i *const r, __m256i *const weights_w,
                           __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    prepare_ab(above + 0x00, b, ab + 0);
    prepare_ab(above + 0x10, b, ab + 2);
    load_right_weights_32(above, r, weights_w);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_pred_32(const __m256i *const weights_w, const __m256i weights_h,
                                  const __m256i rep, const __m256i *const ab, const __m256i lr,
                                  uint16_t **const dst, const ptrdiff_t stride) {
    __m256i d;

    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    *dst += stride;
}

static INLINE void smooth_pred_32x4(const __m256i *const weights_w,
                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
                                    const __m256i *const ab, const __m256i lr, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_pred_32(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    smooth_pred_32(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    smooth_pred_32(weights_w, weights_h, rep[2], ab, lr, dst, stride);
    smooth_pred_32(weights_w, weights_h, rep[3], ab, lr, dst, stride);
}

static INLINE void smooth_pred_32x8(const uint16_t *const left, const __m256i *const weights_w,
                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
                                    const __m256i *const ab, const __m256i r, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i lr[2];
    load_left_8(left, r, lr);

    smooth_pred_32x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    smooth_pred_32x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
}

// 32x8

void svt_aom_highbd_smooth_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *above, const uint16_t *left,
                                               int32_t bd) {
    __m256i ab[4], r, weights_w[4], rep[4];
    (void)bd;

    init_32(above, left, 8, ab, &r, weights_w, rep);

    smooth_pred_32x8(left, weights_w, sm_weights_d_8, rep, ab, r, &dst, stride);
}

// 32x16

void svt_aom_highbd_smooth_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[4], r, weights_w[4], rep[4];
    (void)bd;

    init_32(above, left, 16, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 2; i++) {
        smooth_pred_32x8(
            left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 32x32

void svt_aom_highbd_smooth_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[4], r, weights_w[4], rep[4];
    (void)bd;

    init_32(above, left, 32, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 4; i++) {
        smooth_pred_32x8(
            left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 32x64

void svt_aom_highbd_smooth_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[4], r, weights_w[4], rep[4];
    (void)bd;

    init_32(above, left, 64, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 8; i++) {
        smooth_pred_32x8(
            left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// -----------------------------------------------------------------------------
// 64xN

static INLINE void load_right_weights_64(const uint16_t *const above, __m256i *const r,
                                         __m256i *const weights) {
    *r = _mm256_set1_epi16((uint16_t)above[63]);

    // 0 1 2 3 8 9 10 11
    weights[0] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x00));
    // 4 5 6 7 12 13 14 15
    weights[1] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x10));
    // 16 17 18 19 24 25 26 27
    weights[2] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x20));
    // 20 21 22 23 28 29 30 31
    weights[3] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x30));
    // 32 33 34 35 40 41 42 43
    weights[4] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x40));
    // 36 37 38 39 44 45 46 47
    weights[5] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x50));
    // 48 49 50 51 56 57 58 59
    weights[6] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x60));
    // 52 53 54 55 60 61 62 63
    weights[7] = _mm256_loadu_si256((const __m256i *)(sm_weights_64 + 0x70));
}

static INLINE void init_64(const uint16_t *const above, const uint16_t *const left, const int32_t h,
                           __m256i *const ab, __m256i *const r, __m256i *const weights_w,
                           __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    prepare_ab(above + 0x00, b, ab + 0);
    prepare_ab(above + 0x10, b, ab + 2);
    prepare_ab(above + 0x20, b, ab + 4);
    prepare_ab(above + 0x30, b, ab + 6);
    load_right_weights_64(above, r, weights_w);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_pred_64(const __m256i *const weights_w, const __m256i weights_h,
                                  const __m256i rep, const __m256i *const ab, const __m256i lr,
                                  uint16_t **const dst, const ptrdiff_t stride) {
    __m256i d;

    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    d = smooth_pred_kernel(weights_w + 0, weights_h, rep, ab + 0, lr);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    d = smooth_pred_kernel(weights_w + 2, weights_h, rep, ab + 2, lr);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);

    // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
    d = smooth_pred_kernel(weights_w + 4, weights_h, rep, ab + 4, lr);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);

    // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
    d = smooth_pred_kernel(weights_w + 6, weights_h, rep, ab + 6, lr);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    *dst += stride;
}

static INLINE void smooth_pred_64x4(const __m256i *const weights_w,
                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
                                    const __m256i *const ab, const __m256i lr, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    const __m256i weights_h = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_pred_64(weights_w, weights_h, rep[0], ab, lr, dst, stride);
    smooth_pred_64(weights_w, weights_h, rep[1], ab, lr, dst, stride);
    smooth_pred_64(weights_w, weights_h, rep[2], ab, lr, dst, stride);
    smooth_pred_64(weights_w, weights_h, rep[3], ab, lr, dst, stride);
}

static INLINE void smooth_pred_64x8(const uint16_t *const left, const __m256i *const weights_w,
                                    const uint16_t *const sm_weights_h, const __m256i *const rep,
                                    const __m256i *const ab, const __m256i r, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i lr[2];
    load_left_8(left, r, lr);

    smooth_pred_64x4(weights_w, sm_weights_h + 0, rep, ab, lr[0], dst, stride);
    smooth_pred_64x4(weights_w, sm_weights_h + 16, rep, ab, lr[1], dst, stride);
}

// 64x16

void svt_aom_highbd_smooth_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[8], r, weights_w[8], rep[4];
    (void)bd;

    init_64(above, left, 16, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 2; i++) {
        smooth_pred_64x8(
            left + 8 * i, weights_w, sm_weights_d_16 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 64x32

void svt_aom_highbd_smooth_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[8], r, weights_w[8], rep[4];
    (void)bd;

    init_64(above, left, 32, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 4; i++) {
        smooth_pred_64x8(
            left + 8 * i, weights_w, sm_weights_d_32 + 32 * i, rep, ab, r, &dst, stride);
    }
}

// 64x64

void svt_aom_highbd_smooth_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[8], r, weights_w[8], rep[4];
    (void)bd;

    init_64(above, left, 64, ab, &r, weights_w, rep);

    for (int32_t i = 0; i < 8; i++) {
        smooth_pred_64x8(
            left + 8 * i, weights_w, sm_weights_d_64 + 32 * i, rep, ab, r, &dst, stride);
    }
}
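
// The smooth WxH drivers above share one tiling scheme: initialize once, then
// emit eight rows per iteration, stepping the left column by 8 pixels and the
// height-weight table by 32 uint16_t (8 rows of duplicated weight pairs).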

// =============================================================================

// SMOOTH_H_PRED

// 8xN

static INLINE __m256i smooth_h_pred_kernel(const __m256i *const weights, const __m256i lr) {
    const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    __m256i sum[2];
    // width 8: 00 01 02 03 10 11 12 13
    // width 16: 0 1 2 3 8 9 A B
    sum[0] = _mm256_madd_epi16(lr, weights[0]);
    // width 8: 04 05 06 07 14 15 16 17
    // width 16: 4 5 6 7 C D E F
    sum[1] = _mm256_madd_epi16(lr, weights[1]);
    sum[0] = _mm256_add_epi32(sum[0], round);
    sum[1] = _mm256_add_epi32(sum[1], round);
    sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
    sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
    // width 8: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
    // width 16: 0 1 2 3 4 5 6 7 8 9 A B C D E F
    return _mm256_packs_epi32(sum[0], sum[1]);
}
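
// A scalar sketch, assuming the AV1 SMOOTH_H_PRED definition with
// sm_weight_log2_scale == 8:
//
//     pred[r][c] = (w_col[c] * left[r] + (256 - w_col[c]) * above[w - 1] +
//                   128) >> 8;
//
// Only the horizontal weighted average remains, so this kernel rounds with
// (1 << 7) and shifts by 8, not 9 as in the full SMOOTH_PRED kernel.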

static INLINE void smooth_h_pred_8x2(const __m256i *const weights, __m256i *const lr,
                                     uint16_t **const dst, const ptrdiff_t stride) {
    const __m256i rep = _mm256_set1_epi32(0x03020100);
    // lr: 0 1 2 3 1 2 3 4
    const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 1 1 1 1
    // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
    const __m256i d = smooth_h_pred_kernel(weights, t);
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    *dst += stride;
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    *dst += stride;
    *lr = _mm256_srli_si256(*lr, 8); // 2 3 x x 3 4 x x
}

static INLINE void smooth_h_pred_8x4(const __m256i *const weights, __m256i *const lr,
                                     uint16_t **const dst, const ptrdiff_t stride) {
    smooth_h_pred_8x2(weights, lr, dst, stride);
    smooth_h_pred_8x2(weights, lr, dst, stride);
}

static INLINE void smooth_h_pred_8x8(const uint16_t *const left, const __m256i r,
                                     const __m256i *const weights, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    const __m128i l0 = _mm_loadu_si128((const __m128i *)left);
    const __m128i l1 = _mm_srli_si128(l0, 2);
    // 0 1 2 3 4 5 6 7 1 2 3 4 5 6 7 x
    const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
    __m256i lr[2];
    lr[0] = _mm256_unpacklo_epi16(l, r); // 0 1 2 3 1 2 3 4
    lr[1] = _mm256_unpackhi_epi16(l, r); // 4 5 6 7 5 6 7 x
    smooth_h_pred_8x4(weights, &lr[0], dst, stride);
    smooth_h_pred_8x4(weights, &lr[1], dst, stride);
}

// 8x4

void svt_aom_highbd_smooth_h_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    const __m128i l0 = _mm_loadl_epi64((const __m128i *)left);
    const __m128i l1 = _mm_srli_si128(l0, 2);
    // 0 1 2 3 x x x x 1 2 3 4 x x x x
    const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(l0), l1, 1);
    __m256i r, weights[2];
    (void)bd;

    load_right_weights_8(above, &r, weights);
    __m256i lr = _mm256_unpacklo_epi16(l, r); // 0 1 2 3 1 2 3 4
    smooth_h_pred_8x4(weights, &lr, &dst, stride);
}

// 8x8

void svt_aom_highbd_smooth_h_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i r, weights[2];
    (void)bd;

    load_right_weights_8(above, &r, weights);
    smooth_h_pred_8x8(left, r, weights, &dst, stride);
}

// 8x16

void svt_aom_highbd_smooth_h_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i r, weights[2];
    (void)bd;

    load_right_weights_8(above, &r, weights);
    smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
    smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
}

// 8x32

void svt_aom_highbd_smooth_h_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i r, weights[2];
    (void)bd;

    load_right_weights_8(above, &r, weights);

    for (int32_t i = 0; i < 2; i++) {
        smooth_h_pred_8x8(left + 0, r, weights, &dst, stride);
        smooth_h_pred_8x8(left + 8, r, weights, &dst, stride);
        left += 16;
    }
}

// -----------------------------------------------------------------------------
// 16xN

static INLINE void smooth_h_pred_16(const __m256i *const weights, __m256i *const lr,
                                    uint16_t **const dst, const ptrdiff_t stride) {
    const __m256i rep = _mm256_set1_epi32(0x03020100);
    // lr: 0 1 2 3 0 1 2 3
    const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 0 0 0 0
    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    const __m256i d = smooth_h_pred_kernel(weights, t);
    _mm256_storeu_si256((__m256i *)*dst, d);
    *dst += stride;
    *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x 1 2 3 x
}

static INLINE void smooth_h_pred_16x4(const __m256i *const weights, __m256i *const lr,
                                      uint16_t **const dst, const ptrdiff_t stride) {
    smooth_h_pred_16(weights, lr, dst, stride);
    smooth_h_pred_16(weights, lr, dst, stride);
    smooth_h_pred_16(weights, lr, dst, stride);
    smooth_h_pred_16(weights, lr, dst, stride);
}

static INLINE void smooth_h_pred_16x8(const uint16_t *const left, const __m256i r,
                                      const __m256i *const weights, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    __m256i lr[2];
    load_left_8(left, r, lr);
    smooth_h_pred_16x4(weights, &lr[0], dst, stride);
    smooth_h_pred_16x4(weights, &lr[1], dst, stride);
}

static INLINE void smooth_h_predictor_16x16(uint16_t *dst, const ptrdiff_t stride,
                                            const uint16_t *const above, const uint16_t *left,
                                            const int32_t n) {
    __m256i r, weights[2];

    load_right_weights_16(above, &r, weights);

    for (int32_t i = 0; i < n; i++) {
        smooth_h_pred_16x8(left + 0, r, weights, &dst, stride);
        smooth_h_pred_16x8(left + 8, r, weights, &dst, stride);
        left += 16;
    }
}
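
// smooth_h_predictor_16x16() emits n 16-row tiles, so the 16x16, 16x32 and
// 16x64 entry points below can share one body with n = 1, 2 and 4.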

// 16x4

void svt_aom_highbd_smooth_h_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i r, lr, weights[2];
    (void)bd;

    load_right_weights_16(above, &r, weights);
    lr = load_left_4(left, r);
    smooth_h_pred_16x4(weights, &lr, &dst, stride);
}

// 16x8

void svt_aom_highbd_smooth_h_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i r, weights[2];
    (void)bd;

    load_right_weights_16(above, &r, weights);
    smooth_h_pred_16x8(left, r, weights, &dst, stride);
}

// 16x16

void svt_aom_highbd_smooth_h_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_predictor_16x16(dst, stride, above, left, 1);
}

// 16x32

void svt_aom_highbd_smooth_h_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_predictor_16x16(dst, stride, above, left, 2);
}

// 16x64

void svt_aom_highbd_smooth_h_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_predictor_16x16(dst, stride, above, left, 4);
}

// -----------------------------------------------------------------------------
// 32xN

static INLINE void smooth_h_pred_32(const __m256i *const weights, __m256i *const lr,
                                    uint16_t **const dst, const ptrdiff_t stride) {
    const __m256i rep = _mm256_set1_epi32(0x03020100);
    // lr: 0 1 2 3 0 1 2 3
    const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 0 0 0 0
    __m256i d;

    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    d = smooth_h_pred_kernel(weights + 0, t);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    d = smooth_h_pred_kernel(weights + 2, t);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    *dst += stride;
    *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x 1 2 3 x
}

static INLINE void smooth_h_pred_32x4(const __m256i *const weights, __m256i *const lr,
                                      uint16_t **const dst, const ptrdiff_t stride) {
    smooth_h_pred_32(weights, lr, dst, stride);
    smooth_h_pred_32(weights, lr, dst, stride);
    smooth_h_pred_32(weights, lr, dst, stride);
    smooth_h_pred_32(weights, lr, dst, stride);
}

static INLINE void smooth_h_pred_32x8(uint16_t *dst, const ptrdiff_t stride,
                                      const uint16_t *const above, const uint16_t *left,
                                      const int32_t n) {
    __m256i r, lr[2], weights[4];

    load_right_weights_32(above, &r, weights);

    for (int32_t i = 0; i < n; i++) {
        load_left_8(left, r, lr);
        smooth_h_pred_32x4(weights, &lr[0], &dst, stride);
        smooth_h_pred_32x4(weights, &lr[1], &dst, stride);
        left += 8;
    }
}

// 32x8

void svt_aom_highbd_smooth_h_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    (void)bd;
    smooth_h_pred_32x8(dst, stride, above, left, 1);
}

// 32x16

void svt_aom_highbd_smooth_h_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_pred_32x8(dst, stride, above, left, 2);
}

// 32x32

void svt_aom_highbd_smooth_h_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_pred_32x8(dst, stride, above, left, 4);
}

// 32x64

void svt_aom_highbd_smooth_h_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_pred_32x8(dst, stride, above, left, 8);
}

// -----------------------------------------------------------------------------
// 64xN

static INLINE void smooth_h_pred_64(const __m256i *const weights, __m256i *const lr,
                                    uint16_t **const dst, const ptrdiff_t stride) {
    const __m256i rep = _mm256_set1_epi32(0x03020100);
    // lr: 0 1 2 3 0 1 2 3
    const __m256i t = _mm256_shuffle_epi8(*lr, rep); // 0 0 0 0 0 0 0 0
    __m256i d;

    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    d = smooth_h_pred_kernel(weights + 0, t);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    d = smooth_h_pred_kernel(weights + 2, t);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);

    // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
    d = smooth_h_pred_kernel(weights + 4, t);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);

    // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
    d = smooth_h_pred_kernel(weights + 6, t);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    *dst += stride;
    *lr = _mm256_srli_si256(*lr, 4); // 1 2 3 x 1 2 3 x
}

static INLINE void smooth_h_pred_64x4(const __m256i *const weights, __m256i *const lr,
                                      uint16_t **const dst, const ptrdiff_t stride) {
    smooth_h_pred_64(weights, lr, dst, stride);
    smooth_h_pred_64(weights, lr, dst, stride);
    smooth_h_pred_64(weights, lr, dst, stride);
    smooth_h_pred_64(weights, lr, dst, stride);
}

static INLINE void smooth_h_pred_64x8(uint16_t *dst, const ptrdiff_t stride,
                                      const uint16_t *const above, const uint16_t *left,
                                      const int32_t n) {
    __m256i r, lr[2], weights[8];

    load_right_weights_64(above, &r, weights);

    for (int32_t i = 0; i < n; i++) {
        load_left_8(left, r, lr);
        smooth_h_pred_64x4(weights, &lr[0], &dst, stride);
        smooth_h_pred_64x4(weights, &lr[1], &dst, stride);
        left += 8;
    }
}

// 64x16

void svt_aom_highbd_smooth_h_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_pred_64x8(dst, stride, above, left, 2);
}

// 64x32

void svt_aom_highbd_smooth_h_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_pred_64x8(dst, stride, above, left, 4);
}

// 64x64

void svt_aom_highbd_smooth_h_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    (void)bd;
    smooth_h_pred_64x8(dst, stride, above, left, 8);
}

// =============================================================================

// SMOOTH_V_PRED

// 8xN

static INLINE void smooth_v_init_8(const uint16_t *const above, const uint16_t *const left,
                                   const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m128i a0 = _mm_loadl_epi64(((const __m128i *)(above + 0)));
    const __m128i a1 = _mm_loadl_epi64(((const __m128i *)(above + 4)));
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    __m256i a[2];
    a[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a0, 1);
    a[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(a1), a1, 1);
    ab[0] = _mm256_unpacklo_epi16(a[0], b);
    ab[1] = _mm256_unpacklo_epi16(a[1], b);

    const __m128i rep0 = _mm_set1_epi32(0x03020100);
    const __m128i rep1 = _mm_set1_epi32(0x07060504);
    const __m128i rep2 = _mm_set1_epi32(0x0B0A0908);
    const __m128i rep3 = _mm_set1_epi32(0x0F0E0D0C);
    rep[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep0), rep1, 1);
    rep[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(rep2), rep3, 1);
}

static INLINE __m256i smooth_v_pred_kernel(const __m256i weights, const __m256i rep,
                                           const __m256i *const ab) {
    const __m256i round = _mm256_set1_epi32((1 << (sm_weight_log2_scale - 1)));
    __m256i sum[2];
    // 0 0 0 0 1 1 1 1
    const __m256i w = _mm256_shuffle_epi8(weights, rep);
    sum[0] = _mm256_madd_epi16(ab[0], w);
    sum[1] = _mm256_madd_epi16(ab[1], w);
    sum[0] = _mm256_add_epi32(sum[0], round);
    sum[1] = _mm256_add_epi32(sum[1], round);
    sum[0] = _mm256_srai_epi32(sum[0], sm_weight_log2_scale);
    sum[1] = _mm256_srai_epi32(sum[1], sm_weight_log2_scale);
    // width 8: 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
    // width 16: 0 1 2 3 4 5 6 7 8 9 A B C D E F
    return _mm256_packs_epi32(sum[0], sum[1]);
}
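
// A scalar sketch, assuming the AV1 SMOOTH_V_PRED definition with
// sm_weight_log2_scale == 8:
//
//     pred[r][c] = (w_row[r] * above[c] + (256 - w_row[r]) * left[h - 1] +
//                   128) >> 8;
//
// This is the vertical half of SMOOTH_PRED: it reuses the ab[] interleave and
// the per-row weight broadcast, and drops the left/right madd entirely.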

static INLINE void smooth_v_pred_8x2(const __m256i weights, const __m256i rep,
                                     const __m256i *const ab, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
    const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 0));
    *dst += stride;
    _mm_storeu_si128((__m128i *)*dst, _mm256_extracti128_si256(d, 1));
    *dst += stride;
}

static INLINE void smooth_v_pred_8x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                     const __m256i *const ab, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_8x2(weights, rep[0], ab, dst, stride);
    smooth_v_pred_8x2(weights, rep[1], ab, dst, stride);
}

static INLINE void smooth_v_pred_8x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                     const __m256i *const ab, uint16_t **const dst,
                                     const ptrdiff_t stride) {
    smooth_v_pred_8x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_8x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 8x4

void svt_aom_highbd_smooth_v_predictor_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 4, ab, rep);
    smooth_v_pred_8x4(sm_weights_d_4, rep, ab, &dst, stride);
}

// 8x8

void svt_aom_highbd_smooth_v_predictor_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                const uint16_t *above, const uint16_t *left,
                                                int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 8, ab, rep);

    smooth_v_pred_8x8(sm_weights_d_8, rep, ab, &dst, stride);
}

// 8x16

void svt_aom_highbd_smooth_v_predictor_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_8x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 8x32

void svt_aom_highbd_smooth_v_predictor_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[2];
    (void)bd;

    smooth_v_init_8(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_8x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}
2424
2425 // -----------------------------------------------------------------------------
2426 // 16xN
2427
smooth_v_prepare_ab(const uint16_t * const above,const __m256i b,__m256i * const ab)2428 static INLINE void smooth_v_prepare_ab(const uint16_t *const above, const __m256i b,
2429 __m256i *const ab) {
2430 const __m256i a = _mm256_loadu_si256((const __m256i *)above);
2431 ab[0] = _mm256_unpacklo_epi16(a, b);
2432 ab[1] = _mm256_unpackhi_epi16(a, b);
2433 }
2434
smooth_v_init_16(const uint16_t * const above,const uint16_t * const left,const int32_t h,__m256i * const ab,__m256i * const rep)2435 static INLINE void smooth_v_init_16(const uint16_t *const above, const uint16_t *const left,
2436 const int32_t h, __m256i *const ab, __m256i *const rep) {
2437 const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
2438 smooth_v_prepare_ab(above, b, ab);
2439
2440 rep[0] = _mm256_set1_epi32(0x03020100);
2441 rep[1] = _mm256_set1_epi32(0x07060504);
2442 rep[2] = _mm256_set1_epi32(0x0B0A0908);
2443 rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
2444 }

static INLINE void smooth_v_pred_16(const __m256i weights, const __m256i rep,
                                    const __m256i *const ab, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    const __m256i d = smooth_v_pred_kernel(weights, rep, ab);
    _mm256_storeu_si256((__m256i *)*dst, d);
    *dst += stride;
}

static INLINE void smooth_v_pred_16x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_16(weights, rep[0], ab, dst, stride);
    smooth_v_pred_16(weights, rep[1], ab, dst, stride);
    smooth_v_pred_16(weights, rep[2], ab, dst, stride);
    smooth_v_pred_16(weights, rep[3], ab, dst, stride);
}
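
// Implied sm_weights_d_* layout: each 32-byte load spans a group of four
// rows, with the four duplicated (w, 256 - w) pairs seemingly mirrored into
// both 128-bit lanes so the lane-local rep[] shuffles select the same row
// weight everywhere. The "+ 16" and "+ 32 * i" offsets used below then step
// the table by four and eight rows respectively.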

static INLINE void smooth_v_pred_16x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    smooth_v_pred_16x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_16x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 16x4

void svt_aom_highbd_smooth_v_predictor_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 4, ab, rep);
    smooth_v_pred_16x4(sm_weights_d_4, rep, ab, &dst, stride);
}

// 16x8

void svt_aom_highbd_smooth_v_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 8, ab, rep);

    smooth_v_pred_16x8(sm_weights_d_8, rep, ab, &dst, stride);
}

// 16x16

void svt_aom_highbd_smooth_v_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_16x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 16x32

void svt_aom_highbd_smooth_v_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_16x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// 16x64

void svt_aom_highbd_smooth_v_predictor_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[2], rep[4];
    (void)bd;

    smooth_v_init_16(above, left, 64, ab, rep);

    for (int32_t i = 0; i < 8; i++)
        smooth_v_pred_16x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
}

// -----------------------------------------------------------------------------
// 32xN

static INLINE void smooth_v_init_32(const uint16_t *const above, const uint16_t *const left,
                                    const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    smooth_v_prepare_ab(above + 0x00, b, ab + 0);
    smooth_v_prepare_ab(above + 0x10, b, ab + 2);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}
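
// Relative to the 16xN path, smooth_v_init_32() simply prepares a second
// column group: ab[0..1] covers columns 0-15 and ab[2..3] columns 16-31.
// smooth_v_init_64() below extends the same idea to four groups.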

static INLINE void smooth_v_pred_32(const __m256i weights, const __m256i rep,
                                    const __m256i *const ab, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i d;

    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    d = smooth_v_pred_kernel(weights, rep, ab + 0);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    d = smooth_v_pred_kernel(weights, rep, ab + 2);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);
    *dst += stride;
}

static INLINE void smooth_v_pred_32x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_32(weights, rep[0], ab, dst, stride);
    smooth_v_pred_32(weights, rep[1], ab, dst, stride);
    smooth_v_pred_32(weights, rep[2], ab, dst, stride);
    smooth_v_pred_32(weights, rep[3], ab, dst, stride);
}

static INLINE void smooth_v_pred_32x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    smooth_v_pred_32x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_32x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 32x8

void svt_aom_highbd_smooth_v_predictor_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                                 const uint16_t *above, const uint16_t *left,
                                                 int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 8, ab, rep);

    smooth_v_pred_32x8(sm_weights_d_8, rep, ab, &dst, stride);
}

// 32x16

void svt_aom_highbd_smooth_v_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_32x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 32x32

void svt_aom_highbd_smooth_v_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_32x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// 32x64

void svt_aom_highbd_smooth_v_predictor_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[4], rep[4];
    (void)bd;

    smooth_v_init_32(above, left, 64, ab, rep);

    for (int32_t i = 0; i < 8; i++)
        smooth_v_pred_32x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
}

// -----------------------------------------------------------------------------
// 64xN

static INLINE void smooth_v_init_64(const uint16_t *const above, const uint16_t *const left,
                                    const int32_t h, __m256i *const ab, __m256i *const rep) {
    const __m256i b = _mm256_set1_epi16((uint16_t)left[h - 1]);
    smooth_v_prepare_ab(above + 0x00, b, ab + 0);
    smooth_v_prepare_ab(above + 0x10, b, ab + 2);
    smooth_v_prepare_ab(above + 0x20, b, ab + 4);
    smooth_v_prepare_ab(above + 0x30, b, ab + 6);

    rep[0] = _mm256_set1_epi32(0x03020100);
    rep[1] = _mm256_set1_epi32(0x07060504);
    rep[2] = _mm256_set1_epi32(0x0B0A0908);
    rep[3] = _mm256_set1_epi32(0x0F0E0D0C);
}

static INLINE void smooth_v_pred_64(const __m256i weights, const __m256i rep,
                                    const __m256i *const ab, uint16_t **const dst,
                                    const ptrdiff_t stride) {
    __m256i d;

    // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    d = smooth_v_pred_kernel(weights, rep, ab + 0);
    _mm256_storeu_si256((__m256i *)(*dst + 0x00), d);

    // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
    d = smooth_v_pred_kernel(weights, rep, ab + 2);
    _mm256_storeu_si256((__m256i *)(*dst + 0x10), d);

    // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
    d = smooth_v_pred_kernel(weights, rep, ab + 4);
    _mm256_storeu_si256((__m256i *)(*dst + 0x20), d);

    // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
    d = smooth_v_pred_kernel(weights, rep, ab + 6);
    _mm256_storeu_si256((__m256i *)(*dst + 0x30), d);
    *dst += stride;
}
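
// A 64-pixel row costs four kernel calls and four 32-byte stores; dst is
// advanced by stride only once, after the whole row has been written.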

static INLINE void smooth_v_pred_64x4(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    const __m256i weights = _mm256_loadu_si256((const __m256i *)sm_weights_h);
    smooth_v_pred_64(weights, rep[0], ab, dst, stride);
    smooth_v_pred_64(weights, rep[1], ab, dst, stride);
    smooth_v_pred_64(weights, rep[2], ab, dst, stride);
    smooth_v_pred_64(weights, rep[3], ab, dst, stride);
}

static INLINE void smooth_v_pred_64x8(const uint16_t *const sm_weights_h, const __m256i *const rep,
                                      const __m256i *const ab, uint16_t **const dst,
                                      const ptrdiff_t stride) {
    smooth_v_pred_64x4(sm_weights_h + 0, rep, ab, dst, stride);
    smooth_v_pred_64x4(sm_weights_h + 16, rep, ab, dst, stride);
}

// 64x16

void svt_aom_highbd_smooth_v_predictor_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[8], rep[4];
    (void)bd;

    smooth_v_init_64(above, left, 16, ab, rep);

    for (int32_t i = 0; i < 2; i++)
        smooth_v_pred_64x8(sm_weights_d_16 + 32 * i, rep, ab, &dst, stride);
}

// 64x32

void svt_aom_highbd_smooth_v_predictor_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[8], rep[4];
    (void)bd;

    smooth_v_init_64(above, left, 32, ab, rep);

    for (int32_t i = 0; i < 4; i++)
        smooth_v_pred_64x8(sm_weights_d_32 + 32 * i, rep, ab, &dst, stride);
}

// 64x64

void svt_aom_highbd_smooth_v_predictor_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                                  const uint16_t *above, const uint16_t *left,
                                                  int32_t bd) {
    __m256i ab[8], rep[4];
    (void)bd;

    smooth_v_init_64(above, left, 64, ab, rep);

    for (int32_t i = 0; i < 8; i++)
        smooth_v_pred_64x8(sm_weights_d_64 + 32 * i, rep, ab, &dst, stride);
}
