/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * https://www.aomedia.org/license/patent-license.
 */

#include <immintrin.h>

#include "common_dsp_rtcd.h"

#include "convolve.h"
#include "convolve_avx2.h"
// #include "aom_ports/mem.h"

#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256((__m128i const *)&(x))
#else // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // gcc <= 4.6
#else // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__

typedef void Filter81dFunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
                               ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d4_v8_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                             uint8_t *output_ptr, ptrdiff_t out_pitch,
                                             uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d16_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                               uint8_t *output_ptr, ptrdiff_t out_pitch,
                                               uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d16_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                               uint8_t *output_ptr, ptrdiff_t out_pitch,
                                               uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d8_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                              uint8_t *output_ptr, ptrdiff_t out_pitch,
                                              uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d8_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                              uint8_t *output_ptr, ptrdiff_t out_pitch,
                                              uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d4_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                              uint8_t *output_ptr, ptrdiff_t out_pitch,
                                              uint32_t output_height, const int16_t *filter);
void         svt_aom_filter_block1d4_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                              uint8_t *output_ptr, ptrdiff_t out_pitch,
                                              uint32_t output_height, const int16_t *filter);

Filter81dFunction svt_aom_filter_block1d4_v8_ssse3;
Filter81dFunction svt_aom_filter_block1d16_v2_ssse3;
Filter81dFunction svt_aom_filter_block1d16_h2_ssse3;
Filter81dFunction svt_aom_filter_block1d8_v2_ssse3;
Filter81dFunction svt_aom_filter_block1d8_h2_ssse3;
Filter81dFunction svt_aom_filter_block1d4_v2_ssse3;
Filter81dFunction svt_aom_filter_block1d4_h2_ssse3;
#define svt_aom_filter_block1d4_v8_avx2 svt_aom_filter_block1d4_v8_sse2
#define svt_aom_filter_block1d16_v2_avx2 svt_aom_filter_block1d16_v2_ssse3
#define svt_aom_filter_block1d16_h2_avx2 svt_aom_filter_block1d16_h2_ssse3
#define svt_aom_filter_block1d8_v2_avx2 svt_aom_filter_block1d8_v2_ssse3
#define svt_aom_filter_block1d8_h2_avx2 svt_aom_filter_block1d8_h2_ssse3
#define svt_aom_filter_block1d4_v2_avx2 svt_aom_filter_block1d4_v2_ssse3
#define svt_aom_filter_block1d4_h2_avx2 svt_aom_filter_block1d4_h2_ssse3

#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)                               \
    void svt_aom_convolve8_##name##_##opt(const uint8_t *src,                                      \
                                          ptrdiff_t      src_stride,                               \
                                          uint8_t *      dst,                                      \
                                          ptrdiff_t      dst_stride,                               \
                                          const int16_t *filter_x,                                 \
                                          int            x_step_q4,                                \
                                          const int16_t *filter_y,                                 \
                                          int            y_step_q4,                                \
                                          int            w,                                        \
                                          int            h) {                                      \
        (void)filter_x;                                                                            \
        (void)x_step_q4;                                                                           \
        (void)filter_y;                                                                            \
        (void)y_step_q4;                                                                           \
        assert((-128 <= filter[3]) && (filter[3] <= 127));                                         \
        assert(step_q4 == 16);                                                                     \
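        /* 4-tap path: the four outer taps (0, 1, 6, 7) are zero but taps 2/5 are not */           \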
        if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && (filter[2] | filter[5])) {   \
            while (w >= 16) {                                                                      \
                svt_aom_filter_block1d16_##dir##4_##avg##opt(                                      \
                    src_start, src_stride, dst, dst_stride, h, filter);                            \
                src += 16;                                                                         \
                dst += 16;                                                                         \
                w -= 16;                                                                           \
            }                                                                                      \
            while (w >= 8) {                                                                       \
                svt_aom_filter_block1d8_##dir##4_##avg##opt(                                       \
                    src_start, src_stride, dst, dst_stride, h, filter);                            \
                src += 8;                                                                          \
                dst += 8;                                                                          \
                w -= 8;                                                                            \
            }                                                                                      \
            while (w >= 4) {                                                                       \
                svt_aom_filter_block1d4_##dir##4_##avg##opt(                                       \
                    src_start, src_stride, dst, dst_stride, h, filter);                            \
                src += 4;                                                                          \
                dst += 4;                                                                          \
                w -= 4;                                                                            \
            }                                                                                      \
        } else if (filter[0] | filter[1] | filter[2]) {                                            \
            while (w >= 16) {                                                                      \
                svt_aom_filter_block1d16_##dir##8_##avg##opt(                                      \
                    src_start, src_stride, dst, dst_stride, h, filter);                            \
                src += 16;                                                                         \
                dst += 16;                                                                         \
                w -= 16;                                                                           \
            }                                                                                      \
            while (w >= 8) {                                                                       \
                svt_aom_filter_block1d8_##dir##8_##avg##opt(                                       \
                    src_start, src_stride, dst, dst_stride, h, filter);                            \
                src += 8;                                                                          \
                dst += 8;                                                                          \
                w -= 8;                                                                            \
            }                                                                                      \
            while (w >= 4) {                                                                       \
                svt_aom_filter_block1d4_##dir##8_##avg##opt(                                       \
                    src_start, src_stride, dst, dst_stride, h, filter);                            \
                src += 4;                                                                          \
                dst += 4;                                                                          \
                w -= 4;                                                                            \
            }                                                                                      \
        } else {                                                                                   \
            while (w >= 16) {                                                                      \
                svt_aom_filter_block1d16_##dir##2_##avg##opt(                                      \
                    src, src_stride, dst, dst_stride, h, filter);                                  \
                src += 16;                                                                         \
                dst += 16;                                                                         \
                w -= 16;                                                                           \
            }                                                                                      \
            while (w >= 8) {                                                                       \
                svt_aom_filter_block1d8_##dir##2_##avg##opt(                                       \
                    src, src_stride, dst, dst_stride, h, filter);                                  \
                src += 8;                                                                          \
                dst += 8;                                                                          \
                w -= 8;                                                                            \
            }                                                                                      \
            while (w >= 4) {                                                                       \
                svt_aom_filter_block1d4_##dir##2_##avg##opt(                                       \
                    src, src_stride, dst, dst_stride, h, filter);                                  \
                src += 4;                                                                          \
                dst += 4;                                                                          \
                w -= 4;                                                                            \
            }                                                                                      \
        }                                                                                          \
        if (w) {                                                                                   \
            svt_aom_convolve8_##name##_c(                                                          \
                src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
        }                                                                                          \
    }
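
// A typical pair of instantiations of FUN_CONV_1D (a sketch mirroring the
// libaom port these kernels derive from; the actual call sites may differ):
//   FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
//   FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);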

// shuffle masks for the horizontal filters
DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
    0,  1,  1, 2,  2,  3,  3,  4,  4, 5,  5,  6,  6,  7,  7,  8,  0,  1,  1,  2,  2,  3,
    3,  4,  4, 5,  5,  6,  6,  7,  7, 8,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
    8,  9,  9, 10, 2,  3,  3,  4,  4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 4,  5,
    5,  6,  6, 7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 4,  5,  5,  6,  6,  7,  7,  8,
    8,  9,  9, 10, 10, 11, 11, 12, 6, 7,  7,  8,  8,  9,  9,  10, 10, 11, 11, 12, 12, 13,
    13, 14, 6, 7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14};

DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
    0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
    3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
    7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
    2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};
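
// Each 32-byte row of these tables is a shuffle mask, duplicated across both
// 128-bit lanes, that gathers overlapping source bytes so _mm256_maddubs_epi16
// can multiply adjacent pixels by paired filter taps.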

static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, const ptrdiff_t stride,
                                    const __m256i *a) {
    *((uint32_t *)(output_ptr))          = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
    *((uint32_t *)(output_ptr + stride)) = _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
}

static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
    __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
    a         = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
    return a;
}

static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, const ptrdiff_t stride,
                                    const __m256i *a) {
    _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
    _mm_storel_epi64((__m128i *)(output_ptr + stride), _mm256_extractf128_si256(*a, 1));
}

static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
    __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
    a         = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
    return a;
}

static INLINE void xx_store2_mi128(const uint8_t *output_ptr, const ptrdiff_t stride,
                                   const __m256i *a) {
    _mm_storeu_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
    _mm_storeu_si128((__m128i *)(output_ptr + stride), _mm256_extractf128_si256(*a, 1));
}
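
// The helpers above load or store two rows at once, one row per 128-bit lane
// of a 256-bit register. A minimal usage sketch (hypothetical pointers src,
// dst and a stride):
//   __m256i rows = xx_loadu2_mi128(src + stride, src); // lane 0 = row 0, lane 1 = row 1
//   xx_store2_mi128(dst, stride, &rows);               // writes both rows back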

static void svt_aom_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                            uint8_t *output_ptr, ptrdiff_t output_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32, filt1_reg, first_filters, src_reg32b1, src_reg_filt32b1_1;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    filters_reg      = _mm_srai_epi16(filters_reg, 1);
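    // dividing the 16-bit coefficients by 2 keeps the filtered sums within
    // int16 range; this is compensated below by rounding with 32 and shifting
    // right by 6 (instead of 64 and 7)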
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    const __m256i filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

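    // the 0x05040302 shuffle mask below replicates packed taps 2..5, the four
    // middle coefficients used by this 4-tap path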
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi32(0x5040302u));
    filt1_reg     = _mm256_loadu_si256((__m256i const *)(filt4_d4_global_avx2));

    // multiply the size of the source and destination stride by two
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load 2 rows of source
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);

        // multiply 4 adjacent elements with the filter and add the result
        src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);

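        // horizontally add neighboring 16-bit pair sums to complete the 4-tap
        // dot product for each output pixel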
        src_reg_filt32b1_1 = _mm256_hadds_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve result
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());

        src_ptr += src_stride;

        xx_storeu2_epi32(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the output height is odd, process the last row (4 bytes)
    if (i > 0) {
        __m128i src_reg1, src_reg_filt1_1;

        src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

        // filter the source buffer
        src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));

        // multiply 4 adjacent elements with the filter and add the result
        src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));

        src_reg_filt1_1 = _mm_hadds_epi16(src_reg_filt1_1, _mm_setzero_si128());
        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);

        // pack each 16-bit value to 8 bits
        src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());

        // save 4 bytes
        *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(src_reg_filt1_1);
    }
}

static void svt_aom_filter_block1d4_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                            uint8_t *output_ptr, ptrdiff_t output_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32, filt1_reg, filt2_reg;
    __m256i      first_filters, second_filters;
    __m256i      src_reg_filt32b1_1, src_reg_Filt32b2;
    __m256i      src_reg32b1;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    filters_reg      = _mm_srai_epi16(filters_reg, 1);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    const __m256i filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 32 bits
    first_filters = _mm256_shuffle_epi32(filters_reg32, 0);
    // duplicate only the second 32 bits
    second_filters = _mm256_shuffle_epi32(filters_reg32, 0x55);
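    // first_filters now holds taps 0..3 and second_filters taps 4..7,
    // matching the four-consecutive-tap layout of the filt_d4 masks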

    filt1_reg = _mm256_loadu_si256((__m256i const *)filt_d4_global_avx2);
    filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_d4_global_avx2 + 32));

    // multiply the size of the source and destination stride by two
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load 2 rows of source
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);

        // multiply 4 adjacent elements with the filter and add the result
        src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);

        // filter the source buffer
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);

        // multiply 4 adjacent elements with the filter and add the result
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, second_filters);

        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, src_reg_Filt32b2);

        src_reg_filt32b1_1 = _mm256_hadds_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve result
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());

        src_ptr += src_stride;

        xx_storeu2_epi32(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the output height is odd, process the last row (4 bytes)
    if (i > 0) {
        __m128i src_reg1, src_reg_filt1_1;
        __m128i src_reg_filt2;

        src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

        // filter the source buffer
        src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));

        // multiply 4 adjacent elements with the filter and add the result
        src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));

        // filter the source buffer
        src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));

        // multiply 4 adjacent elements with the filter and add the result
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(second_filters));

        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, src_reg_filt2);
        src_reg_filt1_1 = _mm_hadds_epi16(src_reg_filt1_1, _mm_setzero_si128());
        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);

        // pack each 16-bit value to 8 bits
        src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());

        // save 4 bytes
        *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(src_reg_filt1_1);
    }
}

static void svt_aom_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                            uint8_t *output_ptr, ptrdiff_t output_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32, filt2_reg, filt3_reg;
    __m256i      second_filters, third_filters;
    __m256i      src_reg_filt32b1_1, src_reg_Filt32b2, src_reg_Filt32b3;
    __m256i      src_reg32b1, filters_reg32;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    filters_reg      = _mm_srai_epi16(filters_reg, 1);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));

    filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
    filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

    // multiply the size of the source and destination stride by two
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load 2 rows of source
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);

        // pack each 16-bit value to 8 bits
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, src_reg_filt32b1_1);

        src_ptr += src_stride;

        xx_storeu2_epi64(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the output height is odd, process the last row (8 bytes)
    if (i > 0) {
        __m128i src_reg1, src_reg_filt1_1;
        __m128i src_reg_filt2, src_reg_filt3;

        src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

        // filter the source buffer
        src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
        src_reg_filt3 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt3_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(second_filters));
        src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt2, src_reg_filt3);

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);

        // pack each 16-bit value to 8 bits
        src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());

        // save 8 bytes
        _mm_storel_epi64((__m128i *)output_ptr, src_reg_filt1_1);
    }
}

static void svt_aom_filter_block1d8_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                            uint8_t *output_ptr, ptrdiff_t output_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32, filt1_reg, filt2_reg, filt3_reg, filt4_reg;
    __m256i      first_filters, second_filters, third_filters, forth_filters;
    __m256i      src_reg_filt32b1_1, src_reg_Filt32b2, src_reg_Filt32b3;
    __m256i      src_reg32b1;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    filters_reg      = _mm_srai_epi16(filters_reg, 1);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    const __m256i filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 16 bits (first and second byte)
    // across the 256 bit register
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
    // duplicate only the fourth 16 bits (seventh and eighth byte)
    // across the 256 bit register
    forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));

    filt1_reg = _mm256_loadu_si256((__m256i const *)filt_global_avx2);
    filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
    filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt4_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    // multiply the size of the source and destination stride by two
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load 2 rows of source
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);
        src_reg_Filt32b2   = _mm256_shuffle_epi8(src_reg32b1, filt4_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);
        src_reg_Filt32b2   = _mm256_maddubs_epi16(src_reg_Filt32b2, forth_filters);

        // add and saturate the results together
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, src_reg_Filt32b2);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        __m256i sum23      = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, sum23);

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve result
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());

        src_ptr += src_stride;

        xx_storeu2_epi64(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the output height is odd, process the last row (8 bytes)
    if (i > 0) {
        __m128i src_reg1, src_reg_filt1_1;
        __m128i src_reg_filt2, src_reg_filt3;

        src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

        // filter the source buffer
        src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));
        src_reg_filt2   = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt4_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));
        src_reg_filt2   = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, src_reg_filt2);

        // filter the source buffer
        src_reg_filt3 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
        src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt3_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(second_filters));
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1,
                                         _mm_adds_epi16(src_reg_filt3, src_reg_filt2));

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);

        // pack each 16-bit value to 8 bits
        src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());

        // save 8 bytes
        _mm_storel_epi64((__m128i *)output_ptr, src_reg_filt1_1);
    }
}

static void svt_aom_filter_block1d16_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                             uint8_t *output_ptr, ptrdiff_t output_pitch,
                                             uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32, filt2_reg, filt3_reg;
    __m256i      second_filters, third_filters;
    __m256i      src_reg_filt32b1_1, src_reg_Filt32b2_1, src_reg_Filt32b2, src_reg_Filt32b3;
    __m256i      src_reg32b1, src_reg_32b2, filters_reg32;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    filters_reg      = _mm_srai_epi16(filters_reg, 1);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));

    filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
    filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

    // multiply the size of the source and destination stride by two
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load 2 rows of source
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);

        // read 2 rows of the next 16 bytes
        // (part of it was already read by the earlier load)
        src_reg_32b2 = xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg_32b2, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg_32b2, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        // add and saturate the results together
        src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b2_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
        src_reg_Filt32b2_1 = _mm256_srai_epi16(src_reg_Filt32b2_1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve result
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, src_reg_Filt32b2_1);

        src_ptr += src_stride;

        xx_store2_mi128(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the output height is odd, process the last row (16 bytes)
    if (i > 0) {
        __m256i src_reg1, src_reg12;
        __m256i src_reg_filt2, src_reg_filt3, src_reg_filt1_1;

        src_reg1  = _mm256_loadu_si256((const __m256i *)(src_ptr));
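        // 0x94 = (q0, q1, q1, q2): duplicate the middle 8 bytes so the upper
        // lane holds source bytes 8..23 for the second half of the 16 outputs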
        src_reg12 = _mm256_permute4x64_epi64(src_reg1, 0x94);

        // filter the source buffer
        src_reg_filt2 = _mm256_shuffle_epi8(src_reg12, filt2_reg);
        src_reg_filt3 = _mm256_shuffle_epi8(src_reg12, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt2 = _mm256_maddubs_epi16(src_reg_filt2, second_filters);
        src_reg_filt3 = _mm256_maddubs_epi16(src_reg_filt3, third_filters);

        // add and saturate the results together
        src_reg_filt1_1 = _mm256_adds_epi16(src_reg_filt2, src_reg_filt3);

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm256_adds_epi16(src_reg_filt1_1, add_filter_reg32);
        src_reg_filt1_1 = _mm256_srai_epi16(src_reg_filt1_1, 6);

        // pack each 16-bit value to 8 bits
        src_reg_filt1_1 = _mm256_packus_epi16(src_reg_filt1_1, src_reg_filt1_1);
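        // 0x08 = (q0, q2, q0, q0): gather the low 8 result bytes of each lane
        // into the low 128 bits so all 16 outputs are contiguous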
        src_reg_filt1_1 = _mm256_permute4x64_epi64(src_reg_filt1_1, 0x8);

        // save 16 bytes
        _mm_storeu_si128((__m128i *)output_ptr, _mm256_castsi256_si128(src_reg_filt1_1));
    }
}

static void svt_aom_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                             uint8_t *output_ptr, ptrdiff_t output_pitch,
                                             uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32, filt1_reg, filt2_reg, filt3_reg, filt4_reg;
    __m256i      first_filters, second_filters, third_filters, forth_filters;
    __m256i      src_reg_filt32b1_1, src_reg_Filt32b2_1, src_reg_Filt32b2, src_reg_Filt32b3;
    __m256i      src_reg32b1, src_reg_32b2, filters_reg32;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    filters_reg      = _mm_srai_epi16(filters_reg, 1);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 16 bits (first and second byte)
    // across the 256 bit register
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
    // duplicate only the fourth 16 bits (seventh and eighth byte)
    // across the 256 bit register
    forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));

    filt1_reg = _mm256_loadu_si256((__m256i const *)filt_global_avx2);
    filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
    filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt4_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    // multiply the size of the source and destination stride by two
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load 2 rows of source
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);
        src_reg_Filt32b2   = _mm256_shuffle_epi8(src_reg32b1, filt4_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);
        src_reg_Filt32b2   = _mm256_maddubs_epi16(src_reg_Filt32b2, forth_filters);

        // add and saturate the results together
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, src_reg_Filt32b2);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        __m256i sum23      = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, sum23);

        // read 2 rows of the next 16 bytes
        // (part of it was already read by the earlier load)
        src_reg_32b2 = xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

        // filter the source buffer
        src_reg_Filt32b2_1 = _mm256_shuffle_epi8(src_reg_32b2, filt1_reg);
        src_reg_Filt32b2   = _mm256_shuffle_epi8(src_reg_32b2, filt4_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b2_1 = _mm256_maddubs_epi16(src_reg_Filt32b2_1, first_filters);
        src_reg_Filt32b2   = _mm256_maddubs_epi16(src_reg_Filt32b2, forth_filters);

        // add and saturate the results together
        src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b2_1, src_reg_Filt32b2);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg_32b2, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg_32b2, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        // add and saturate the results together
        src_reg_Filt32b2_1 = _mm256_adds_epi16(
            src_reg_Filt32b2_1, _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2));

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b2_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
        src_reg_Filt32b2_1 = _mm256_srai_epi16(src_reg_Filt32b2_1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve result
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, src_reg_Filt32b2_1);

        src_ptr += src_stride;

        xx_store2_mi128(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the output height is odd, process the last row (16 bytes)
    if (i > 0) {
        __m128i src_reg1, src_reg2, src_reg_filt1_1, src_reg_filt2_1;
        __m128i src_reg_filt2, src_reg_filt3;

        src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

        // filter the source buffer
        src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));
        src_reg_filt2   = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt4_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));
        src_reg_filt2   = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, src_reg_filt2);

        // filter the source buffer
        src_reg_filt3 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
        src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt3_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(second_filters));
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1,
                                         _mm_adds_epi16(src_reg_filt3, src_reg_filt2));

        // read the next 16 bytes
        // (part of it was already read by the earlier load)
        src_reg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

        // filter the source buffer
        src_reg_filt2_1 = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt1_reg));
        src_reg_filt2   = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt4_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt2_1 = _mm_maddubs_epi16(src_reg_filt2_1, _mm256_castsi256_si128(first_filters));
        src_reg_filt2   = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt2_1 = _mm_adds_epi16(src_reg_filt2_1, src_reg_filt2);

        // filter the source buffer
        src_reg_filt3 = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt2_reg));
        src_reg_filt2 = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt3_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(second_filters));
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt2_1 = _mm_adds_epi16(src_reg_filt2_1,
                                         _mm_adds_epi16(src_reg_filt3, src_reg_filt2));

        // add rounding offset and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);

        src_reg_filt2_1 = _mm_adds_epi16(src_reg_filt2_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt2_1 = _mm_srai_epi16(src_reg_filt2_1, 6);

        // pack each 16-bit value to 8 bits; the low half holds the first 8
        // results and the high half holds the second 8 results
        src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, src_reg_filt2_1);

        // save 16 bytes
        _mm_storeu_si128((__m128i *)output_ptr, src_reg_filt1_1);
    }
}

static void svt_aom_filter_block1d8_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                            uint8_t *output_ptr, ptrdiff_t out_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      filters_reg32, add_filter_reg32;
    __m256i      src_reg23, src_reg_4x, src_reg_34, src_reg_5x, src_reg_45, src_reg_6x, src_reg_56;
    __m256i      src_reg_23_34_lo, src_reg_45_56_lo;
    __m256i      res_reg23_34_lo, res_reg45_56_lo;
    __m256i      res_reg_lo, res_reg;
    __m256i      second_filters, third_filters;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;

    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
    // same data in both halves of the 128-bit register
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));

    // multiply the size of the source and destination stride by two
    src_stride = src_pitch << 1;
    dst_stride = out_pitch << 1;

    src_reg23  = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    src_reg_4x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

    // have consecutive loads in the same 256 bit register
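    // 0x21 selects the upper lane of src_reg23 (row 3) and the lower lane of
    // src_reg_4x (row 4), yielding [row 3 | row 4]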
    src_reg_34 = _mm256_permute2x128_si256(src_reg23, src_reg_4x, 0x21);

951     src_reg_23_34_lo = _mm256_unpacklo_epi8(src_reg23, src_reg_34);
952 
953     for (i = output_height; i > 1; i -= 2) {
954         // load the last 2 loads of 16 bytes and have every two
955         // consecutive loads in the same 256 bit register
956         src_reg_5x = _mm256_castsi128_si256(
957             _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
958         src_reg_45 = _mm256_inserti128_si256(src_reg_4x, _mm256_castsi256_si128(src_reg_5x), 1);
959 
960         src_reg_6x = _mm256_castsi128_si256(
961             _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
962         src_reg_56 = _mm256_inserti128_si256(src_reg_5x, _mm256_castsi256_si128(src_reg_6x), 1);
963 
964         // merge every two consecutive registers
965         src_reg_45_56_lo = _mm256_unpacklo_epi8(src_reg_45, src_reg_56);
966 
967         // multiply 2 adjacent elements with the filter and add the result
968         res_reg23_34_lo = _mm256_maddubs_epi16(src_reg_23_34_lo, second_filters);
969         res_reg45_56_lo = _mm256_maddubs_epi16(src_reg_45_56_lo, third_filters);
970 
971         // add and saturate the results together
972         res_reg_lo = _mm256_adds_epi16(res_reg23_34_lo, res_reg45_56_lo);
973 
974         // shift by 6 bit each 16 bit
975         res_reg_lo = _mm256_adds_epi16(res_reg_lo, add_filter_reg32);
976         res_reg_lo = _mm256_srai_epi16(res_reg_lo, 6);
977 
978         // shrink to 8 bit each 16 bits, the first lane contain the first
979         // convolve result and the second lane contain the second convolve
980         // result
981         res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_lo);
982 
983         src_ptr += src_stride;
984 
985         xx_storeu2_epi64(output_ptr, out_pitch, &res_reg);
986 
987         output_ptr += dst_stride;
988 
989         // save part of the registers for next strides
990         src_reg_23_34_lo = src_reg_45_56_lo;
991         src_reg_4x       = src_reg_6x;
992     }
993 }
994 
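/*
 * Full 8-tap vertical filter on 8-wide columns. Every 256-bit register
 * carries two consecutive rows, one per 128-bit lane, so each loop
 * iteration emits two output rows; a 128-bit epilogue handles the final
 * row when output_height is odd. Scalar sketch of the first output pixel,
 * under the same conventions as the sketch above (coefficients pre-halved,
 * clip_u8() hypothetical):
 *
 *   int sum = 0;
 *   for (int k = 0; k < 8; ++k)
 *       sum += (filter[k] >> 1) * src_ptr[k * src_pitch];
 *   *output_ptr = clip_u8((sum + 32) >> 6);
 */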
static void svt_aom_filter_block1d8_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                            uint8_t *output_ptr, ptrdiff_t out_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32;
    __m256i      src_reg32b1, src_reg_32b2, src_reg_32b3, src_reg_32b4, src_reg_32b5;
    __m256i      src_reg_32b6, src_reg_32b7, src_reg_32b8, src_reg_32b9, src_reg_32b10;
    __m256i      src_reg_32b11, src_reg_32b12, filters_reg32;
    __m256i      first_filters, second_filters, third_filters, forth_filters;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;

    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
    // same data in both lanes of the 128-bit register
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256-bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 16 bits (first and second byte)
    // across the 256-bit register
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256-bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256-bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
    // duplicate only the fourth 16 bits (seventh and eighth byte)
    // across the 256-bit register
    forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));

    // two rows are processed per iteration, so double both strides
    src_stride = src_pitch << 1;
    dst_stride = out_pitch << 1;

    // load 8 bytes from 7 rows, each src_pitch apart
    src_reg32b1  = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
    src_reg_32b3 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    src_reg_32b5 = xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
    src_reg_32b7 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));

    // place each pair of consecutive rows in the same 256-bit register
    // (0x21 selects the high lane of the first operand and the low lane of
    // the second)
    src_reg_32b2 = _mm256_permute2x128_si256(src_reg32b1, src_reg_32b3, 0x21);
    src_reg_32b4 = _mm256_permute2x128_si256(src_reg_32b3, src_reg_32b5, 0x21);
    src_reg_32b6 = _mm256_permute2x128_si256(src_reg_32b5, src_reg_32b7, 0x21);
    // merge every two consecutive registers except the last one
    src_reg_32b10 = _mm256_unpacklo_epi8(src_reg32b1, src_reg_32b2);
    src_reg_32b11 = _mm256_unpacklo_epi8(src_reg_32b3, src_reg_32b4);
    src_reg_32b2  = _mm256_unpacklo_epi8(src_reg_32b5, src_reg_32b6);

    for (i = output_height; i > 1; i -= 2) {
        // load the next two rows of 8 bytes and keep every two
        // consecutive rows in the same 256-bit register
        src_reg_32b8 = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
        src_reg_32b7 = _mm256_inserti128_si256(
            src_reg_32b7, _mm256_castsi256_si128(src_reg_32b8), 1);
        src_reg_32b9 = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
        src_reg_32b8 = _mm256_inserti128_si256(
            src_reg_32b8, _mm256_castsi256_si128(src_reg_32b9), 1);

        // merge every two consecutive registers
        src_reg_32b4 = _mm256_unpacklo_epi8(src_reg_32b7, src_reg_32b8);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b10 = _mm256_maddubs_epi16(src_reg_32b10, first_filters);
        src_reg_32b6  = _mm256_maddubs_epi16(src_reg_32b4, forth_filters);

        // add and saturate the results together
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, src_reg_32b6);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b8  = _mm256_maddubs_epi16(src_reg_32b11, second_filters);
        src_reg_32b12 = _mm256_maddubs_epi16(src_reg_32b2, third_filters);

        // add and saturate the results together
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10,
                                          _mm256_adds_epi16(src_reg_32b8, src_reg_32b12));

        // round and shift each 16-bit value right by 6 bits
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, add_filter_reg32);
        src_reg_32b10 = _mm256_srai_epi16(src_reg_32b10, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve
        // result
        src_reg32b1 = _mm256_packus_epi16(src_reg_32b10, _mm256_setzero_si256());

        src_ptr += src_stride;

        xx_storeu2_epi64(output_ptr, out_pitch, &src_reg32b1);

        output_ptr += dst_stride;

        // carry part of the registers over to the next iteration
        src_reg_32b10 = src_reg_32b11;
        src_reg_32b11 = src_reg_32b2;
        src_reg_32b2  = src_reg_32b4;
        src_reg_32b7  = src_reg_32b9;
    }
    // handle the remaining row when output_height is odd
    if (i > 0) {
        __m128i src_reg_filt1, src_reg_filt4, src_reg_filt6, src_reg_filt8;
        // load the last 8 bytes
        src_reg_filt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

        // merge the last 2 results together
        src_reg_filt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(src_reg_32b7), src_reg_filt8);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b10),
                                          _mm256_castsi256_si128(first_filters));
        src_reg_filt4 = _mm_maddubs_epi16(src_reg_filt4, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, src_reg_filt4);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b11),
                                          _mm256_castsi256_si128(second_filters));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b2),
                                          _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm_adds_epi16(src_reg_filt4, src_reg_filt6));

        // round and shift each 16-bit value right by 6 bits
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1 = _mm_srai_epi16(src_reg_filt1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve result
        src_reg_filt1 = _mm_packus_epi16(src_reg_filt1, _mm_setzero_si128());

        // save 8 bytes
        _mm_storel_epi64((__m128i *)output_ptr, src_reg_filt1);
    }
}

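/*
 * 16-wide variant of the 4-tap vertical kernel above: same arithmetic, but
 * each 16-byte row is split by unpacklo/unpackhi into low and high halves
 * that are filtered independently and packed back together for the store.
 */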
static void svt_aom_filter_block1d16_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                             uint8_t *output_ptr, ptrdiff_t out_pitch,
                                             uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      filters_reg32, add_filter_reg32;
    __m256i      src_reg23, src_reg_4x, src_reg_34, src_reg_5x, src_reg_45, src_reg_6x, src_reg_56;
    __m256i      src_reg_23_34_lo, src_reg_23_34_hi, src_reg_45_56_lo, src_reg_45_56_hi;
    __m256i      res_reg23_34_lo, res_reg23_34_hi, res_reg45_56_lo, res_reg45_56_hi;
    __m256i      res_reg_lo, res_reg_hi, res_reg;
    __m256i      second_filters, third_filters;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;

    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
    // same data in both lanes of the 128-bit register
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256-bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256-bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256-bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));

    // two rows are processed per iteration, so double both strides
    src_stride = src_pitch << 1;
    dst_stride = out_pitch << 1;

    src_reg23  = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    src_reg_4x = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));

    // have consecutive loads in the same 256-bit register
    src_reg_34 = _mm256_permute2x128_si256(src_reg23, src_reg_4x, 0x21);

    src_reg_23_34_lo = _mm256_unpacklo_epi8(src_reg23, src_reg_34);
    src_reg_23_34_hi = _mm256_unpackhi_epi8(src_reg23, src_reg_34);

    for (i = output_height; i > 1; i -= 2) {
        // load the next two rows of 16 bytes and keep every two
        // consecutive rows in the same 256-bit register
        src_reg_5x = _mm256_castsi128_si256(
            _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
        src_reg_45 = _mm256_inserti128_si256(src_reg_4x, _mm256_castsi256_si128(src_reg_5x), 1);

        src_reg_6x = _mm256_castsi128_si256(
            _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
        src_reg_56 = _mm256_inserti128_si256(src_reg_5x, _mm256_castsi256_si128(src_reg_6x), 1);

        // merge every two consecutive registers
        src_reg_45_56_lo = _mm256_unpacklo_epi8(src_reg_45, src_reg_56);
        src_reg_45_56_hi = _mm256_unpackhi_epi8(src_reg_45, src_reg_56);

        // multiply 2 adjacent elements with the filter and add the result
        res_reg23_34_lo = _mm256_maddubs_epi16(src_reg_23_34_lo, second_filters);
        res_reg45_56_lo = _mm256_maddubs_epi16(src_reg_45_56_lo, third_filters);

        // add and saturate the results together
        res_reg_lo = _mm256_adds_epi16(res_reg23_34_lo, res_reg45_56_lo);

        // multiply 2 adjacent elements with the filter and add the result
        res_reg23_34_hi = _mm256_maddubs_epi16(src_reg_23_34_hi, second_filters);
        res_reg45_56_hi = _mm256_maddubs_epi16(src_reg_45_56_hi, third_filters);

        // add and saturate the results together
        res_reg_hi = _mm256_adds_epi16(res_reg23_34_hi, res_reg45_56_hi);

        // round and shift each 16-bit value right by 6 bits
        res_reg_lo = _mm256_adds_epi16(res_reg_lo, add_filter_reg32);
        res_reg_hi = _mm256_adds_epi16(res_reg_hi, add_filter_reg32);
        res_reg_lo = _mm256_srai_epi16(res_reg_lo, 6);
        res_reg_hi = _mm256_srai_epi16(res_reg_hi, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve
        // result
        res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi);

        src_ptr += src_stride;

        xx_store2_mi128(output_ptr, out_pitch, &res_reg);

        output_ptr += dst_stride;

        // carry part of the registers over to the next iteration
        src_reg_23_34_lo = src_reg_45_56_lo;
        src_reg_23_34_hi = src_reg_45_56_hi;
        src_reg_4x       = src_reg_6x;
    }
}

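/*
 * 16-wide, 8-tap vertical kernel: the same two-rows-per-iteration scheme as
 * svt_aom_filter_block1d8_v8_avx2, duplicated over the low and high halves
 * of each 16-byte row, with a 128-bit epilogue for an odd output_height.
 */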
static void svt_aom_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                             uint8_t *output_ptr, ptrdiff_t out_pitch,
                                             uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      add_filter_reg32;
    __m256i      src_reg32b1, src_reg_32b2, src_reg_32b3, src_reg_32b4, src_reg_32b5;
    __m256i      src_reg_32b6, src_reg_32b7, src_reg_32b8, src_reg_32b9, src_reg_32b10;
    __m256i      src_reg_32b11, src_reg_32b12, filters_reg32;
    __m256i      first_filters, second_filters, third_filters, forth_filters;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;

    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
    // same data in both lanes of the 128-bit register
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256-bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 16 bits (first and second byte)
    // across the 256-bit register
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
    // duplicate only the second 16 bits (third and fourth byte)
    // across the 256-bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across the 256-bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
    // duplicate only the fourth 16 bits (seventh and eighth byte)
    // across the 256-bit register
    forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));

    // two rows are processed per iteration, so double both strides
    src_stride = src_pitch << 1;
    dst_stride = out_pitch << 1;

    // load 16 bytes from 7 rows, each src_pitch apart
    src_reg32b1  = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
    src_reg_32b3 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    src_reg_32b5 = xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
    src_reg_32b7 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

    // place each pair of consecutive rows in the same 256-bit register
    src_reg_32b2 = _mm256_permute2x128_si256(src_reg32b1, src_reg_32b3, 0x21);
    src_reg_32b4 = _mm256_permute2x128_si256(src_reg_32b3, src_reg_32b5, 0x21);
    src_reg_32b6 = _mm256_permute2x128_si256(src_reg_32b5, src_reg_32b7, 0x21);
    // merge every two consecutive registers except the last one
    src_reg_32b10 = _mm256_unpacklo_epi8(src_reg32b1, src_reg_32b2);
    src_reg32b1   = _mm256_unpackhi_epi8(src_reg32b1, src_reg_32b2);

    src_reg_32b11 = _mm256_unpacklo_epi8(src_reg_32b3, src_reg_32b4);
    src_reg_32b3  = _mm256_unpackhi_epi8(src_reg_32b3, src_reg_32b4);
    src_reg_32b2  = _mm256_unpacklo_epi8(src_reg_32b5, src_reg_32b6);
    src_reg_32b5  = _mm256_unpackhi_epi8(src_reg_32b5, src_reg_32b6);

    for (i = output_height; i > 1; i -= 2) {
        // load the next two rows of 16 bytes and keep every two
        // consecutive rows in the same 256-bit register
        src_reg_32b8 = _mm256_castsi128_si256(
            _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
        src_reg_32b7 = _mm256_inserti128_si256(
            src_reg_32b7, _mm256_castsi256_si128(src_reg_32b8), 1);
        src_reg_32b9 = _mm256_castsi128_si256(
            _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
        src_reg_32b8 = _mm256_inserti128_si256(
            src_reg_32b8, _mm256_castsi256_si128(src_reg_32b9), 1);

        // merge every two consecutive registers
        src_reg_32b4 = _mm256_unpacklo_epi8(src_reg_32b7, src_reg_32b8);
        src_reg_32b7 = _mm256_unpackhi_epi8(src_reg_32b7, src_reg_32b8);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b10 = _mm256_maddubs_epi16(src_reg_32b10, first_filters);
        src_reg_32b6  = _mm256_maddubs_epi16(src_reg_32b4, forth_filters);

        // add and saturate the results together
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, src_reg_32b6);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b8  = _mm256_maddubs_epi16(src_reg_32b11, second_filters);
        src_reg_32b12 = _mm256_maddubs_epi16(src_reg_32b2, third_filters);

        // add and saturate the results together
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10,
                                          _mm256_adds_epi16(src_reg_32b8, src_reg_32b12));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg32b1  = _mm256_maddubs_epi16(src_reg32b1, first_filters);
        src_reg_32b6 = _mm256_maddubs_epi16(src_reg_32b7, forth_filters);

        src_reg32b1 = _mm256_adds_epi16(src_reg32b1, src_reg_32b6);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b8  = _mm256_maddubs_epi16(src_reg_32b3, second_filters);
        src_reg_32b12 = _mm256_maddubs_epi16(src_reg_32b5, third_filters);

        // add and saturate the results together
        src_reg32b1 = _mm256_adds_epi16(src_reg32b1,
                                        _mm256_adds_epi16(src_reg_32b8, src_reg_32b12));

        // round and shift each 16-bit value right by 6 bits
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, add_filter_reg32);
        src_reg32b1   = _mm256_adds_epi16(src_reg32b1, add_filter_reg32);
        src_reg_32b10 = _mm256_srai_epi16(src_reg_32b10, 6);
        src_reg32b1   = _mm256_srai_epi16(src_reg32b1, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve
        // result
        src_reg32b1 = _mm256_packus_epi16(src_reg_32b10, src_reg32b1);

        src_ptr += src_stride;

        xx_store2_mi128(output_ptr, out_pitch, &src_reg32b1);

        output_ptr += dst_stride;

        // carry part of the registers over to the next iteration
        src_reg_32b10 = src_reg_32b11;
        src_reg32b1   = src_reg_32b3;
        src_reg_32b11 = src_reg_32b2;
        src_reg_32b3  = src_reg_32b5;
        src_reg_32b2  = src_reg_32b4;
        src_reg_32b5  = src_reg_32b7;
        src_reg_32b7  = src_reg_32b9;
    }
    // handle the remaining row when output_height is odd
    if (i > 0) {
        __m128i src_reg_filt1, src_reg_filt3, src_reg_filt4, src_reg_filt5;
        __m128i src_reg_filt6, src_reg_filt7, src_reg_filt8;
        // load the last 16 bytes
        src_reg_filt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

        // merge the last 2 results together
        src_reg_filt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(src_reg_32b7), src_reg_filt8);
        src_reg_filt7 = _mm_unpackhi_epi8(_mm256_castsi256_si128(src_reg_32b7), src_reg_filt8);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b10),
                                          _mm256_castsi256_si128(first_filters));
        src_reg_filt4 = _mm_maddubs_epi16(src_reg_filt4, _mm256_castsi256_si128(forth_filters));
        src_reg_filt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg32b1),
                                          _mm256_castsi256_si128(first_filters));
        src_reg_filt7 = _mm_maddubs_epi16(src_reg_filt7, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, src_reg_filt4);
        src_reg_filt3 = _mm_adds_epi16(src_reg_filt3, src_reg_filt7);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b11),
                                          _mm256_castsi256_si128(second_filters));
        src_reg_filt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b3),
                                          _mm256_castsi256_si128(second_filters));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b2),
                                          _mm256_castsi256_si128(third_filters));
        src_reg_filt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b5),
                                          _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm_adds_epi16(src_reg_filt4, src_reg_filt6));
        src_reg_filt3 = _mm_adds_epi16(src_reg_filt3, _mm_adds_epi16(src_reg_filt5, src_reg_filt7));

        // round and shift each 16-bit value right by 6 bits
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt3 = _mm_adds_epi16(src_reg_filt3, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1 = _mm_srai_epi16(src_reg_filt1, 6);
        src_reg_filt3 = _mm_srai_epi16(src_reg_filt3, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve
        // result
        src_reg_filt1 = _mm_packus_epi16(src_reg_filt1, src_reg_filt3);

        // save 16 bytes
        _mm_storeu_si128((__m128i *)output_ptr, src_reg_filt1);
    }
}

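/*
 * 4-wide, 4-tap vertical kernel. After the epi8 and epi16 unpacks, the four
 * source bytes that feed one output pixel sit in consecutive byte positions,
 * so a single maddubs against (filter[2], filter[3], filter[4], filter[5])
 * plus one horizontal add produces the whole 4-tap sum. Scalar sketch for
 * the first output pixel (clip_u8() hypothetical, coefficients pre-halved):
 *
 *   int sum = 0;
 *   for (int k = 2; k <= 5; ++k)
 *       sum += (filter[k] >> 1) * src_ptr[k * src_pitch];
 *   *output_ptr = clip_u8((sum + 32) >> 6);
 */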
static void svt_aom_filter_block1d4_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                            uint8_t *output_ptr, ptrdiff_t out_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i      filters_reg;
    __m256i      filters_reg32, add_filter_reg32;
    __m256i      src_reg23, src_reg_4x, src_reg_34, src_reg_5x, src_reg_45, src_reg_6x, src_reg_56;
    __m256i      src_reg_23_34_lo, src_reg_45_56_lo;
    __m256i      src_reg_2345_3456_lo;
    __m256i      res_reg_lo, res_reg;
    __m256i      first_filters;
    unsigned int i;
    ptrdiff_t    src_stride, dst_stride;

    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg      = _mm_loadu_si128((const __m128i *)filter);
    // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
    // same data in both lanes of the 128-bit register
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256-bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // replicate coefficients 2..5 as four consecutive bytes in every 32-bit
    // element
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi32(0x5040302u));

    // two rows are processed per iteration, so double both strides
    src_stride = src_pitch << 1;
    dst_stride = out_pitch << 1;

    src_reg23  = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    src_reg_4x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

    // have consecutive loads in the same 256-bit register
    src_reg_34 = _mm256_permute2x128_si256(src_reg23, src_reg_4x, 0x21);

    src_reg_23_34_lo = _mm256_unpacklo_epi8(src_reg23, src_reg_34);

    for (i = output_height; i > 1; i -= 2) {
        // load the next two rows of 8 bytes and keep every two
        // consecutive rows in the same 256-bit register
        src_reg_5x = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
        src_reg_45 = _mm256_inserti128_si256(src_reg_4x, _mm256_castsi256_si128(src_reg_5x), 1);

        src_reg_6x = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
        src_reg_56 = _mm256_inserti128_si256(src_reg_5x, _mm256_castsi256_si128(src_reg_6x), 1);

        // merge every two consecutive registers
        src_reg_45_56_lo = _mm256_unpacklo_epi8(src_reg_45, src_reg_56);

        // gather the four taps of each output pixel into consecutive bytes
        src_reg_2345_3456_lo = _mm256_unpacklo_epi16(src_reg_23_34_lo, src_reg_45_56_lo);

        // multiply 2 adjacent elements with the filter and add the result
        res_reg_lo = _mm256_maddubs_epi16(src_reg_2345_3456_lo, first_filters);

        // horizontally add the two partial sums to finish the 4-tap filter
        res_reg_lo = _mm256_hadds_epi16(res_reg_lo, _mm256_setzero_si256());

        // round and shift each 16-bit value right by 6 bits
        res_reg_lo = _mm256_adds_epi16(res_reg_lo, add_filter_reg32);
        res_reg_lo = _mm256_srai_epi16(res_reg_lo, 6);

        // pack each 16-bit value to 8 bits; the first lane contains the first
        // convolve result and the second lane contains the second convolve
        // result
        res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_lo);

        src_ptr += src_stride;

        xx_storeu2_epi32(output_ptr, out_pitch, &res_reg);

        output_ptr += dst_stride;

        // carry part of the registers over to the next iteration
        src_reg_23_34_lo = src_reg_45_56_lo;
        src_reg_4x       = src_reg_6x;
    }
}

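/*
 * FUN_CONV_1D (defined in convolve.h) presumably expands to the exported
 * convolve entry points that dispatch onto the block kernels above by block
 * width and filter taps. The vertical instantiation is handed
 * src - src_stride * 3 so the kernels see the source starting at the top
 * tap of the 8-tap window.
 */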
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);