1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
#include <immintrin.h>
#include <string.h>

#include "common_dsp_rtcd.h"

#include "convolve.h"
#include "convolve_avx2.h"
// #include "aom_ports/mem.h"
19
20 #if defined(__clang__)
21 #if (__clang_major__ > 0 && __clang_major__ < 3) || \
22 (__clang_major__ == 3 && __clang_minor__ <= 3) || \
23 (defined(__APPLE__) && defined(__apple_build_version__) && \
24 ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
25 (__clang_major__ == 5 && __clang_minor__ == 0)))
26 #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256((__m128i const *)&(x))
27 #else // clang > 3.3, and not 5.0 on macosx.
28 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
29 #endif // clang <= 3.3
30 #elif defined(__GNUC__)
31 #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
32 #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256((__m128i const *)&(x))
33 #elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
34 #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
35 #else // gcc > 4.7
36 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
37 #endif // gcc <= 4.6
38 #else // !(gcc || clang)
39 #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
40 #endif // __clang__
41
42 typedef void Filter81dFunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
43 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter);
44 void svt_aom_filter_block1d4_v8_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
45 uint8_t *output_ptr, ptrdiff_t out_pitch,
46 uint32_t output_height, const int16_t *filter);
47 void svt_aom_filter_block1d16_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
48 uint8_t *output_ptr, ptrdiff_t out_pitch,
49 uint32_t output_height, const int16_t *filter);
50 void svt_aom_filter_block1d16_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
51 uint8_t *output_ptr, ptrdiff_t out_pitch,
52 uint32_t output_height, const int16_t *filter);
53 void svt_aom_filter_block1d8_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
54 uint8_t *output_ptr, ptrdiff_t out_pitch,
55 uint32_t output_height, const int16_t *filter);
56 void svt_aom_filter_block1d8_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
57 uint8_t *output_ptr, ptrdiff_t out_pitch,
58 uint32_t output_height, const int16_t *filter);
59 void svt_aom_filter_block1d4_v2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
60 uint8_t *output_ptr, ptrdiff_t out_pitch,
61 uint32_t output_height, const int16_t *filter);
62 void svt_aom_filter_block1d4_h2_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
63 uint8_t *output_ptr, ptrdiff_t out_pitch,
64 uint32_t output_height, const int16_t *filter);
65
66 Filter81dFunction svt_aom_filter_block1d4_v8_ssse3;
67 Filter81dFunction svt_aom_filter_block1d16_v2_ssse3;
68 Filter81dFunction svt_aom_filter_block1d16_h2_ssse3;
69 Filter81dFunction svt_aom_filter_block1d8_v2_ssse3;
70 Filter81dFunction svt_aom_filter_block1d8_h2_ssse3;
71 Filter81dFunction svt_aom_filter_block1d4_v2_ssse3;
72 Filter81dFunction svt_aom_filter_block1d4_h2_ssse3;
73 #define svt_aom_filter_block1d4_v8_avx2 svt_aom_filter_block1d4_v8_sse2
74 #define svt_aom_filter_block1d16_v2_avx2 svt_aom_filter_block1d16_v2_ssse3
75 #define svt_aom_filter_block1d16_h2_avx2 svt_aom_filter_block1d16_h2_ssse3
76 #define svt_aom_filter_block1d8_v2_avx2 svt_aom_filter_block1d8_v2_ssse3
77 #define svt_aom_filter_block1d8_h2_avx2 svt_aom_filter_block1d8_h2_ssse3
78 #define svt_aom_filter_block1d4_v2_avx2 svt_aom_filter_block1d4_v2_ssse3
79 #define svt_aom_filter_block1d4_h2_avx2 svt_aom_filter_block1d4_h2_ssse3
80
81 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
82 void svt_aom_convolve8_##name##_##opt(const uint8_t *src, \
83 ptrdiff_t src_stride, \
84 uint8_t * dst, \
85 ptrdiff_t dst_stride, \
86 const int16_t *filter_x, \
87 int x_step_q4, \
88 const int16_t *filter_y, \
89 int y_step_q4, \
90 int w, \
91 int h) { \
92 (void)filter_x; \
93 (void)x_step_q4; \
94 (void)filter_y; \
95 (void)y_step_q4; \
96 assert((-128 <= filter[3]) && (filter[3] <= 127)); \
97 assert(step_q4 == 16); \
98 if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && (filter[2] | filter[5])) { \
99 while (w >= 16) { \
100 svt_aom_filter_block1d16_##dir##4_##avg##opt( \
101 src_start, src_stride, dst, dst_stride, h, filter); \
102 src += 16; \
103 dst += 16; \
104 w -= 16; \
105 } \
106 while (w >= 8) { \
107 svt_aom_filter_block1d8_##dir##4_##avg##opt( \
108 src_start, src_stride, dst, dst_stride, h, filter); \
109 src += 8; \
110 dst += 8; \
111 w -= 8; \
112 } \
113 while (w >= 4) { \
114 svt_aom_filter_block1d4_##dir##4_##avg##opt( \
115 src_start, src_stride, dst, dst_stride, h, filter); \
116 src += 4; \
117 dst += 4; \
118 w -= 4; \
119 } \
120 } else if (filter[0] | filter[1] | filter[2]) { \
121 while (w >= 16) { \
122 svt_aom_filter_block1d16_##dir##8_##avg##opt( \
123 src_start, src_stride, dst, dst_stride, h, filter); \
124 src += 16; \
125 dst += 16; \
126 w -= 16; \
127 } \
128 while (w >= 8) { \
129 svt_aom_filter_block1d8_##dir##8_##avg##opt( \
130 src_start, src_stride, dst, dst_stride, h, filter); \
131 src += 8; \
132 dst += 8; \
133 w -= 8; \
134 } \
135 while (w >= 4) { \
136 svt_aom_filter_block1d4_##dir##8_##avg##opt( \
137 src_start, src_stride, dst, dst_stride, h, filter); \
138 src += 4; \
139 dst += 4; \
140 w -= 4; \
141 } \
142 } else { \
143 while (w >= 16) { \
144 svt_aom_filter_block1d16_##dir##2_##avg##opt( \
145 src, src_stride, dst, dst_stride, h, filter); \
146 src += 16; \
147 dst += 16; \
148 w -= 16; \
149 } \
150 while (w >= 8) { \
151 svt_aom_filter_block1d8_##dir##2_##avg##opt( \
152 src, src_stride, dst, dst_stride, h, filter); \
153 src += 8; \
154 dst += 8; \
155 w -= 8; \
156 } \
157 while (w >= 4) { \
158 svt_aom_filter_block1d4_##dir##2_##avg##opt( \
159 src, src_stride, dst, dst_stride, h, filter); \
160 src += 4; \
161 dst += 4; \
162 w -= 4; \
163 } \
164 } \
165 if (w) { \
166 svt_aom_convolve8_##name##_c( \
167 src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
168 } \
169 }
170
171 // filters for 16
172 DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
173 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
174 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
175 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 4, 5,
176 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8,
177 8, 9, 9, 10, 10, 11, 11, 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
178 13, 14, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
179
180 DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
181 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
182 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
183 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
184 };
185
186 DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
187 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
188 };
189
xx_storeu2_epi32(const uint8_t * output_ptr,const ptrdiff_t stride,const __m256i * a)190 static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, const ptrdiff_t stride,
191 const __m256i *a) {
192 *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
193 *((uint32_t *)(output_ptr + stride)) = _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
194 }
195
xx_loadu2_epi64(const void * hi,const void * lo)196 static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
197 __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
198 a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
199 return a;
200 }
201
xx_storeu2_epi64(const uint8_t * output_ptr,const ptrdiff_t stride,const __m256i * a)202 static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, const ptrdiff_t stride,
203 const __m256i *a) {
204 _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
205 _mm_storel_epi64((__m128i *)(output_ptr + stride), _mm256_extractf128_si256(*a, 1));
206 }
207
xx_loadu2_mi128(const void * hi,const void * lo)208 static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
209 __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
210 a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
211 return a;
212 }
213
xx_store2_mi128(const uint8_t * output_ptr,const ptrdiff_t stride,const __m256i * a)214 static INLINE void xx_store2_mi128(const uint8_t *output_ptr, const ptrdiff_t stride,
215 const __m256i *a) {
216 _mm_storeu_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
217 _mm_storeu_si128((__m128i *)(output_ptr + stride), _mm256_extractf128_si256(*a, 1));
218 }
219
svt_aom_filter_block1d4_h4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)220 static void svt_aom_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
221 uint8_t *output_ptr, ptrdiff_t output_pitch,
222 uint32_t output_height, const int16_t *filter) {
223 __m128i filters_reg;
224 __m256i add_filter_reg32, filt1_reg, first_filters, src_reg32b1, src_reg_filt32b1_1;
225 unsigned int i;
226 ptrdiff_t src_stride, dst_stride;
227 src_ptr -= 3;
228 add_filter_reg32 = _mm256_set1_epi16(32);
229 filters_reg = _mm_loadu_si128((const __m128i *)filter);
230 filters_reg = _mm_srai_epi16(filters_reg, 1);
231 // converting the 16 bit (short) to 8 bit (byte) and have the same data
232 // in both lanes of 128 bit register.
233 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
234 // have the same data in both lanes of a 256 bit register
235 const __m256i filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
236
237 first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi32(0x5040302u));
238 filt1_reg = _mm256_loadu_si256((__m256i const *)(filt4_d4_global_avx2));
239
240 // multiple the size of the source and destination stride by two
241 src_stride = src_pixels_per_line << 1;
242 dst_stride = output_pitch << 1;
243 for (i = output_height; i > 1; i -= 2) {
244 // load the 2 strides of source
245 src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
246
247 // filter the source buffer
248 src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);
249
250 // multiply 4 adjacent elements with the filter and add the result
251 src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);
252
253 src_reg_filt32b1_1 = _mm256_hadds_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());
254
255 // shift by 6 bit each 16 bit
256 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
257 src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
258
259 // shrink to 8 bit each 16 bits, the first lane contain the first
260 // convolve result and the second lane contain the second convolve result
261 src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());
262
263 src_ptr += src_stride;
264
265 xx_storeu2_epi32(output_ptr, output_pitch, &src_reg_filt32b1_1);
266 output_ptr += dst_stride;
267 }
268
269 // if the number of strides is odd.
270 // process only 4 bytes
271 if (i > 0) {
272 __m128i src_reg1, src_reg_filt1_1;
273
274 src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
275
276 // filter the source buffer
277 src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));
278
279 // multiply 4 adjacent elements with the filter and add the result
280 src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));
281
282 src_reg_filt1_1 = _mm_hadds_epi16(src_reg_filt1_1, _mm_setzero_si128());
283 // shift by 6 bit each 16 bit
284 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
285 src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);
286
287 // shrink to 8 bit each 16 bits, the first lane contain the first
288 // convolve result and the second lane contain the second convolve result
289 src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());
290
291 // save 4 bytes
292 *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(src_reg_filt1_1);
293 }
294 }
295
svt_aom_filter_block1d4_h8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)296 static void svt_aom_filter_block1d4_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
297 uint8_t *output_ptr, ptrdiff_t output_pitch,
298 uint32_t output_height, const int16_t *filter) {
299 __m128i filters_reg;
300 __m256i add_filter_reg32, filt1_reg, filt2_reg;
301 __m256i first_filters, second_filters;
302 __m256i src_reg_filt32b1_1, src_reg_Filt32b2;
303 __m256i src_reg32b1;
304 unsigned int i;
305 ptrdiff_t src_stride, dst_stride;
306 src_ptr -= 3;
307 add_filter_reg32 = _mm256_set1_epi16(32);
308 filters_reg = _mm_loadu_si128((const __m128i *)filter);
309 filters_reg = _mm_srai_epi16(filters_reg, 1);
310 // converting the 16 bit (short) to 8 bit (byte) and have the same data
311 // in both lanes of 128 bit register.
312 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
313 // have the same data in both lanes of a 256 bit register
314 const __m256i filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
315
316 // duplicate only the first 32 bits
317 first_filters = _mm256_shuffle_epi32(filters_reg32, 0);
318 // duplicate only the second 32 bits
319 second_filters = _mm256_shuffle_epi32(filters_reg32, 0x55);
320
321 filt1_reg = _mm256_loadu_si256((__m256i const *)filt_d4_global_avx2);
322 filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_d4_global_avx2 + 32));
323
324 // multiple the size of the source and destination stride by two
325 src_stride = src_pixels_per_line << 1;
326 dst_stride = output_pitch << 1;
327 for (i = output_height; i > 1; i -= 2) {
328 // load the 2 strides of source
329 src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
330
331 // filter the source buffer
332 src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);
333
334 // multiply 4 adjacent elements with the filter and add the result
335 src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);
336
337 // filter the source buffer
338 src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
339
340 // multiply 4 adjacent elements with the filter and add the result
341 src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, second_filters);
342
343 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, src_reg_Filt32b2);
344
345 src_reg_filt32b1_1 = _mm256_hadds_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());
346
347 // shift by 6 bit each 16 bit
348 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
349 src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
350
351 // shrink to 8 bit each 16 bits, the first lane contain the first
352 // convolve result and the second lane contain the second convolve result
353 src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());
354
355 src_ptr += src_stride;
356
357 xx_storeu2_epi32(output_ptr, output_pitch, &src_reg_filt32b1_1);
358 output_ptr += dst_stride;
359 }
360
361 // if the number of strides is odd.
362 // process only 4 bytes
363 if (i > 0) {
364 __m128i src_reg1, src_reg_filt1_1;
365 __m128i src_reg_filt2;
366
367 src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
368
369 // filter the source buffer
370 src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));
371
372 // multiply 4 adjacent elements with the filter and add the result
373 src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));
374
375 // filter the source buffer
376 src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
377
378 // multiply 4 adjacent elements with the filter and add the result
379 src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(second_filters));
380
381 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, src_reg_filt2);
382 src_reg_filt1_1 = _mm_hadds_epi16(src_reg_filt1_1, _mm_setzero_si128());
383 // shift by 6 bit each 16 bit
384 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
385 src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);
386
387 // shrink to 8 bit each 16 bits, the first lane contain the first
388 // convolve result and the second lane contain the second convolve result
389 src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());
390
391 // save 4 bytes
392 *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(src_reg_filt1_1);
393 }
394 }
395
svt_aom_filter_block1d8_h4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)396 static void svt_aom_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
397 uint8_t *output_ptr, ptrdiff_t output_pitch,
398 uint32_t output_height, const int16_t *filter) {
399 __m128i filters_reg;
400 __m256i add_filter_reg32, filt2_reg, filt3_reg;
401 __m256i second_filters, third_filters;
402 __m256i src_reg_filt32b1_1, src_reg_Filt32b2, src_reg_Filt32b3;
403 __m256i src_reg32b1, filters_reg32;
404 unsigned int i;
405 ptrdiff_t src_stride, dst_stride;
406 src_ptr -= 3;
407 add_filter_reg32 = _mm256_set1_epi16(32);
408 filters_reg = _mm_loadu_si128((const __m128i *)filter);
409 filters_reg = _mm_srai_epi16(filters_reg, 1);
410 // converting the 16 bit (short) to 8 bit (byte) and have the same data
411 // in both lanes of 128 bit register.
412 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
413 // have the same data in both lanes of a 256 bit register
414 filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
415
416 // duplicate only the second 16 bits (third and forth byte)
417 // across 256 bit register
418 second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
419 // duplicate only the third 16 bits (fifth and sixth byte)
420 // across 256 bit register
421 third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
422
423 filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
424 filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
425
426 // multiply the size of the source and destination stride by two
427 src_stride = src_pixels_per_line << 1;
428 dst_stride = output_pitch << 1;
429 for (i = output_height; i > 1; i -= 2) {
430 // load the 2 strides of source
431 src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
432
433 // filter the source buffer
434 src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
435 src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);
436
437 // multiply 2 adjacent elements with the filter and add the result
438 src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
439 src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);
440
441 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
442
443 // shift by 6 bit each 16 bit
444 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
445 src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
446
447 // shrink to 8 bit each 16 bits
448 src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, src_reg_filt32b1_1);
449
450 src_ptr += src_stride;
451
452 xx_storeu2_epi64(output_ptr, output_pitch, &src_reg_filt32b1_1);
453 output_ptr += dst_stride;
454 }
455
456 // if the number of strides is odd.
457 // process only 8 bytes
458 if (i > 0) {
459 __m128i src_reg1, src_reg_filt1_1;
460 __m128i src_reg_filt2, src_reg_filt3;
461
462 src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
463
464 // filter the source buffer
465 src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
466 src_reg_filt3 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt3_reg));
467
468 // multiply 2 adjacent elements with the filter and add the result
469 src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(second_filters));
470 src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(third_filters));
471
472 // add and saturate the results together
473 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt2, src_reg_filt3);
474
475 // shift by 6 bit each 16 bit
476 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
477 src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);
478
479 // shrink to 8 bit each 16 bits
480 src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());
481
482 // save 8 bytes
483 _mm_storel_epi64((__m128i *)output_ptr, src_reg_filt1_1);
484 }
485 }
486
svt_aom_filter_block1d8_h8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)487 static void svt_aom_filter_block1d8_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
488 uint8_t *output_ptr, ptrdiff_t output_pitch,
489 uint32_t output_height, const int16_t *filter) {
490 __m128i filters_reg;
491 __m256i add_filter_reg32, filt1_reg, filt2_reg, filt3_reg, filt4_reg;
492 __m256i first_filters, second_filters, third_filters, forth_filters;
493 __m256i src_reg_filt32b1_1, src_reg_Filt32b2, src_reg_Filt32b3;
494 __m256i src_reg32b1;
495 unsigned int i;
496 ptrdiff_t src_stride, dst_stride;
497 src_ptr -= 3;
498 add_filter_reg32 = _mm256_set1_epi16(32);
499 filters_reg = _mm_loadu_si128((const __m128i *)filter);
500 filters_reg = _mm_srai_epi16(filters_reg, 1);
501 // converting the 16 bit (short) to 8 bit (byte) and have the same data
502 // in both lanes of 128 bit register.
503 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
504 // have the same data in both lanes of a 256 bit register
505 const __m256i filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
506
507 // duplicate only the first 16 bits (first and second byte)
508 // across 256 bit register
509 first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
510 // duplicate only the second 16 bits (third and forth byte)
511 // across 256 bit register
512 second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
513 // duplicate only the third 16 bits (fifth and sixth byte)
514 // across 256 bit register
515 third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
516 // duplicate only the forth 16 bits (seventh and eighth byte)
517 // across 256 bit register
518 forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));
519
520 filt1_reg = _mm256_loadu_si256((__m256i const *)filt_global_avx2);
521 filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
522 filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
523 filt4_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
524
525 // multiple the size of the source and destination stride by two
526 src_stride = src_pixels_per_line << 1;
527 dst_stride = output_pitch << 1;
528 for (i = output_height; i > 1; i -= 2) {
529 // load the 2 strides of source
530 src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
531
532 // filter the source buffer
533 src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);
534 src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt4_reg);
535
536 // multiply 2 adjacent elements with the filter and add the result
537 src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);
538 src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, forth_filters);
539
540 // add and saturate the results together
541 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, src_reg_Filt32b2);
542
543 // filter the source buffer
544 src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
545 src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);
546
547 // multiply 2 adjacent elements with the filter and add the result
548 src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
549 src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);
550
551 __m256i sum23 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
552 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, sum23);
553
554 // shift by 6 bit each 16 bit
555 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
556 src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
557
558 // shrink to 8 bit each 16 bits, the first lane contain the first
559 // convolve result and the second lane contain the second convolve result
560 src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, _mm256_setzero_si256());
561
562 src_ptr += src_stride;
563
564 xx_storeu2_epi64(output_ptr, output_pitch, &src_reg_filt32b1_1);
565 output_ptr += dst_stride;
566 }
567
568 // if the number of strides is odd.
569 // process only 8 bytes
570 if (i > 0) {
571 __m128i src_reg1, src_reg_filt1_1;
572 __m128i src_reg_filt2, src_reg_filt3;
573
574 src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
575
576 // filter the source buffer
577 src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));
578 src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt4_reg));
579
580 // multiply 2 adjacent elements with the filter and add the result
581 src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));
582 src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(forth_filters));
583
584 // add and saturate the results together
585 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, src_reg_filt2);
586
587 // filter the source buffer
588 src_reg_filt3 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
589 src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt3_reg));
590
591 // multiply 2 adjacent elements with the filter and add the result
592 src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(second_filters));
593 src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(third_filters));
594
595 // add and saturate the results together
596 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1,
597 _mm_adds_epi16(src_reg_filt3, src_reg_filt2));
598
599 // shift by 6 bit each 16 bit
600 src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
601 src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);
602
603 // shrink to 8 bit each 16 bits, the first lane contain the first
604 // convolve result and the second lane contain the second convolve
605 // result
606 src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, _mm_setzero_si128());
607
608 // save 8 bytes
609 _mm_storel_epi64((__m128i *)output_ptr, src_reg_filt1_1);
610 }
611 }
612
svt_aom_filter_block1d16_h4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)613 static void svt_aom_filter_block1d16_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
614 uint8_t *output_ptr, ptrdiff_t output_pitch,
615 uint32_t output_height, const int16_t *filter) {
616 __m128i filters_reg;
617 __m256i add_filter_reg32, filt2_reg, filt3_reg;
618 __m256i second_filters, third_filters;
619 __m256i src_reg_filt32b1_1, src_reg_Filt32b2_1, src_reg_Filt32b2, src_reg_Filt32b3;
620 __m256i src_reg32b1, src_reg_32b2, filters_reg32;
621 unsigned int i;
622 ptrdiff_t src_stride, dst_stride;
623 src_ptr -= 3;
624 add_filter_reg32 = _mm256_set1_epi16(32);
625 filters_reg = _mm_loadu_si128((const __m128i *)filter);
626 filters_reg = _mm_srai_epi16(filters_reg, 1);
627 // converting the 16 bit (short) to 8 bit (byte) and have the same data
628 // in both lanes of 128 bit register.
629 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
630 // have the same data in both lanes of a 256 bit register
631 filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
632
633 // duplicate only the second 16 bits (third and forth byte)
634 // across 256 bit register
635 second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
636 // duplicate only the third 16 bits (fifth and sixth byte)
637 // across 256 bit register
638 third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
639
640 filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
641 filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
642
643 // multiply the size of the source and destination stride by two
644 src_stride = src_pixels_per_line << 1;
645 dst_stride = output_pitch << 1;
646 for (i = output_height; i > 1; i -= 2) {
647 // load the 2 strides of source
648 src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
649
650 // filter the source buffer
651 src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
652 src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);
653
654 // multiply 2 adjacent elements with the filter and add the result
655 src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
656 src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);
657
658 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
659
660 // reading 2 strides of the next 16 bytes
661 // (part of it was being read by earlier read)
662 src_reg_32b2 = xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
663
664 // filter the source buffer
665 src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg_32b2, filt2_reg);
666 src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg_32b2, filt3_reg);
667
668 // multiply 2 adjacent elements with the filter and add the result
669 src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
670 src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);
671
672 // add and saturate the results together
673 src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
674
675 // shift by 6 bit each 16 bit
676 src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
677 src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b2_1, add_filter_reg32);
678 src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
679 src_reg_Filt32b2_1 = _mm256_srai_epi16(src_reg_Filt32b2_1, 6);
680
681 // shrink to 8 bit each 16 bits, the first lane contain the first
682 // convolve result and the second lane contain the second convolve result
683 src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, src_reg_Filt32b2_1);
684
685 src_ptr += src_stride;
686
687 xx_store2_mi128(output_ptr, output_pitch, &src_reg_filt32b1_1);
688 output_ptr += dst_stride;
689 }
690
691 // if the number of strides is odd.
692 // process only 16 bytes
693 if (i > 0) {
694 __m256i src_reg1, src_reg12;
695 __m256i src_reg_filt2, src_reg_filt3, src_reg_filt1_1;
696
697 src_reg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
698 src_reg12 = _mm256_permute4x64_epi64(src_reg1, 0x94);
699
700 // filter the source buffer
701 src_reg_filt2 = _mm256_shuffle_epi8(src_reg12, filt2_reg);
702 src_reg_filt3 = _mm256_shuffle_epi8(src_reg12, filt3_reg);
703
704 // multiply 2 adjacent elements with the filter and add the result
705 src_reg_filt2 = _mm256_maddubs_epi16(src_reg_filt2, second_filters);
706 src_reg_filt3 = _mm256_maddubs_epi16(src_reg_filt3, third_filters);
707
708 // add and saturate the results together
709 src_reg_filt1_1 = _mm256_adds_epi16(src_reg_filt2, src_reg_filt3);
710
711 // shift by 6 bit each 16 bit
712 src_reg_filt1_1 = _mm256_adds_epi16(src_reg_filt1_1, add_filter_reg32);
713 src_reg_filt1_1 = _mm256_srai_epi16(src_reg_filt1_1, 6);
714
715 // shrink to 8 bit each 16 bits, the first lane contain the first
716 // convolve result and the second lane contain the second convolve
717 // result
718 src_reg_filt1_1 = _mm256_packus_epi16(src_reg_filt1_1, src_reg_filt1_1);
719 src_reg_filt1_1 = _mm256_permute4x64_epi64(src_reg_filt1_1, 0x8);
720
721 // save 16 bytes
722 _mm_storeu_si128((__m128i *)output_ptr, _mm256_castsi256_si128(src_reg_filt1_1));
723 }
724 }
725
/*
 * Horizontal 8-tap convolution of a 16-pixel-wide block (AVX2).
 *
 * src_ptr             - source pixels; the function rewinds 3 bytes so the
 *                       8-tap window covers taps -3..+4 around each pixel
 * src_pixels_per_line - source stride in bytes
 * output_ptr          - destination pixels
 * output_pitch        - destination stride in bytes
 * output_height       - number of output rows
 * filter              - 8 signed 16-bit filter taps
 *
 * Two rows are produced per main-loop iteration; an odd trailing row is
 * handled by the 128-bit tail. The taps are pre-shifted right by 1 (so the
 * byte-domain maddubs products fit), which is compensated by rounding with
 * +32 and shifting right by 6.
 */
static void svt_aom_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line,
                                             uint8_t *output_ptr, ptrdiff_t output_pitch,
                                             uint32_t output_height, const int16_t *filter) {
    __m128i filters_reg;
    __m256i add_filter_reg32, filt1_reg, filt2_reg, filt3_reg, filt4_reg;
    __m256i first_filters, second_filters, third_filters, forth_filters;
    __m256i src_reg_filt32b1_1, src_reg_Filt32b2_1, src_reg_Filt32b2, src_reg_Filt32b3;
    __m256i src_reg32b1, src_reg_32b2, filters_reg32;
    unsigned int i;
    ptrdiff_t src_stride, dst_stride;
    // Rewind so the 8-tap window is centered on the output pixel.
    src_ptr -= 3;
    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg = _mm_loadu_si128((const __m128i *)filter);
    // Halve the taps so the 8-bit multiply-accumulate below cannot overflow.
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    // converting the 16 bit (short) to 8 bit (byte) and have the same data
    // in both lanes of 128 bit register.
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 16 bits (first and second byte)
    // across 256 bit register
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
    // duplicate only the second 16 bits (third and fourth byte)
    // across 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
    // duplicate only the fourth 16 bits (seventh and eighth byte)
    // across 256 bit register
    forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));

    // Byte-shuffle masks that gather each tap pair's neighbors per pixel.
    filt1_reg = _mm256_loadu_si256((__m256i const *)filt_global_avx2);
    filt2_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32));
    filt3_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
    filt4_reg = _mm256_loadu_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

    // multiply the size of the source and destination stride by two
    // (two rows are consumed/produced per iteration)
    src_stride = src_pixels_per_line << 1;
    dst_stride = output_pitch << 1;
    for (i = output_height; i > 1; i -= 2) {
        // load the 2 strides of source (one row per 128-bit lane)
        src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

        // filter the source buffer
        src_reg_filt32b1_1 = _mm256_shuffle_epi8(src_reg32b1, filt1_reg);
        src_reg_Filt32b2   = _mm256_shuffle_epi8(src_reg32b1, filt4_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt32b1_1 = _mm256_maddubs_epi16(src_reg_filt32b1_1, first_filters);
        src_reg_Filt32b2   = _mm256_maddubs_epi16(src_reg_Filt32b2, forth_filters);

        // add and saturate the results together
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, src_reg_Filt32b2);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg32b1, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg32b1, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        // accumulate the middle-tap partial sums into the first 8 pixels
        __m256i sum23      = _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2);
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, sum23);

        // reading 2 strides of the next 16 bytes
        // (part of it was being read by earlier read)
        src_reg_32b2 = xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

        // filter the source buffer
        src_reg_Filt32b2_1 = _mm256_shuffle_epi8(src_reg_32b2, filt1_reg);
        src_reg_Filt32b2   = _mm256_shuffle_epi8(src_reg_32b2, filt4_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b2_1 = _mm256_maddubs_epi16(src_reg_Filt32b2_1, first_filters);
        src_reg_Filt32b2   = _mm256_maddubs_epi16(src_reg_Filt32b2, forth_filters);

        // add and saturate the results together
        src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b2_1, src_reg_Filt32b2);

        // filter the source buffer
        src_reg_Filt32b3 = _mm256_shuffle_epi8(src_reg_32b2, filt2_reg);
        src_reg_Filt32b2 = _mm256_shuffle_epi8(src_reg_32b2, filt3_reg);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_Filt32b3 = _mm256_maddubs_epi16(src_reg_Filt32b3, second_filters);
        src_reg_Filt32b2 = _mm256_maddubs_epi16(src_reg_Filt32b2, third_filters);

        // add and saturate the results together
        src_reg_Filt32b2_1 = _mm256_adds_epi16(
            src_reg_Filt32b2_1, _mm256_adds_epi16(src_reg_Filt32b3, src_reg_Filt32b2));

        // round (+32) and shift each 16-bit value right by 6
        src_reg_filt32b1_1 = _mm256_adds_epi16(src_reg_filt32b1_1, add_filter_reg32);
        src_reg_Filt32b2_1 = _mm256_adds_epi16(src_reg_Filt32b2_1, add_filter_reg32);
        src_reg_filt32b1_1 = _mm256_srai_epi16(src_reg_filt32b1_1, 6);
        src_reg_Filt32b2_1 = _mm256_srai_epi16(src_reg_Filt32b2_1, 6);

        // shrink to 8 bit each 16 bits, the first lane contain the first
        // convolve result and the second lane contain the second convolve result
        src_reg_filt32b1_1 = _mm256_packus_epi16(src_reg_filt32b1_1, src_reg_Filt32b2_1);

        src_ptr += src_stride;

        xx_store2_mi128(output_ptr, output_pitch, &src_reg_filt32b1_1);
        output_ptr += dst_stride;
    }

    // if the number of strides is odd.
    // process only 16 bytes (one row) with 128-bit operations
    if (i > 0) {
        __m128i src_reg1, src_reg2, src_reg_filt1_1, src_reg_filt2_1;
        __m128i src_reg_filt2, src_reg_filt3;

        src_reg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

        // filter the source buffer
        src_reg_filt1_1 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt1_reg));
        src_reg_filt2   = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt4_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt1_1 = _mm_maddubs_epi16(src_reg_filt1_1, _mm256_castsi256_si128(first_filters));
        src_reg_filt2   = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, src_reg_filt2);

        // filter the source buffer
        src_reg_filt3 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt2_reg));
        src_reg_filt2 = _mm_shuffle_epi8(src_reg1, _mm256_castsi256_si128(filt3_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(second_filters));
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1,
                                         _mm_adds_epi16(src_reg_filt3, src_reg_filt2));

        // reading the next 16 bytes
        // (part of it was being read by earlier read)
        src_reg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

        // filter the source buffer
        src_reg_filt2_1 = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt1_reg));
        src_reg_filt2   = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt4_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt2_1 = _mm_maddubs_epi16(src_reg_filt2_1, _mm256_castsi256_si128(first_filters));
        src_reg_filt2   = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt2_1 = _mm_adds_epi16(src_reg_filt2_1, src_reg_filt2);

        // filter the source buffer
        src_reg_filt3 = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt2_reg));
        src_reg_filt2 = _mm_shuffle_epi8(src_reg2, _mm256_castsi256_si128(filt3_reg));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt3 = _mm_maddubs_epi16(src_reg_filt3, _mm256_castsi256_si128(second_filters));
        src_reg_filt2 = _mm_maddubs_epi16(src_reg_filt2, _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt2_1 = _mm_adds_epi16(src_reg_filt2_1,
                                         _mm_adds_epi16(src_reg_filt3, src_reg_filt2));

        // round (+32) and shift each 16-bit value right by 6
        src_reg_filt1_1 = _mm_adds_epi16(src_reg_filt1_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1_1 = _mm_srai_epi16(src_reg_filt1_1, 6);

        src_reg_filt2_1 = _mm_adds_epi16(src_reg_filt2_1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt2_1 = _mm_srai_epi16(src_reg_filt2_1, 6);

        // shrink to 8 bit each 16 bits, the first lane contain the first
        // convolve result and the second lane contain the second convolve
        // result
        src_reg_filt1_1 = _mm_packus_epi16(src_reg_filt1_1, src_reg_filt2_1);

        // save 16 bytes
        _mm_storeu_si128((__m128i *)output_ptr, src_reg_filt1_1);
    }
}
910
/*
 * Vertical convolution of an 8-pixel-wide block using only the middle four
 * taps (pairs 2/3 and 4/5) of an 8-tap kernel (AVX2). Two output rows are
 * produced per loop iteration; the taps are halved up front, so results are
 * rounded with +32 and shifted right by 6.
 */
static void svt_aom_filter_block1d8_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                            uint8_t *output_ptr, ptrdiff_t out_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i coeffs_128;
    __m256i coeffs_256, round_offset;
    __m256i rows_23, row_4, rows_34, row_5, rows_45, row_6, rows_56;
    __m256i ilv_23_34, ilv_45_56;
    __m256i prod_23_34, prod_45_56;
    __m256i accum, result;
    __m256i taps_23, taps_45;
    unsigned int row;
    ptrdiff_t src_step, dst_step;

    round_offset = _mm256_set1_epi16(32);
    coeffs_128   = _mm_loadu_si128((const __m128i *)filter);
    // Halve the 16-bit taps, then narrow them to bytes with the data
    // duplicated in both halves of the 128-bit register.
    coeffs_128 = _mm_srai_epi16(coeffs_128, 1);
    coeffs_128 = _mm_packs_epi16(coeffs_128, coeffs_128);
    // Broadcast into both lanes of a 256-bit register.
    coeffs_256 = MM256_BROADCASTSI128_SI256(coeffs_128);

    // Splat tap pair 2/3 (bytes 2 and 3) across the whole register.
    taps_23 = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x302u));
    // Splat tap pair 4/5 (bytes 4 and 5) across the whole register.
    taps_45 = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x504u));

    // Two rows consumed and produced per iteration.
    src_step = src_pitch << 1;
    dst_step = out_pitch << 1;

    rows_23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    row_4   = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

    // Pair consecutive rows within a single 256-bit register.
    rows_34 = _mm256_permute2x128_si256(rows_23, row_4, 0x21);

    ilv_23_34 = _mm256_unpacklo_epi8(rows_23, rows_34);

    for (row = output_height; row > 1; row -= 2) {
        // Fetch the next two source rows so that each pair of consecutive
        // rows shares one 256-bit register.
        row_5   = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
        rows_45 = _mm256_inserti128_si256(row_4, _mm256_castsi256_si128(row_5), 1);

        row_6   = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
        rows_56 = _mm256_inserti128_si256(row_5, _mm256_castsi256_si128(row_6), 1);

        // Interleave each pair of consecutive rows byte-wise.
        ilv_45_56 = _mm256_unpacklo_epi8(rows_45, rows_56);

        // Multiply adjacent bytes by the taps and sum each pair.
        prod_23_34 = _mm256_maddubs_epi16(ilv_23_34, taps_23);
        prod_45_56 = _mm256_maddubs_epi16(ilv_45_56, taps_45);

        // Combine the partial sums with saturation.
        accum = _mm256_adds_epi16(prod_23_34, prod_45_56);

        // Round (+32) and shift each 16-bit value right by 6.
        accum = _mm256_adds_epi16(accum, round_offset);
        accum = _mm256_srai_epi16(accum, 6);

        // Saturate back down to bytes; lane 0 carries the first output row
        // and lane 1 the second.
        result = _mm256_packus_epi16(accum, accum);

        src_ptr += src_step;

        xx_storeu2_epi64(output_ptr, out_pitch, &result);

        output_ptr += dst_step;

        // Slide the interleaved window state for the next pair of rows.
        ilv_23_34 = ilv_45_56;
        row_4     = row_6;
    }
}
994
/*
 * Vertical 8-tap convolution of an 8-pixel-wide block (AVX2).
 *
 * src_ptr       - source pixels (the 8-tap window spans rows 0..7 relative
 *                 to src_ptr; the caller is expected to have rewound the
 *                 pointer accordingly)
 * src_pitch     - source stride in bytes
 * output_ptr    - destination pixels
 * out_pitch     - destination stride in bytes
 * output_height - number of output rows
 * filter        - 8 signed 16-bit filter taps
 *
 * Two rows per main-loop iteration; an odd trailing row is handled by the
 * 128-bit tail, which reuses the loop-carried registers. Taps are halved up
 * front, so the result is rounded with +32 and shifted right by 6.
 */
static void svt_aom_filter_block1d8_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                            uint8_t *output_ptr, ptrdiff_t out_pitch,
                                            uint32_t output_height, const int16_t *filter) {
    __m128i filters_reg;
    __m256i add_filter_reg32;
    __m256i src_reg32b1, src_reg_32b2, src_reg_32b3, src_reg_32b4, src_reg_32b5;
    __m256i src_reg_32b6, src_reg_32b7, src_reg_32b8, src_reg_32b9, src_reg_32b10;
    __m256i src_reg_32b11, src_reg_32b12, filters_reg32;
    __m256i first_filters, second_filters, third_filters, forth_filters;
    unsigned int i;
    ptrdiff_t src_stride, dst_stride;

    add_filter_reg32 = _mm256_set1_epi16(32);
    filters_reg = _mm_loadu_si128((const __m128i *)filter);
    // converting the 16 bit (short) to 8 bit (byte) and have the
    // same data in both lanes of 128 bit register.
    filters_reg = _mm_srai_epi16(filters_reg, 1);
    filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
    // have the same data in both lanes of a 256 bit register
    filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);

    // duplicate only the first 16 bits (first and second byte)
    // across 256 bit register
    first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
    // duplicate only the second 16 bits (third and fourth byte)
    // across 256 bit register
    second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
    // duplicate only the third 16 bits (fifth and sixth byte)
    // across 256 bit register
    third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
    // duplicate only the fourth 16 bits (seventh and eighth byte)
    // across 256 bit register
    forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));

    // multiply the size of the source and destination stride by two
    // (two rows are consumed/produced per iteration)
    src_stride = src_pitch << 1;
    dst_stride = out_pitch << 1;

    // load 8 bytes from each of the first 7 rows (stride of src_pitch)
    src_reg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
    src_reg_32b3 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    src_reg_32b5 = xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
    src_reg_32b7 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));

    // have each consecutive loads on the same 256 register
    src_reg_32b2 = _mm256_permute2x128_si256(src_reg32b1, src_reg_32b3, 0x21);
    src_reg_32b4 = _mm256_permute2x128_si256(src_reg_32b3, src_reg_32b5, 0x21);
    src_reg_32b6 = _mm256_permute2x128_si256(src_reg_32b5, src_reg_32b7, 0x21);
    // merge every two consecutive registers except the last one
    src_reg_32b10 = _mm256_unpacklo_epi8(src_reg32b1, src_reg_32b2);
    src_reg_32b11 = _mm256_unpacklo_epi8(src_reg_32b3, src_reg_32b4);
    src_reg_32b2 = _mm256_unpacklo_epi8(src_reg_32b5, src_reg_32b6);

    for (i = output_height; i > 1; i -= 2) {
        // load the last 2 rows (8 bytes each) and have every two
        // consecutive loads in the same 256 bit register
        src_reg_32b8 = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
        src_reg_32b7 = _mm256_inserti128_si256(
            src_reg_32b7, _mm256_castsi256_si128(src_reg_32b8), 1);
        src_reg_32b9 = _mm256_castsi128_si256(
            _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
        src_reg_32b8 = _mm256_inserti128_si256(
            src_reg_32b8, _mm256_castsi256_si128(src_reg_32b9), 1);

        // merge every two consecutive registers
        // save
        src_reg_32b4 = _mm256_unpacklo_epi8(src_reg_32b7, src_reg_32b8);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b10 = _mm256_maddubs_epi16(src_reg_32b10, first_filters);
        src_reg_32b6 = _mm256_maddubs_epi16(src_reg_32b4, forth_filters);

        // add and saturate the results together
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, src_reg_32b6);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_32b8 = _mm256_maddubs_epi16(src_reg_32b11, second_filters);
        src_reg_32b12 = _mm256_maddubs_epi16(src_reg_32b2, third_filters);

        // add and saturate the results together
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10,
                                          _mm256_adds_epi16(src_reg_32b8, src_reg_32b12));

        // round (+32) and shift each 16-bit value right by 6
        src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, add_filter_reg32);
        src_reg_32b10 = _mm256_srai_epi16(src_reg_32b10, 6);

        // shrink to 8 bit each 16 bits, the first lane contain the first
        // convolve result and the second lane contain the second convolve
        // result
        src_reg32b1 = _mm256_packus_epi16(src_reg_32b10, _mm256_setzero_si256());

        src_ptr += src_stride;

        xx_storeu2_epi64(output_ptr, out_pitch, &src_reg32b1);

        output_ptr += dst_stride;

        // save part of the registers for next strides
        src_reg_32b10 = src_reg_32b11;
        src_reg_32b11 = src_reg_32b2;
        src_reg_32b2 = src_reg_32b4;
        src_reg_32b7 = src_reg_32b9;
    }
    // odd trailing row: one more 8-byte row, computed with 128-bit ops
    if (i > 0) {
        __m128i src_reg_filt1, src_reg_filt4, src_reg_filt6, src_reg_filt8;
        // load the last row (8 bytes)
        src_reg_filt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

        // merge the last 2 rows together
        src_reg_filt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(src_reg_32b7), src_reg_filt8);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b10),
                                          _mm256_castsi256_si128(first_filters));
        src_reg_filt4 = _mm_maddubs_epi16(src_reg_filt4, _mm256_castsi256_si128(forth_filters));

        // add and saturate the results together
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, src_reg_filt4);

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b11),
                                          _mm256_castsi256_si128(second_filters));

        // multiply 2 adjacent elements with the filter and add the result
        src_reg_filt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b2),
                                          _mm256_castsi256_si128(third_filters));

        // add and saturate the results together
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm_adds_epi16(src_reg_filt4, src_reg_filt6));

        // round (+32) and shift each 16-bit value right by 6
        src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm256_castsi256_si128(add_filter_reg32));
        src_reg_filt1 = _mm_srai_epi16(src_reg_filt1, 6);

        // shrink to 8 bit each 16 bits; only the low 8 bytes are meaningful
        src_reg_filt1 = _mm_packus_epi16(src_reg_filt1, _mm_setzero_si128());

        // save 8 bytes
        _mm_storel_epi64((__m128i *)output_ptr, src_reg_filt1);
    }
}
1140
/*
 * Vertical convolution of a 16-pixel-wide block using only the middle four
 * taps (pairs 2/3 and 4/5) of an 8-tap kernel (AVX2). Two output rows are
 * produced per loop iteration; the taps are halved up front, so results are
 * rounded with +32 and shifted right by 6.
 */
static void svt_aom_filter_block1d16_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                             uint8_t *output_ptr, ptrdiff_t out_pitch,
                                             uint32_t output_height, const int16_t *filter) {
    __m128i coeffs_128;
    __m256i coeffs_256, round_offset;
    __m256i rows_23, row_4, rows_34, row_5, rows_45, row_6, rows_56;
    __m256i ilv_23_34_lo, ilv_23_34_hi, ilv_45_56_lo, ilv_45_56_hi;
    __m256i prod_23_34_lo, prod_23_34_hi, prod_45_56_lo, prod_45_56_hi;
    __m256i sum_lo, sum_hi, result;
    __m256i taps_23, taps_45;
    unsigned int row;
    ptrdiff_t src_step, dst_step;

    round_offset = _mm256_set1_epi16(32);
    coeffs_128   = _mm_loadu_si128((const __m128i *)filter);
    // Halve the 16-bit taps and narrow them to bytes, duplicating the data
    // in both halves of the 128-bit register.
    coeffs_128 = _mm_srai_epi16(coeffs_128, 1);
    coeffs_128 = _mm_packs_epi16(coeffs_128, coeffs_128);
    // Broadcast into both lanes of a 256-bit register.
    coeffs_256 = MM256_BROADCASTSI128_SI256(coeffs_128);

    // Splat tap pair 2/3 (bytes 2 and 3) across the whole register.
    taps_23 = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x302u));
    // Splat tap pair 4/5 (bytes 4 and 5) across the whole register.
    taps_45 = _mm256_shuffle_epi8(coeffs_256, _mm256_set1_epi16(0x504u));

    // Two rows consumed and produced per iteration.
    src_step = src_pitch << 1;
    dst_step = out_pitch << 1;

    rows_23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
    row_4   = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));

    // Pair consecutive rows within a single 256-bit register.
    rows_34 = _mm256_permute2x128_si256(rows_23, row_4, 0x21);

    ilv_23_34_lo = _mm256_unpacklo_epi8(rows_23, rows_34);
    ilv_23_34_hi = _mm256_unpackhi_epi8(rows_23, rows_34);

    for (row = output_height; row > 1; row -= 2) {
        // Fetch the next two source rows so that each pair of consecutive
        // rows shares one 256-bit register.
        row_5   = _mm256_castsi128_si256(
            _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
        rows_45 = _mm256_inserti128_si256(row_4, _mm256_castsi256_si128(row_5), 1);

        row_6   = _mm256_castsi128_si256(
            _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
        rows_56 = _mm256_inserti128_si256(row_5, _mm256_castsi256_si128(row_6), 1);

        // Interleave each pair of consecutive rows, low and high byte halves.
        ilv_45_56_lo = _mm256_unpacklo_epi8(rows_45, rows_56);
        ilv_45_56_hi = _mm256_unpackhi_epi8(rows_45, rows_56);

        // Low half: multiply adjacent bytes by the taps and sum each pair.
        prod_23_34_lo = _mm256_maddubs_epi16(ilv_23_34_lo, taps_23);
        prod_45_56_lo = _mm256_maddubs_epi16(ilv_45_56_lo, taps_45);

        // Combine the low-half partial sums with saturation.
        sum_lo = _mm256_adds_epi16(prod_23_34_lo, prod_45_56_lo);

        // High half: same multiply-and-pair-sum sequence.
        prod_23_34_hi = _mm256_maddubs_epi16(ilv_23_34_hi, taps_23);
        prod_45_56_hi = _mm256_maddubs_epi16(ilv_45_56_hi, taps_45);

        // Combine the high-half partial sums with saturation.
        sum_hi = _mm256_adds_epi16(prod_23_34_hi, prod_45_56_hi);

        // Round (+32) and shift each 16-bit value right by 6.
        sum_lo = _mm256_adds_epi16(sum_lo, round_offset);
        sum_hi = _mm256_adds_epi16(sum_hi, round_offset);
        sum_lo = _mm256_srai_epi16(sum_lo, 6);
        sum_hi = _mm256_srai_epi16(sum_hi, 6);

        // Saturate back down to bytes; lane 0 carries the first output row
        // and lane 1 the second.
        result = _mm256_packus_epi16(sum_lo, sum_hi);

        src_ptr += src_step;

        xx_store2_mi128(output_ptr, out_pitch, &result);

        output_ptr += dst_step;

        // Slide the interleaved window state for the next pair of rows.
        ilv_23_34_lo = ilv_45_56_lo;
        ilv_23_34_hi = ilv_45_56_hi;
        row_4        = row_6;
    }
}
1236
svt_aom_filter_block1d16_v8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)1237 static void svt_aom_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
1238 uint8_t *output_ptr, ptrdiff_t out_pitch,
1239 uint32_t output_height, const int16_t *filter) {
1240 __m128i filters_reg;
1241 __m256i add_filter_reg32;
1242 __m256i src_reg32b1, src_reg_32b2, src_reg_32b3, src_reg_32b4, src_reg_32b5;
1243 __m256i src_reg_32b6, src_reg_32b7, src_reg_32b8, src_reg_32b9, src_reg_32b10;
1244 __m256i src_reg_32b11, src_reg_32b12, filters_reg32;
1245 __m256i first_filters, second_filters, third_filters, forth_filters;
1246 unsigned int i;
1247 ptrdiff_t src_stride, dst_stride;
1248
1249 add_filter_reg32 = _mm256_set1_epi16(32);
1250 filters_reg = _mm_loadu_si128((const __m128i *)filter);
1251 // converting the 16 bit (short) to 8 bit (byte) and have the
1252 // same data in both lanes of 128 bit register.
1253 filters_reg = _mm_srai_epi16(filters_reg, 1);
1254 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
1255 // have the same data in both lanes of a 256 bit register
1256 filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
1257
1258 // duplicate only the first 16 bits (first and second byte)
1259 // across 256 bit register
1260 first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x100u));
1261 // duplicate only the second 16 bits (third and forth byte)
1262 // across 256 bit register
1263 second_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x302u));
1264 // duplicate only the third 16 bits (fifth and sixth byte)
1265 // across 256 bit register
1266 third_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x504u));
1267 // duplicate only the forth 16 bits (seventh and eighth byte)
1268 // across 256 bit register
1269 forth_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi16(0x706u));
1270
1271 // multiple the size of the source and destination stride by two
1272 src_stride = src_pitch << 1;
1273 dst_stride = out_pitch << 1;
1274
1275 // load 16 bytes 7 times in stride of src_pitch
1276 src_reg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
1277 src_reg_32b3 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1278 src_reg_32b5 = xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
1279 src_reg_32b7 = _mm256_castsi128_si256(
1280 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
1281
1282 // have each consecutive loads on the same 256 register
1283 src_reg_32b2 = _mm256_permute2x128_si256(src_reg32b1, src_reg_32b3, 0x21);
1284 src_reg_32b4 = _mm256_permute2x128_si256(src_reg_32b3, src_reg_32b5, 0x21);
1285 src_reg_32b6 = _mm256_permute2x128_si256(src_reg_32b5, src_reg_32b7, 0x21);
1286 // merge every two consecutive registers except the last one
1287 src_reg_32b10 = _mm256_unpacklo_epi8(src_reg32b1, src_reg_32b2);
1288 src_reg32b1 = _mm256_unpackhi_epi8(src_reg32b1, src_reg_32b2);
1289
1290 // save
1291 src_reg_32b11 = _mm256_unpacklo_epi8(src_reg_32b3, src_reg_32b4);
1292 src_reg_32b3 = _mm256_unpackhi_epi8(src_reg_32b3, src_reg_32b4);
1293 src_reg_32b2 = _mm256_unpacklo_epi8(src_reg_32b5, src_reg_32b6);
1294 src_reg_32b5 = _mm256_unpackhi_epi8(src_reg_32b5, src_reg_32b6);
1295
1296 for (i = output_height; i > 1; i -= 2) {
1297 // load the last 2 loads of 16 bytes and have every two
1298 // consecutive loads in the same 256 bit register
1299 src_reg_32b8 = _mm256_castsi128_si256(
1300 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
1301 src_reg_32b7 = _mm256_inserti128_si256(
1302 src_reg_32b7, _mm256_castsi256_si128(src_reg_32b8), 1);
1303 src_reg_32b9 = _mm256_castsi128_si256(
1304 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
1305 src_reg_32b8 = _mm256_inserti128_si256(
1306 src_reg_32b8, _mm256_castsi256_si128(src_reg_32b9), 1);
1307
1308 // merge every two consecutive registers
1309 // save
1310 src_reg_32b4 = _mm256_unpacklo_epi8(src_reg_32b7, src_reg_32b8);
1311 src_reg_32b7 = _mm256_unpackhi_epi8(src_reg_32b7, src_reg_32b8);
1312
1313 // multiply 2 adjacent elements with the filter and add the result
1314 src_reg_32b10 = _mm256_maddubs_epi16(src_reg_32b10, first_filters);
1315 src_reg_32b6 = _mm256_maddubs_epi16(src_reg_32b4, forth_filters);
1316
1317 // add and saturate the results together
1318 src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, src_reg_32b6);
1319
1320 // multiply 2 adjacent elements with the filter and add the result
1321 src_reg_32b8 = _mm256_maddubs_epi16(src_reg_32b11, second_filters);
1322 src_reg_32b12 = _mm256_maddubs_epi16(src_reg_32b2, third_filters);
1323
1324 // add and saturate the results together
1325 src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10,
1326 _mm256_adds_epi16(src_reg_32b8, src_reg_32b12));
1327
1328 // multiply 2 adjacent elements with the filter and add the result
1329 src_reg32b1 = _mm256_maddubs_epi16(src_reg32b1, first_filters);
1330 src_reg_32b6 = _mm256_maddubs_epi16(src_reg_32b7, forth_filters);
1331
1332 src_reg32b1 = _mm256_adds_epi16(src_reg32b1, src_reg_32b6);
1333
1334 // multiply 2 adjacent elements with the filter and add the result
1335 src_reg_32b8 = _mm256_maddubs_epi16(src_reg_32b3, second_filters);
1336 src_reg_32b12 = _mm256_maddubs_epi16(src_reg_32b5, third_filters);
1337
1338 // add and saturate the results together
1339 src_reg32b1 = _mm256_adds_epi16(src_reg32b1,
1340 _mm256_adds_epi16(src_reg_32b8, src_reg_32b12));
1341
1342 // shift by 6 bit each 16 bit
1343 src_reg_32b10 = _mm256_adds_epi16(src_reg_32b10, add_filter_reg32);
1344 src_reg32b1 = _mm256_adds_epi16(src_reg32b1, add_filter_reg32);
1345 src_reg_32b10 = _mm256_srai_epi16(src_reg_32b10, 6);
1346 src_reg32b1 = _mm256_srai_epi16(src_reg32b1, 6);
1347
1348 // shrink to 8 bit each 16 bits, the first lane contain the first
1349 // convolve result and the second lane contain the second convolve
1350 // result
1351 src_reg32b1 = _mm256_packus_epi16(src_reg_32b10, src_reg32b1);
1352
1353 src_ptr += src_stride;
1354
1355 xx_store2_mi128(output_ptr, out_pitch, &src_reg32b1);
1356
1357 output_ptr += dst_stride;
1358
1359 // save part of the registers for next strides
1360 src_reg_32b10 = src_reg_32b11;
1361 src_reg32b1 = src_reg_32b3;
1362 src_reg_32b11 = src_reg_32b2;
1363 src_reg_32b3 = src_reg_32b5;
1364 src_reg_32b2 = src_reg_32b4;
1365 src_reg_32b5 = src_reg_32b7;
1366 src_reg_32b7 = src_reg_32b9;
1367 }
1368 if (i > 0) {
1369 __m128i src_reg_filt1, src_reg_filt3, src_reg_filt4, src_reg_filt5;
1370 __m128i src_reg_filt6, src_reg_filt7, src_reg_filt8;
1371 // load the last 16 bytes
1372 src_reg_filt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
1373
1374 // merge the last 2 results together
1375 src_reg_filt4 = _mm_unpacklo_epi8(_mm256_castsi256_si128(src_reg_32b7), src_reg_filt8);
1376 src_reg_filt7 = _mm_unpackhi_epi8(_mm256_castsi256_si128(src_reg_32b7), src_reg_filt8);
1377
1378 // multiply 2 adjacent elements with the filter and add the result
1379 src_reg_filt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b10),
1380 _mm256_castsi256_si128(first_filters));
1381 src_reg_filt4 = _mm_maddubs_epi16(src_reg_filt4, _mm256_castsi256_si128(forth_filters));
1382 src_reg_filt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg32b1),
1383 _mm256_castsi256_si128(first_filters));
1384 src_reg_filt7 = _mm_maddubs_epi16(src_reg_filt7, _mm256_castsi256_si128(forth_filters));
1385
1386 // add and saturate the results together
1387 src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, src_reg_filt4);
1388 src_reg_filt3 = _mm_adds_epi16(src_reg_filt3, src_reg_filt7);
1389
1390 // multiply 2 adjacent elements with the filter and add the result
1391 src_reg_filt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b11),
1392 _mm256_castsi256_si128(second_filters));
1393 src_reg_filt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b3),
1394 _mm256_castsi256_si128(second_filters));
1395
1396 // multiply 2 adjacent elements with the filter and add the result
1397 src_reg_filt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b2),
1398 _mm256_castsi256_si128(third_filters));
1399 src_reg_filt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(src_reg_32b5),
1400 _mm256_castsi256_si128(third_filters));
1401
1402 // add and saturate the results together
1403 src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm_adds_epi16(src_reg_filt4, src_reg_filt6));
1404 src_reg_filt3 = _mm_adds_epi16(src_reg_filt3, _mm_adds_epi16(src_reg_filt5, src_reg_filt7));
1405
1406 // shift by 6 bit each 16 bit
1407 src_reg_filt1 = _mm_adds_epi16(src_reg_filt1, _mm256_castsi256_si128(add_filter_reg32));
1408 src_reg_filt3 = _mm_adds_epi16(src_reg_filt3, _mm256_castsi256_si128(add_filter_reg32));
1409 src_reg_filt1 = _mm_srai_epi16(src_reg_filt1, 6);
1410 src_reg_filt3 = _mm_srai_epi16(src_reg_filt3, 6);
1411
1412 // shrink to 8 bit each 16 bits, the first lane contain the first
1413 // convolve result and the second lane contain the second convolve
1414 // result
1415 src_reg_filt1 = _mm_packus_epi16(src_reg_filt1, src_reg_filt3);
1416
1417 // save 16 bytes
1418 _mm_storeu_si128((__m128i *)output_ptr, src_reg_filt1);
1419 }
1420 }
1421
svt_aom_filter_block1d4_v4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)1422 static void svt_aom_filter_block1d4_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
1423 uint8_t *output_ptr, ptrdiff_t out_pitch,
1424 uint32_t output_height, const int16_t *filter) {
1425 __m128i filters_reg;
1426 __m256i filters_reg32, add_filter_reg32;
1427 __m256i src_reg23, src_reg_4x, src_reg_34, src_reg_5x, src_reg_45, src_reg_6x, src_reg_56;
1428 __m256i src_reg_23_34_lo, src_reg_45_56_lo;
1429 __m256i src_reg_2345_3456_lo;
1430 __m256i res_reg_lo, res_reg;
1431 __m256i first_filters;
1432 unsigned int i;
1433 ptrdiff_t src_stride, dst_stride;
1434
1435 add_filter_reg32 = _mm256_set1_epi16(32);
1436 filters_reg = _mm_loadu_si128((const __m128i *)filter);
1437 // converting the 16 bit (short) to 8 bit (byte) and have the
1438 // same data in both lanes of 128 bit register.
1439 filters_reg = _mm_srai_epi16(filters_reg, 1);
1440 filters_reg = _mm_packs_epi16(filters_reg, filters_reg);
1441 // have the same data in both lanes of a 256 bit register
1442 filters_reg32 = MM256_BROADCASTSI128_SI256(filters_reg);
1443
1444 first_filters = _mm256_shuffle_epi8(filters_reg32, _mm256_set1_epi32(0x5040302u));
1445
1446 // multiple the size of the source and destination stride by two
1447 src_stride = src_pitch << 1;
1448 dst_stride = out_pitch << 1;
1449
1450 src_reg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1451 src_reg_4x = _mm256_castsi128_si256(
1452 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
1453
1454 // have consecutive loads on the same 256 register
1455 src_reg_34 = _mm256_permute2x128_si256(src_reg23, src_reg_4x, 0x21);
1456
1457 src_reg_23_34_lo = _mm256_unpacklo_epi8(src_reg23, src_reg_34);
1458
1459 for (i = output_height; i > 1; i -= 2) {
1460 // load the last 2 loads of 16 bytes and have every two
1461 // consecutive loads in the same 256 bit register
1462 src_reg_5x = _mm256_castsi128_si256(
1463 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
1464 src_reg_45 = _mm256_inserti128_si256(src_reg_4x, _mm256_castsi256_si128(src_reg_5x), 1);
1465
1466 src_reg_6x = _mm256_castsi128_si256(
1467 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
1468 src_reg_56 = _mm256_inserti128_si256(src_reg_5x, _mm256_castsi256_si128(src_reg_6x), 1);
1469
1470 // merge every two consecutive registers
1471 src_reg_45_56_lo = _mm256_unpacklo_epi8(src_reg_45, src_reg_56);
1472
1473 src_reg_2345_3456_lo = _mm256_unpacklo_epi16(src_reg_23_34_lo, src_reg_45_56_lo);
1474
1475 // multiply 2 adjacent elements with the filter and add the result
1476 res_reg_lo = _mm256_maddubs_epi16(src_reg_2345_3456_lo, first_filters);
1477
1478 res_reg_lo = _mm256_hadds_epi16(res_reg_lo, _mm256_setzero_si256());
1479
1480 // shift by 6 bit each 16 bit
1481 res_reg_lo = _mm256_adds_epi16(res_reg_lo, add_filter_reg32);
1482 res_reg_lo = _mm256_srai_epi16(res_reg_lo, 6);
1483
1484 // shrink to 8 bit each 16 bits, the first lane contain the first
1485 // convolve result and the second lane contain the second convolve
1486 // result
1487 res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_lo);
1488
1489 src_ptr += src_stride;
1490
1491 xx_storeu2_epi32(output_ptr, out_pitch, &res_reg);
1492
1493 output_ptr += dst_stride;
1494
1495 // save part of the registers for next strides
1496 src_reg_23_34_lo = src_reg_45_56_lo;
1497 src_reg_4x = src_reg_6x;
1498 }
1499 }
1500
// Instantiate the public 1-D convolve entry points (see FUN_CONV_1D in
// convolve.h): one horizontal variant and one vertical variant that starts
// reading 3 rows above `src` to cover the filter's top taps.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
1503