/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
#include <immintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_ports/mem.h"
19
#if defined(__clang__)
// Older clang releases (and the Apple clang builds derived from them)
// exposed the 128->256-bit broadcast intrinsic under the pointer-taking
// name _mm_broadcastsi128_si256; newer compilers provide
// _mm256_broadcastsi128_si256 taking the value directly.  This macro hides
// the difference so the filter code below can use a single spelling.
#if (__clang_major__ > 0 && __clang_major__ < 3) || \
    (__clang_major__ == 3 && __clang_minor__ <= 3) || \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
// gcc went through the same renaming: <= 4.6 used the pointer form, 4.7
// used the value form under the _mm_ prefix, and > 4.7 uses the final
// _mm256_ name.
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
43
xx_storeu2_epi32(const uint8_t * output_ptr,const ptrdiff_t stride,const __m256i * a)44 static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
45 const ptrdiff_t stride, const __m256i *a) {
46 *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
47 *((uint32_t *)(output_ptr + stride)) =
48 _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
49 }
50
xx_loadu2_epi64(const void * hi,const void * lo)51 static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
52 __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
53 a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
54 return a;
55 }
56
xx_storeu2_epi64(const uint8_t * output_ptr,const ptrdiff_t stride,const __m256i * a)57 static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
58 const ptrdiff_t stride, const __m256i *a) {
59 _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
60 _mm_storel_epi64((__m128i *)(output_ptr + stride),
61 _mm256_extractf128_si256(*a, 1));
62 }
63
xx_loadu2_mi128(const void * hi,const void * lo)64 static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
65 __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
66 a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
67 return a;
68 }
69
xx_store2_mi128(const uint8_t * output_ptr,const ptrdiff_t stride,const __m256i * a)70 static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
71 const ptrdiff_t stride, const __m256i *a) {
72 _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
73 _mm_store_si128((__m128i *)(output_ptr + stride),
74 _mm256_extractf128_si256(*a, 1));
75 }
76
// Horizontal 4-tap convolution of a 4-pixel-wide block.
// Two output rows are produced per loop iteration: row i is processed in
// the low 128-bit lane and row i+1 in the high lane of each 256-bit
// register.  If output_height is odd, the final row is handled with
// 128-bit ops after the loop.  Coefficients are pre-shifted right by 1 so
// they pack into signed bytes for maddubs; the result is rounded with
// (+32, >>6) instead of the full-precision (+64, >>7) to compensate.
static void aom_filter_block1d4_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // back up to the first sample covered by the filter window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >>6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the coefficients so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // broadcast the middle 4 coefficient bytes (2..5) to every 32-bit
  // element; only 4 taps are used by this kernel
  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));

  // multiple the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 4 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}
157
// Horizontal 8-tap convolution of a 4-pixel-wide block.
// Two output rows are produced per loop iteration (one per 128-bit lane);
// a trailing odd row is handled with 128-bit ops after the loop.  The 8
// taps are split into two 4-byte groups (firstFilters / secondFilters),
// each applied with its own shuffle pattern and maddubs, then summed.
// Coefficients are pre-shifted right by 1 to fit in signed bytes; the
// result is rounded with (+32, >>6) to compensate.
static void aom_filter_block1d4_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg;
  __m256i firstFilters, secondFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // back up to the first sample covered by the filter window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >>6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the coefficients so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 32 bits
  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
  // duplicate only the second 32 bits
  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);

  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));

  // multiple the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    // filter the source buffer
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 4 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}
262
// Horizontal 4-tap convolution of an 8-pixel-wide block.
// Two output rows are produced per loop iteration (one per 128-bit lane);
// a trailing odd row is handled with 128-bit ops after the loop.  Only the
// middle two coefficient pairs of the 8-tap filter (bytes 2..5) are used.
// Coefficients are pre-shifted right by 1 to fit in signed bytes; the
// result is rounded with (+32, >>6) to compensate.
static void aom_filter_block1d8_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // back up to the first sample covered by the filter window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >>6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the coefficients so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 8 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);

    // shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
356
// Horizontal 8-tap convolution of an 8-pixel-wide block.
// Two output rows are produced per loop iteration (one per 128-bit lane);
// a trailing odd row is handled with 128-bit ops after the loop.  The 8
// taps are split into four coefficient pairs (first/second/third/forth),
// each applied via its own shuffle pattern and maddubs, and the four
// partial sums are accumulated with saturating adds.  Coefficients are
// pre-shifted right by 1 to fit in signed bytes; the result is rounded
// with (+32, >>6) to compensate.
static void aom_filter_block1d8_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // back up to the first sample covered by the filter window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >>6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the coefficients so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  // multiple the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // sum the middle-tap partials first, then fold into the outer-tap sum
    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 8 bytes
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // shift by 6 bit each 16 bit
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
488
// Horizontal 4-tap convolution of a 16-pixel-wide block.
// Two output rows are produced per loop iteration (one per 128-bit lane);
// each row is filtered in two 8-pixel halves, with the second half loading
// from src + 8.  A trailing odd row is processed in a single 256-bit pass
// after rearranging the input lanes.  Only the middle two coefficient
// pairs (bytes 2..5) of the 8-tap filter are used.  Coefficients are
// pre-shifted right by 1 to fit in signed bytes; the result is rounded
// with (+32, >>6) to compensate.  The output rows are written with aligned
// 16-byte stores, so output_ptr is assumed 16-byte aligned — TODO confirm
// against callers.
static void aom_filter_block1d16_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;  // back up to the first sample covered by the filter window
  addFilterReg32 = _mm256_set1_epi16(32);  // rounding bias for the >>6 below
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the coefficients so they fit in signed 8 bits after packing
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 =
        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m256i srcReg1, srcReg12;
    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;

    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
    // rearrange 64-bit lanes (selector 0x94 = qwords 0,1,1,2) so the low
    // 128-bit lane holds src bytes 0..15 and the high lane bytes 8..23,
    // letting both 8-pixel halves be filtered in one 256-bit pass
    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);

    // filter the source buffer
    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);

    // shift by 6 bit each 16 bit
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
    // gather the two packed 8-byte halves (qwords 0 and 2) into the low
    // 128 bits so a single store can write all 16 output pixels
    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt1_1));
  }
}
603
aom_filter_block1d16_h8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pixels_per_line,uint8_t * output_ptr,ptrdiff_t output_pitch,uint32_t output_height,const int16_t * filter)604 static void aom_filter_block1d16_h8_avx2(
605 const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
606 ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
607 __m128i filtersReg;
608 __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
609 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
610 __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
611 __m256i srcReg32b1, srcReg32b2, filtersReg32;
612 unsigned int i;
613 ptrdiff_t src_stride, dst_stride;
614 src_ptr -= 3;
615 addFilterReg32 = _mm256_set1_epi16(32);
616 filtersReg = _mm_loadu_si128((const __m128i *)filter);
617 filtersReg = _mm_srai_epi16(filtersReg, 1);
618 // converting the 16 bit (short) to 8 bit (byte) and have the same data
619 // in both lanes of 128 bit register.
620 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
621 // have the same data in both lanes of a 256 bit register
622 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
623
624 // duplicate only the first 16 bits (first and second byte)
625 // across 256 bit register
626 firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
627 // duplicate only the second 16 bits (third and forth byte)
628 // across 256 bit register
629 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
630 // duplicate only the third 16 bits (fifth and sixth byte)
631 // across 256 bit register
632 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
633 // duplicate only the forth 16 bits (seventh and eighth byte)
634 // across 256 bit register
635 forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
636
637 filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
638 filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
639 filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
640 filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
641
642 // multiple the size of the source and destination stride by two
643 src_stride = src_pixels_per_line << 1;
644 dst_stride = output_pitch << 1;
645 for (i = output_height; i > 1; i -= 2) {
646 // load the 2 strides of source
647 srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
648
649 // filter the source buffer
650 srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
651 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
652
653 // multiply 2 adjacent elements with the filter and add the result
654 srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
655 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
656
657 // add and saturate the results together
658 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
659
660 // filter the source buffer
661 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
662 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
663
664 // multiply 2 adjacent elements with the filter and add the result
665 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
666 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
667
668 __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
669 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
670
671 // reading 2 strides of the next 16 bytes
672 // (part of it was being read by earlier read)
673 srcReg32b2 =
674 xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
675
676 // filter the source buffer
677 srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
678 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
679
680 // multiply 2 adjacent elements with the filter and add the result
681 srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
682 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
683
684 // add and saturate the results together
685 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
686
687 // filter the source buffer
688 srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
689 srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
690
691 // multiply 2 adjacent elements with the filter and add the result
692 srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
693 srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
694
695 // add and saturate the results together
696 srcRegFilt32b2_1 = _mm256_adds_epi16(
697 srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
698
699 // shift by 6 bit each 16 bit
700 srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
701 srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
702 srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
703 srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
704
705 // shrink to 8 bit each 16 bits, the first lane contain the first
706 // convolve result and the second lane contain the second convolve result
707 srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
708
709 src_ptr += src_stride;
710
711 xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
712 output_ptr += dst_stride;
713 }
714
715 // if the number of strides is odd.
716 // process only 16 bytes
717 if (i > 0) {
718 __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
719 __m128i srcRegFilt2, srcRegFilt3;
720
721 srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
722
723 // filter the source buffer
724 srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
725 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
726
727 // multiply 2 adjacent elements with the filter and add the result
728 srcRegFilt1_1 =
729 _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
730 srcRegFilt2 =
731 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
732
733 // add and saturate the results together
734 srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
735
736 // filter the source buffer
737 srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
738 srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
739
740 // multiply 2 adjacent elements with the filter and add the result
741 srcRegFilt3 =
742 _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
743 srcRegFilt2 =
744 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
745
746 // add and saturate the results together
747 srcRegFilt1_1 =
748 _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
749
750 // reading the next 16 bytes
751 // (part of it was being read by earlier read)
752 srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
753
754 // filter the source buffer
755 srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
756 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
757
758 // multiply 2 adjacent elements with the filter and add the result
759 srcRegFilt2_1 =
760 _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
761 srcRegFilt2 =
762 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
763
764 // add and saturate the results together
765 srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
766
767 // filter the source buffer
768 srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
769 srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
770
771 // multiply 2 adjacent elements with the filter and add the result
772 srcRegFilt3 =
773 _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
774 srcRegFilt2 =
775 _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
776
777 // add and saturate the results together
778 srcRegFilt2_1 =
779 _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
780
781 // shift by 6 bit each 16 bit
782 srcRegFilt1_1 =
783 _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
784 srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
785
786 srcRegFilt2_1 =
787 _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
788 srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
789
790 // shrink to 8 bit each 16 bits, the first lane contain the first
791 // convolve result and the second lane contain the second convolve
792 // result
793 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
794
795 // save 16 bytes
796 _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
797 }
798 }
799
aom_filter_block1d8_v4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)800 static void aom_filter_block1d8_v4_avx2(
801 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
802 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
803 __m128i filtersReg;
804 __m256i filtersReg32, addFilterReg32;
805 __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
806 __m256i srcReg23_34_lo, srcReg45_56_lo;
807 __m256i resReg23_34_lo, resReg45_56_lo;
808 __m256i resReglo, resReg;
809 __m256i secondFilters, thirdFilters;
810 unsigned int i;
811 ptrdiff_t src_stride, dst_stride;
812
813 addFilterReg32 = _mm256_set1_epi16(32);
814 filtersReg = _mm_loadu_si128((const __m128i *)filter);
815 // converting the 16 bit (short) to 8 bit (byte) and have the
816 // same data in both lanes of 128 bit register.
817 filtersReg = _mm_srai_epi16(filtersReg, 1);
818 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
819 // have the same data in both lanes of a 256 bit register
820 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
821
822 // duplicate only the second 16 bits (third and forth byte)
823 // across 256 bit register
824 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
825 // duplicate only the third 16 bits (fifth and sixth byte)
826 // across 256 bit register
827 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
828
829 // multiple the size of the source and destination stride by two
830 src_stride = src_pitch << 1;
831 dst_stride = out_pitch << 1;
832
833 srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
834 srcReg4x = _mm256_castsi128_si256(
835 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
836
837 // have consecutive loads on the same 256 register
838 srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
839
840 srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
841
842 for (i = output_height; i > 1; i -= 2) {
843 // load the last 2 loads of 16 bytes and have every two
844 // consecutive loads in the same 256 bit register
845 srcReg5x = _mm256_castsi128_si256(
846 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
847 srcReg45 =
848 _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
849
850 srcReg6x = _mm256_castsi128_si256(
851 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
852 srcReg56 =
853 _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
854
855 // merge every two consecutive registers
856 srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
857
858 // multiply 2 adjacent elements with the filter and add the result
859 resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
860 resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
861
862 // add and saturate the results together
863 resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
864
865 // shift by 6 bit each 16 bit
866 resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
867 resReglo = _mm256_srai_epi16(resReglo, 6);
868
869 // shrink to 8 bit each 16 bits, the first lane contain the first
870 // convolve result and the second lane contain the second convolve
871 // result
872 resReg = _mm256_packus_epi16(resReglo, resReglo);
873
874 src_ptr += src_stride;
875
876 xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
877
878 output_ptr += dst_stride;
879
880 // save part of the registers for next strides
881 srcReg23_34_lo = srcReg45_56_lo;
882 srcReg4x = srcReg6x;
883 }
884 }
885
aom_filter_block1d8_v8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)886 static void aom_filter_block1d8_v8_avx2(
887 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
888 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
889 __m128i filtersReg;
890 __m256i addFilterReg32;
891 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
892 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
893 __m256i srcReg32b11, srcReg32b12, filtersReg32;
894 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
895 unsigned int i;
896 ptrdiff_t src_stride, dst_stride;
897
898 addFilterReg32 = _mm256_set1_epi16(32);
899 filtersReg = _mm_loadu_si128((const __m128i *)filter);
900 // converting the 16 bit (short) to 8 bit (byte) and have the
901 // same data in both lanes of 128 bit register.
902 filtersReg = _mm_srai_epi16(filtersReg, 1);
903 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
904 // have the same data in both lanes of a 256 bit register
905 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
906
907 // duplicate only the first 16 bits (first and second byte)
908 // across 256 bit register
909 firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
910 // duplicate only the second 16 bits (third and forth byte)
911 // across 256 bit register
912 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
913 // duplicate only the third 16 bits (fifth and sixth byte)
914 // across 256 bit register
915 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
916 // duplicate only the forth 16 bits (seventh and eighth byte)
917 // across 256 bit register
918 forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
919
920 // multiple the size of the source and destination stride by two
921 src_stride = src_pitch << 1;
922 dst_stride = out_pitch << 1;
923
924 // load 16 bytes 7 times in stride of src_pitch
925 srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
926 srcReg32b3 =
927 xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
928 srcReg32b5 =
929 xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
930 srcReg32b7 = _mm256_castsi128_si256(
931 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
932
933 // have each consecutive loads on the same 256 register
934 srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
935 srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
936 srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
937 // merge every two consecutive registers except the last one
938 srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
939 srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
940 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
941
942 for (i = output_height; i > 1; i -= 2) {
943 // load the last 2 loads of 16 bytes and have every two
944 // consecutive loads in the same 256 bit register
945 srcReg32b8 = _mm256_castsi128_si256(
946 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
947 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
948 _mm256_castsi256_si128(srcReg32b8), 1);
949 srcReg32b9 = _mm256_castsi128_si256(
950 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
951 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
952 _mm256_castsi256_si128(srcReg32b9), 1);
953
954 // merge every two consecutive registers
955 // save
956 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
957
958 // multiply 2 adjacent elements with the filter and add the result
959 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
960 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
961
962 // add and saturate the results together
963 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
964
965 // multiply 2 adjacent elements with the filter and add the result
966 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
967 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
968
969 // add and saturate the results together
970 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
971 _mm256_adds_epi16(srcReg32b8, srcReg32b12));
972
973 // shift by 6 bit each 16 bit
974 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
975 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
976
977 // shrink to 8 bit each 16 bits, the first lane contain the first
978 // convolve result and the second lane contain the second convolve
979 // result
980 srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
981
982 src_ptr += src_stride;
983
984 xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
985
986 output_ptr += dst_stride;
987
988 // save part of the registers for next strides
989 srcReg32b10 = srcReg32b11;
990 srcReg32b11 = srcReg32b2;
991 srcReg32b2 = srcReg32b4;
992 srcReg32b7 = srcReg32b9;
993 }
994 if (i > 0) {
995 __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
996 // load the last 16 bytes
997 srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
998
999 // merge the last 2 results together
1000 srcRegFilt4 =
1001 _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
1002
1003 // multiply 2 adjacent elements with the filter and add the result
1004 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
1005 _mm256_castsi256_si128(firstFilters));
1006 srcRegFilt4 =
1007 _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
1008
1009 // add and saturate the results together
1010 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
1011
1012 // multiply 2 adjacent elements with the filter and add the result
1013 srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
1014 _mm256_castsi256_si128(secondFilters));
1015
1016 // multiply 2 adjacent elements with the filter and add the result
1017 srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
1018 _mm256_castsi256_si128(thirdFilters));
1019
1020 // add and saturate the results together
1021 srcRegFilt1 =
1022 _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
1023
1024 // shift by 6 bit each 16 bit
1025 srcRegFilt1 =
1026 _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
1027 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
1028
1029 // shrink to 8 bit each 16 bits, the first lane contain the first
1030 // convolve result and the second lane contain the second convolve result
1031 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
1032
1033 // save 8 bytes
1034 _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
1035 }
1036 }
1037
aom_filter_block1d16_v4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)1038 static void aom_filter_block1d16_v4_avx2(
1039 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1040 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1041 __m128i filtersReg;
1042 __m256i filtersReg32, addFilterReg32;
1043 __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
1044 __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
1045 __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
1046 __m256i resReglo, resReghi, resReg;
1047 __m256i secondFilters, thirdFilters;
1048 unsigned int i;
1049 ptrdiff_t src_stride, dst_stride;
1050
1051 addFilterReg32 = _mm256_set1_epi16(32);
1052 filtersReg = _mm_loadu_si128((const __m128i *)filter);
1053 // converting the 16 bit (short) to 8 bit (byte) and have the
1054 // same data in both lanes of 128 bit register.
1055 filtersReg = _mm_srai_epi16(filtersReg, 1);
1056 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1057 // have the same data in both lanes of a 256 bit register
1058 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1059
1060 // duplicate only the second 16 bits (third and forth byte)
1061 // across 256 bit register
1062 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
1063 // duplicate only the third 16 bits (fifth and sixth byte)
1064 // across 256 bit register
1065 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
1066
1067 // multiple the size of the source and destination stride by two
1068 src_stride = src_pitch << 1;
1069 dst_stride = out_pitch << 1;
1070
1071 srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1072 srcReg4x = _mm256_castsi128_si256(
1073 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
1074
1075 // have consecutive loads on the same 256 register
1076 srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
1077
1078 srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
1079 srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
1080
1081 for (i = output_height; i > 1; i -= 2) {
1082 // load the last 2 loads of 16 bytes and have every two
1083 // consecutive loads in the same 256 bit register
1084 srcReg5x = _mm256_castsi128_si256(
1085 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
1086 srcReg45 =
1087 _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
1088
1089 srcReg6x = _mm256_castsi128_si256(
1090 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
1091 srcReg56 =
1092 _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
1093
1094 // merge every two consecutive registers
1095 srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
1096 srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
1097
1098 // multiply 2 adjacent elements with the filter and add the result
1099 resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
1100 resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
1101
1102 // add and saturate the results together
1103 resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
1104
1105 // multiply 2 adjacent elements with the filter and add the result
1106 resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
1107 resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
1108
1109 // add and saturate the results together
1110 resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
1111
1112 // shift by 6 bit each 16 bit
1113 resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
1114 resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
1115 resReglo = _mm256_srai_epi16(resReglo, 6);
1116 resReghi = _mm256_srai_epi16(resReghi, 6);
1117
1118 // shrink to 8 bit each 16 bits, the first lane contain the first
1119 // convolve result and the second lane contain the second convolve
1120 // result
1121 resReg = _mm256_packus_epi16(resReglo, resReghi);
1122
1123 src_ptr += src_stride;
1124
1125 xx_store2_mi128(output_ptr, out_pitch, &resReg);
1126
1127 output_ptr += dst_stride;
1128
1129 // save part of the registers for next strides
1130 srcReg23_34_lo = srcReg45_56_lo;
1131 srcReg23_34_hi = srcReg45_56_hi;
1132 srcReg4x = srcReg6x;
1133 }
1134 }
1135
aom_filter_block1d16_v8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)1136 static void aom_filter_block1d16_v8_avx2(
1137 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1138 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1139 __m128i filtersReg;
1140 __m256i addFilterReg32;
1141 __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
1142 __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
1143 __m256i srcReg32b11, srcReg32b12, filtersReg32;
1144 __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
1145 unsigned int i;
1146 ptrdiff_t src_stride, dst_stride;
1147
1148 addFilterReg32 = _mm256_set1_epi16(32);
1149 filtersReg = _mm_loadu_si128((const __m128i *)filter);
1150 // converting the 16 bit (short) to 8 bit (byte) and have the
1151 // same data in both lanes of 128 bit register.
1152 filtersReg = _mm_srai_epi16(filtersReg, 1);
1153 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1154 // have the same data in both lanes of a 256 bit register
1155 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1156
1157 // duplicate only the first 16 bits (first and second byte)
1158 // across 256 bit register
1159 firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
1160 // duplicate only the second 16 bits (third and forth byte)
1161 // across 256 bit register
1162 secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
1163 // duplicate only the third 16 bits (fifth and sixth byte)
1164 // across 256 bit register
1165 thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
1166 // duplicate only the forth 16 bits (seventh and eighth byte)
1167 // across 256 bit register
1168 forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
1169
1170 // multiple the size of the source and destination stride by two
1171 src_stride = src_pitch << 1;
1172 dst_stride = out_pitch << 1;
1173
1174 // load 16 bytes 7 times in stride of src_pitch
1175 srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
1176 srcReg32b3 =
1177 xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1178 srcReg32b5 =
1179 xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
1180 srcReg32b7 = _mm256_castsi128_si256(
1181 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
1182
1183 // have each consecutive loads on the same 256 register
1184 srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
1185 srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
1186 srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
1187 // merge every two consecutive registers except the last one
1188 srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
1189 srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
1190
1191 // save
1192 srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
1193 srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
1194 srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
1195 srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
1196
1197 for (i = output_height; i > 1; i -= 2) {
1198 // load the last 2 loads of 16 bytes and have every two
1199 // consecutive loads in the same 256 bit register
1200 srcReg32b8 = _mm256_castsi128_si256(
1201 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
1202 srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
1203 _mm256_castsi256_si128(srcReg32b8), 1);
1204 srcReg32b9 = _mm256_castsi128_si256(
1205 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
1206 srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
1207 _mm256_castsi256_si128(srcReg32b9), 1);
1208
1209 // merge every two consecutive registers
1210 // save
1211 srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
1212 srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
1213
1214 // multiply 2 adjacent elements with the filter and add the result
1215 srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
1216 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
1217
1218 // add and saturate the results together
1219 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
1220
1221 // multiply 2 adjacent elements with the filter and add the result
1222 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
1223 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
1224
1225 // add and saturate the results together
1226 srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
1227 _mm256_adds_epi16(srcReg32b8, srcReg32b12));
1228
1229 // multiply 2 adjacent elements with the filter and add the result
1230 srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
1231 srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
1232
1233 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
1234
1235 // multiply 2 adjacent elements with the filter and add the result
1236 srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
1237 srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
1238
1239 // add and saturate the results together
1240 srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
1241 _mm256_adds_epi16(srcReg32b8, srcReg32b12));
1242
1243 // shift by 6 bit each 16 bit
1244 srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
1245 srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
1246 srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
1247 srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
1248
1249 // shrink to 8 bit each 16 bits, the first lane contain the first
1250 // convolve result and the second lane contain the second convolve
1251 // result
1252 srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
1253
1254 src_ptr += src_stride;
1255
1256 xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
1257
1258 output_ptr += dst_stride;
1259
1260 // save part of the registers for next strides
1261 srcReg32b10 = srcReg32b11;
1262 srcReg32b1 = srcReg32b3;
1263 srcReg32b11 = srcReg32b2;
1264 srcReg32b3 = srcReg32b5;
1265 srcReg32b2 = srcReg32b4;
1266 srcReg32b5 = srcReg32b7;
1267 srcReg32b7 = srcReg32b9;
1268 }
1269 if (i > 0) {
1270 __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
1271 __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
1272 // load the last 16 bytes
1273 srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
1274
1275 // merge the last 2 results together
1276 srcRegFilt4 =
1277 _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
1278 srcRegFilt7 =
1279 _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
1280
1281 // multiply 2 adjacent elements with the filter and add the result
1282 srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
1283 _mm256_castsi256_si128(firstFilters));
1284 srcRegFilt4 =
1285 _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
1286 srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
1287 _mm256_castsi256_si128(firstFilters));
1288 srcRegFilt7 =
1289 _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
1290
1291 // add and saturate the results together
1292 srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
1293 srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
1294
1295 // multiply 2 adjacent elements with the filter and add the result
1296 srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
1297 _mm256_castsi256_si128(secondFilters));
1298 srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
1299 _mm256_castsi256_si128(secondFilters));
1300
1301 // multiply 2 adjacent elements with the filter and add the result
1302 srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
1303 _mm256_castsi256_si128(thirdFilters));
1304 srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
1305 _mm256_castsi256_si128(thirdFilters));
1306
1307 // add and saturate the results together
1308 srcRegFilt1 =
1309 _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
1310 srcRegFilt3 =
1311 _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
1312
1313 // shift by 6 bit each 16 bit
1314 srcRegFilt1 =
1315 _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
1316 srcRegFilt3 =
1317 _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
1318 srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
1319 srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
1320
1321 // shrink to 8 bit each 16 bits, the first lane contain the first
1322 // convolve result and the second lane contain the second convolve
1323 // result
1324 srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
1325
1326 // save 16 bytes
1327 _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
1328 }
1329 }
1330
aom_filter_block1d4_v4_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)1331 static void aom_filter_block1d4_v4_avx2(
1332 const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1333 ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1334 __m128i filtersReg;
1335 __m256i filtersReg32, addFilterReg32;
1336 __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
1337 __m256i srcReg23_34_lo, srcReg45_56_lo;
1338 __m256i srcReg2345_3456_lo;
1339 __m256i resReglo, resReg;
1340 __m256i firstFilters;
1341 unsigned int i;
1342 ptrdiff_t src_stride, dst_stride;
1343
1344 addFilterReg32 = _mm256_set1_epi16(32);
1345 filtersReg = _mm_loadu_si128((const __m128i *)filter);
1346 // converting the 16 bit (short) to 8 bit (byte) and have the
1347 // same data in both lanes of 128 bit register.
1348 filtersReg = _mm_srai_epi16(filtersReg, 1);
1349 filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1350 // have the same data in both lanes of a 256 bit register
1351 filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1352
1353 firstFilters =
1354 _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
1355
1356 // multiple the size of the source and destination stride by two
1357 src_stride = src_pitch << 1;
1358 dst_stride = out_pitch << 1;
1359
1360 srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1361 srcReg4x = _mm256_castsi128_si256(
1362 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
1363
1364 // have consecutive loads on the same 256 register
1365 srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
1366
1367 srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
1368
1369 for (i = output_height; i > 1; i -= 2) {
1370 // load the last 2 loads of 16 bytes and have every two
1371 // consecutive loads in the same 256 bit register
1372 srcReg5x = _mm256_castsi128_si256(
1373 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
1374 srcReg45 =
1375 _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
1376
1377 srcReg6x = _mm256_castsi128_si256(
1378 _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
1379 srcReg56 =
1380 _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
1381
1382 // merge every two consecutive registers
1383 srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
1384
1385 srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
1386
1387 // multiply 2 adjacent elements with the filter and add the result
1388 resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
1389
1390 resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
1391
1392 // shift by 6 bit each 16 bit
1393 resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
1394 resReglo = _mm256_srai_epi16(resReglo, 6);
1395
1396 // shrink to 8 bit each 16 bits, the first lane contain the first
1397 // convolve result and the second lane contain the second convolve
1398 // result
1399 resReg = _mm256_packus_epi16(resReglo, resReglo);
1400
1401 src_ptr += src_stride;
1402
1403 xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
1404
1405 output_ptr += dst_stride;
1406
1407 // save part of the registers for next strides
1408 srcReg23_34_lo = srcReg45_56_lo;
1409 srcReg4x = srcReg6x;
1410 }
1411 }
1412
#if HAVE_AVX2 && HAVE_SSSE3
// No dedicated AVX2 kernels exist for the 2-tap (bilinear) cases or the
// 4-wide vertical 8-tap case; alias them to the SSSE3 implementations.
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
// The FUN_CONV_1D macro expands to the public entry points below, which
// dispatch to the block1d kernels above based on width and filter taps:
// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
// The vertical entry rewinds src by 3 rows to supply the 8-tap context.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

#endif  // HAVE_AVX2 && HAVE_SSSE3
1442