/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_ports/mem.h"

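// Older compilers expose the 128-to-256-bit broadcast under a different
// intrinsic name (and, in the oldest clang/gcc versions, with a pointer
// argument), so MM256_BROADCASTSI128_SI256 picks the spelling the compiler
// at hand actually provides.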
#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else   // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__

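// The xx_* helpers below treat a __m256i as two stacked image rows: the low
// 128-bit lane holds (part of) one row and the high lane holds the row one
// stride further down, so each loop iteration can filter two rows at once.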
static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
  *((uint32_t *)(output_ptr + stride)) =
      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
}

static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
  return a;
}

static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
                                    const ptrdiff_t stride, const __m256i *a) {
  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
  _mm_storel_epi64((__m128i *)(output_ptr + stride),
                   _mm256_extractf128_si256(*a, 1));
}

static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
  return a;
}

static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
                                   const ptrdiff_t stride, const __m256i *a) {
  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
  _mm_store_si128((__m128i *)(output_ptr + stride),
                  _mm256_extractf128_si256(*a, 1));
}

static void aom_filter_block1d4_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
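  // Halving the taps keeps every coefficient within int8 range for the
  // maddubs multiplies below, and is lossless for the even-coefficient
  // kernels used here; the +32 / >>6 rounding at the end restores the full
  // 1/128 filter scale.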
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
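  // This 4-tap path applies only the middle four taps (indices 2..5) of the
  // 8-tap kernel; the shuffle above broadcasts exactly those four bytes.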
  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the final row (4 bytes) on its own
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}
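
// Worked example of the fixed-point scheme above (hypothetical halved taps
// summing to 64, as the halved kernels here do): taps
// {-1, 3, -7, 63, 8, -2, 1, -1} over a flat row of 128s accumulate to
// 128 * 64 = 8192, and (8192 + 32) >> 6 = 128, so flat input is preserved.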

static void aom_filter_block1d4_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg;
  __m256i firstFilters, secondFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 32 bits (taps 0..3)
  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
  // duplicate only the second 32 bits (taps 4..7)
  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);

  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
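  // filt_d4_global_avx2 holds shuffle masks that, for each of the four
  // output pixels, gather the four consecutive source bytes that
  // firstFilters (first mask) and secondFilters (second mask) apply to.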

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    // filter the source buffer
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    srcRegFilt32b1_1 =
        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the final row (4 bytes) on its own
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 4 bytes
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
  }
}

static void aom_filter_block1d8_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
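  // Being a 4-tap path, only taps 2..5 are applied, so just the second and
  // third pair-gathering masks of filt_global_avx2 are needed.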

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the final row (8 bytes) on its own
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d8_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
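  // filt_global_avx2 holds four shuffle masks; mask k pairs up, for each
  // output pixel, the two adjacent source bytes that tap pair (2k, 2k+1)
  // applies to, matching the maddubs pairing below.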

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
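    // note: the outer-tap products (taps 0,1 and 6,7) and the inner-tap
    // products (taps 2,3 and 4,5) are accumulated separately before this
    // final add, which helps the saturating 16-bit additions avoid clipping
    // intermediate values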

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 =
        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the final row (8 bytes) on its own
  if (i > 0) {
    __m128i srcReg1, srcRegFilt1_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d16_h4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt2Reg, filt3Reg;
  __m256i secondFilters, thirdFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // load the next 16 bytes of both rows (overlapping the earlier load)
    srcReg32b2 =
        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the final row (16 bytes) on its own
  if (i > 0) {
    __m256i srcReg1, srcReg12;
    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;

    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
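    // 0x94 replicates the middle qwords so lane 0 sees source bytes 0..15
    // and lane 1 sees bytes 8..23, letting the two lanes compute output
    // pixels 0..7 and 8..15 in one pass; the 0x8 permute further down then
    // gathers the two packed 8-byte results back into the low 128 bits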

    // filter the source buffer
    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt1_1));
  }
}

static void aom_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  src_ptr -= 3;
  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load 2 rows of source
    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);

    // load the next 16 bytes of both rows (overlapping the earlier load)
    srcReg32b2 =
        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
    output_ptr += dst_stride;
  }

  // if output_height is odd, process the final row (16 bytes) on its own
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // load the next 16 bytes (overlapping the earlier load)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);

    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);

    // pack each 16-bit value down to 8 bits; the low half holds pixels
    // 0..7 and the high half pixels 8..15
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}

static void aom_filter_block1d8_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg45_56_lo;
  __m256i resReg23_34_lo, resReg45_56_lo;
  __m256i resReglo, resReg;
  __m256i secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

  // place consecutive rows in the same 256 bit register
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
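  // srcReg23_34_lo now interleaves rows 2,3 in lane 0 and rows 3,4 in
  // lane 1; each loop iteration below builds the matching rows 4,5 / 5,6
  // interleave, filters two output rows, then recycles the newer pair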

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows (8 bytes each) and place every two consecutive
    // rows in the same 256 bit register
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);

    // add and saturate the results together
    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);

    // round (+32) and shift each 16-bit value right by 6
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    resReg = _mm256_packus_epi16(resReglo, resReglo);

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // save part of the registers for the next strides
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4x = srcReg6x;
  }
}

static void aom_filter_block1d8_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg32;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load the first 7 rows (8 bytes each), one src_pitch apart
  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
  srcReg32b3 =
      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg32b5 =
      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive rows in the same 256 bit register
  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
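  // the three interleaves now hold row pairs (0,1)/(1,2), (2,3)/(3,4) and
  // (4,5)/(5,6) across their two lanes; each iteration adds the (6,7)/(7,8)
  // interleave, filters two output rows, and rotates the pipeline down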

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows (8 bytes each) and pair every two consecutive
    // rows in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // round (+32) and shift each 16-bit value right by 6
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());

    src_ptr += src_stride;

    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // save part of the registers for the next strides
    srcReg32b10 = srcReg32b11;
    srcReg32b11 = srcReg32b2;
    srcReg32b2 = srcReg32b4;
    srcReg32b7 = srcReg32b9;
  }
  // if output_height is odd, process the final row (8 bytes) on its own
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
    // load the last row (8 bytes)
    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 rows together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));

    // round (+32) and shift each 16-bit value right by 6
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());

    // save 8 bytes
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
  }
}

static void aom_filter_block1d16_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
  __m256i resReglo, resReghi, resReg;
  __m256i secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) values to 8 bit (byte) values and replicate
  // the same data into both halves of the 128 bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));

  // place consecutive rows in the same 256 bit register
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
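  // the lo/hi interleaves carry pixels 0..7 and 8..15 of each row pair
  // respectively, since widening to 16-bit products doubles the register
  // space needed for a 16-pixel row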

  for (i = output_height; i > 1; i -= 2) {
    // load the next 2 rows (16 bytes each) and pair every two consecutive
    // rows in the same 256 bit register
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);

    // add and saturate the results together
    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);

    // add and saturate the results together
    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);

    // round (+32) and shift each 16-bit value right by 6
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);
    resReghi = _mm256_srai_epi16(resReghi, 6);

    // pack each 16-bit value down to 8 bits; the first lane contains the
    // first row's convolve result and the second lane the second row's
    resReg = _mm256_packus_epi16(resReglo, resReghi);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // save part of the registers for the next strides
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg23_34_hi = srcReg45_56_hi;
    srcReg4x = srcReg6x;
  }
}
1135 
aom_filter_block1d16_v8_avx2(const uint8_t * src_ptr,ptrdiff_t src_pitch,uint8_t * output_ptr,ptrdiff_t out_pitch,uint32_t output_height,const int16_t * filter)1136 static void aom_filter_block1d16_v8_avx2(
1137     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
1138     ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
1139   __m128i filtersReg;
1140   __m256i addFilterReg32;
1141   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
1142   __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
1143   __m256i srcReg32b11, srcReg32b12, filtersReg32;
1144   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
1145   unsigned int i;
1146   ptrdiff_t src_stride, dst_stride;
1147 
1148   addFilterReg32 = _mm256_set1_epi16(32);
1149   filtersReg = _mm_loadu_si128((const __m128i *)filter);
1150   // converting the 16 bit (short) to  8 bit (byte) and have the
1151   // same data in both lanes of 128 bit register.
1152   filtersReg = _mm_srai_epi16(filtersReg, 1);
1153   filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
1154   // have the same data in both lanes of a 256 bit register
1155   filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
1156 
1157   // duplicate only the first 16 bits (first and second byte)
1158   // across 256 bit register
1159   firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
1160   // duplicate only the second 16 bits (third and forth byte)
1161   // across 256 bit register
1162   secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
1163   // duplicate only the third 16 bits (fifth and sixth byte)
1164   // across 256 bit register
1165   thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
1166   // duplicate only the forth 16 bits (seventh and eighth byte)
1167   // across 256 bit register
1168   forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
1169 
1170   // multiple the size of the source and destination stride by two
1171   src_stride = src_pitch << 1;
1172   dst_stride = out_pitch << 1;
1173 
1174   // load 16 bytes 7 times in stride of src_pitch
1175   srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
1176   srcReg32b3 =
1177       xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
1178   srcReg32b5 =
1179       xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
1180   srcReg32b7 = _mm256_castsi128_si256(
1181       _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
1182 
1183   // have each consecutive loads on the same 256 register
1184   srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
1185   srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
1186   srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // saved and reused across iterations
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the next two rows of 16 bytes and place every two
    // consecutive rows in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // (kept for reuse in later iterations)
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));

    // add 32 and shift right by 6 to round each 16-bit sum to nearest
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);

    // pack each 16-bit result to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);

    output_ptr += dst_stride;

    // carry registers over for the next two rows
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last row of 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last two rows together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));

    // add 32 and shift right by 6 to round each 16-bit sum to nearest
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);

    // pack each 16-bit result to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}

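// A scalar sketch of what aom_filter_block1d16_v8_avx2 computes, for
// reference (illustrative only and excluded from the build; the function
// name is hypothetical, and unlike the AVX2 path this sketch does not
// saturate the intermediate 16-bit sums):
#if 0
static void filter_block1d16_v8_scalar_sketch(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  for (uint32_t row = 0; row < output_height; ++row) {
    for (int col = 0; col < 16; ++col) {
      int sum = 0;
      // 8 taps down one column, halved to match the filtersReg >> 1 above.
      for (int k = 0; k < 8; ++k)
        sum += (filter[k] >> 1) * src_ptr[k * src_pitch + col];
      // Round the Q6 sum to nearest and clamp to 8 bits, as the
      // add-32 / srai-6 / packus sequence does.
      sum = (sum + 32) >> 6;
      output_ptr[col] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src_ptr += src_pitch;
    output_ptr += out_pitch;
  }
}
#endif
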
static void aom_filter_block1d4_v4_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i filtersReg32, addFilterReg32;
  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
  __m256i srcReg23_34_lo, srcReg45_56_lo;
  __m256i srcReg2345_3456_lo;
  __m256i resReglo, resReg;
  __m256i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm256_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the 16-bit (short) coefficients and pack them to 8 bits (byte),
  // with the same data in both halves of the 128-bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  firstFilters =
      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
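  // The mask 0x05040302 replicates bytes 2..5 of the packed coefficients,
  // i.e. the four middle taps of the 8-tap array; this kernel applies only
  // those four taps (rows n+2..n+5), so it assumes the outer taps are zero.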

  // multiply the source and destination strides by two
  // (the loop below produces two rows per iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
  srcReg4x = _mm256_castsi128_si256(
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));

  // place consecutive rows in the same 256-bit register
  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);

  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    // load the next two rows of 8 bytes and place every two
    // consecutive rows in the same 256 bit register
    srcReg5x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
    srcReg45 =
        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);

    srcReg6x = _mm256_castsi128_si256(
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
    srcReg56 =
        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);

    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);

    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
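    // After the 16-bit interleave, each 32-bit group holds one column's
    // pixels from four consecutive rows, so maddubs leaves the two partial
    // sums (rows 2+3 and rows 4+5) in adjacent 16-bit slots; the horizontal
    // saturating add above completes the 4-tap sum per pixel.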

    // add 32 and shift right by 6 to round each 16-bit sum to nearest
    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
    resReglo = _mm256_srai_epi16(resReglo, 6);

    // pack each 16-bit result to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second
    resReg = _mm256_packus_epi16(resReglo, resReglo);

    src_ptr += src_stride;

    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);

    output_ptr += dst_stride;

    // carry registers over for the next two rows
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4x = srcReg6x;
  }
}

#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;
#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
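// FUN_CONV_1D (from aom_dsp/x86/convolve.h) expands to the wrappers
// prototyped above, dispatching to a block1d kernel by width and tap count;
// the vert variant is handed src - src_stride * 3 so the 8-tap window spans
// three rows above and four rows below each output row.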
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

#endif  // HAVE_AVX2 && HAVE_SSSE3