1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10  */
11 
12 #ifndef AOM_DSP_X86_WIENER_CONVOLVE_AVX2_H_
13 #define AOM_DSP_X86_WIENER_CONVOLVE_AVX2_H_
14 
15 #include <immintrin.h>
16 
17 #include "convolve.h"
18 #include "convolve_avx2.h"
19 #include "EbDefinitions.h"
20 #include "synonyms.h"
21 #include "synonyms_avx2.h"
22 
// pshufb control mask that extracts the center-tap source pixel of each
// 7-tap window and zero-extends it to 16 bits: each (index, 255) byte pair
// selects one source byte, and because 255 (0xFF) has its MSB set, pshufb
// writes zero into the high byte of the 16-bit lane. The same 16-byte
// pattern is repeated for every 128-bit lane.
DECLARE_ALIGNED(64, static const uint8_t, filt_center_tap7_global_avx[64]) = {
    3, 255, 4,  255, 5, 255, 6, 255, 7,  255, 8, 255, 9, 255, 10, 255, 3, 255, 4,  255, 5, 255,
    6, 255, 7,  255, 8, 255, 9, 255, 10, 255, 3, 255, 4, 255, 5,  255, 6, 255, 7,  255, 8, 255,
    9, 255, 10, 255, 3, 255, 4, 255, 5,  255, 6, 255, 7, 255, 8,  255, 9, 255, 10, 255};
27 
// Center-tap pshufb mask for the 5-tap case (indices start one byte later
// than the 7-tap mask since the window is narrower); (index, 255) pairs
// zero-extend the selected bytes to 16-bit lanes.
DECLARE_ALIGNED(64, static const uint8_t, filt_center_tap5_global_avx[64]) = {
    2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 2, 255, 3, 255, 4, 255,
    5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255,
    8, 255, 9, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255};
32 
// Center-tap pshufb mask for the 3-tap case; (index, 255) pairs zero-extend
// the selected source bytes to 16-bit lanes (255 has its MSB set, so pshufb
// produces zero for that byte).
DECLARE_ALIGNED(64, static const uint8_t, filt_center_tap3_global_avx[64]) = {
    1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 1, 255, 2, 255, 3, 255,
    4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255,
    7, 255, 8, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255};
37 
calc_zero_coef(const int16_t * const filter_x,const int16_t * const filter_y)38 static INLINE int calc_zero_coef(const int16_t* const filter_x, const int16_t* const filter_y) {
39     int cnt = 0;
40     if (!(filter_x[0] | filter_y[0])) {
41         cnt++;
42         if (!(filter_x[1] | filter_y[1])) {
43             cnt++;
44             if (!(filter_x[2] | filter_y[2])) {
45                 cnt++;
46             }
47         }
48     }
49     return cnt;
50 }
51 
wiener_clip(const __m256i s,const __m256i r,const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)52 static INLINE __m256i wiener_clip(const __m256i s, const __m256i r, const __m256i filt_center,
53                                   const __m256i round_h0, const __m256i round_h1,
54                                   const __m256i clamp_high) {
55     const int     round_0   = WIENER_ROUND0_BITS;
56     const __m256i clamp_low = _mm256_setzero_si256();
57     __m256i       res       = _mm256_srai_epi16(_mm256_add_epi16(r, round_h0), round_0);
58     __m256i       data_0    = _mm256_shuffle_epi8(s, filt_center);
59     data_0                  = _mm256_slli_epi16(data_0, FILTER_BITS - round_0);
60     res                     = _mm256_add_epi16(res, data_0);
61     res                     = _mm256_add_epi16(res, round_h1);
62     res                     = _mm256_max_epi16(res, clamp_low);
63     return _mm256_min_epi16(res, clamp_high);
64 }
65 
wiener_convolve_tap3(const __m256i s,const __m256i coeffs[2],const __m256i filt[2],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)66 SIMD_INLINE __m256i wiener_convolve_tap3(const __m256i s, const __m256i coeffs[2],
67                                          const __m256i filt[2], const __m256i filt_center,
68                                          const __m256i round_h0, const __m256i round_h1,
69                                          const __m256i clamp_high) {
70     const __m256i res = x_convolve_4tap_avx2(s, coeffs, filt);
71     return wiener_clip(s, res, filt_center, round_h0, round_h1, clamp_high);
72 }
73 
wiener_convolve_tap5(const __m256i s,const __m256i coeffs[3],const __m256i filt[3],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)74 static INLINE __m256i wiener_convolve_tap5(const __m256i s, const __m256i coeffs[3],
75                                            const __m256i filt[3], const __m256i filt_center,
76                                            const __m256i round_h0, const __m256i round_h1,
77                                            const __m256i clamp_high) {
78     const __m256i res = x_convolve_6tap_avx2(s, coeffs, filt);
79     return wiener_clip(s, res, filt_center, round_h0, round_h1, clamp_high);
80 }
81 
wiener_convolve_tap7(const __m256i s,const __m256i coeffs[4],const __m256i filt[4],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)82 static INLINE __m256i wiener_convolve_tap7(const __m256i s, const __m256i coeffs[4],
83                                            const __m256i filt[4], const __m256i filt_center,
84                                            const __m256i round_h0, const __m256i round_h1,
85                                            const __m256i clamp_high) {
86     const __m256i res = x_convolve_8tap_avx2(s, coeffs, filt);
87     return wiener_clip(s, res, filt_center, round_h0, round_h1, clamp_high);
88 }
89 
wiener_convolve_h8x2_tap3(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[2],const __m256i filt[2],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)90 static INLINE __m256i wiener_convolve_h8x2_tap3(const uint8_t* src, const ptrdiff_t stride,
91                                                 const __m256i coeffs[2], const __m256i filt[2],
92                                                 const __m256i filt_center, const __m256i round_h0,
93                                                 const __m256i round_h1, const __m256i clamp_high) {
94     const __m256i s = loadu_8bit_16x2_avx2(src, stride);
95     return wiener_convolve_tap3(s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
96 }
97 
wiener_convolve_h8x2_tap5(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)98 static INLINE __m256i wiener_convolve_h8x2_tap5(const uint8_t* src, const ptrdiff_t stride,
99                                                 const __m256i coeffs[3], const __m256i filt[3],
100                                                 const __m256i filt_center, const __m256i round_h0,
101                                                 const __m256i round_h1, const __m256i clamp_high) {
102     const __m256i s = loadu_8bit_16x2_avx2(src, stride);
103     return wiener_convolve_tap5(s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
104 }
105 
wiener_convolve_h8x2_tap7(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)106 static INLINE __m256i wiener_convolve_h8x2_tap7(const uint8_t* src, const ptrdiff_t stride,
107                                                 const __m256i coeffs[4], const __m256i filt[4],
108                                                 const __m256i filt_center, const __m256i round_h0,
109                                                 const __m256i round_h1, const __m256i clamp_high) {
110     const __m256i s = loadu_8bit_16x2_avx2(src, stride);
111     return wiener_convolve_tap7(s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
112 }
113 
wiener_convolve_h16x2_tap3(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[2],const __m256i filt[2],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high,__m256i * const dst0,__m256i * const dst1)114 SIMD_INLINE void wiener_convolve_h16x2_tap3(const uint8_t* src, const ptrdiff_t stride,
115                                             const __m256i coeffs[2], const __m256i filt[2],
116                                             const __m256i filt_center, const __m256i round_h0,
117                                             const __m256i round_h1, const __m256i clamp_high,
118                                             __m256i* const dst0, __m256i* const dst1) {
119     *dst0 = wiener_convolve_h8x2_tap3(
120         src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
121     *dst1 = wiener_convolve_h8x2_tap3(
122         src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
123 }
124 
wiener_convolve_h16x2_tap5(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high,__m256i * const dst0,__m256i * const dst1)125 SIMD_INLINE void wiener_convolve_h16x2_tap5(const uint8_t* src, const ptrdiff_t stride,
126                                             const __m256i coeffs[3], const __m256i filt[3],
127                                             const __m256i filt_center, const __m256i round_h0,
128                                             const __m256i round_h1, const __m256i clamp_high,
129                                             __m256i* const dst0, __m256i* const dst1) {
130     *dst0 = wiener_convolve_h8x2_tap5(
131         src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
132     *dst1 = wiener_convolve_h8x2_tap5(
133         src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
134 }
135 
wiener_convolve_h16x2_tap7(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high,__m256i * const dst0,__m256i * const dst1)136 SIMD_INLINE void wiener_convolve_h16x2_tap7(const uint8_t* src, const ptrdiff_t stride,
137                                             const __m256i coeffs[4], const __m256i filt[4],
138                                             const __m256i filt_center, const __m256i round_h0,
139                                             const __m256i round_h1, const __m256i clamp_high,
140                                             __m256i* const dst0, __m256i* const dst1) {
141     *dst0 = wiener_convolve_h8x2_tap7(
142         src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
143     *dst1 = wiener_convolve_h8x2_tap7(
144         src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
145 }
146 
round_store(const __m256i res0,const __m256i res1,const __m256i round_v)147 static INLINE __m256i round_store(const __m256i res0, const __m256i res1, const __m256i round_v) {
148     const int     round_1 = 2 * FILTER_BITS - WIENER_ROUND0_BITS;
149     const __m256i r0      = _mm256_srai_epi32(_mm256_add_epi32(res0, round_v), round_1);
150     const __m256i r1      = _mm256_srai_epi32(_mm256_add_epi32(res1, round_v), round_1);
151     return _mm256_packs_epi32(r0, r1);
152 }
153 
wiener_convolve_v_tap3_kernel(const __m256i coeffs[2],const __m256i round_v,const __m256i s[3])154 SIMD_INLINE __m256i wiener_convolve_v_tap3_kernel(const __m256i coeffs[2], const __m256i round_v,
155                                                   const __m256i s[3]) {
156     const __m256i s0 = _mm256_add_epi16(s[0], s[2]);
157     __m256i       ss[2];
158     ss[0]              = _mm256_unpacklo_epi16(s0, s[1]);
159     ss[1]              = _mm256_unpackhi_epi16(s0, s[1]);
160     const __m256i res0 = convolve16_2tap_avx2(&ss[0], coeffs);
161     const __m256i res1 = convolve16_2tap_avx2(&ss[1], coeffs);
162     return round_store(res0, res1, round_v);
163 }
164 
wiener_convolve_v_tap5_kernel(const __m256i coeffs[2],const __m256i round_v,const __m256i s[5])165 SIMD_INLINE __m256i wiener_convolve_v_tap5_kernel(const __m256i coeffs[2], const __m256i round_v,
166                                                   const __m256i s[5]) {
167     const __m256i s0 = _mm256_add_epi16(s[0], s[4]);
168     const __m256i s1 = _mm256_add_epi16(s[1], s[3]);
169     __m256i       ss[4];
170     ss[0]              = _mm256_unpacklo_epi16(s0, s1);
171     ss[1]              = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
172     ss[2]              = _mm256_unpackhi_epi16(s0, s1);
173     ss[3]              = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
174     const __m256i res0 = convolve16_4tap_avx2(ss + 0, coeffs);
175     const __m256i res1 = convolve16_4tap_avx2(ss + 2, coeffs);
176     return round_store(res0, res1, round_v);
177 }
178 
wiener_convolve_v_tap7_kernel(const __m256i coeffs[2],const __m256i round_v,const __m256i s[7])179 SIMD_INLINE __m256i wiener_convolve_v_tap7_kernel(const __m256i coeffs[2], const __m256i round_v,
180                                                   const __m256i s[7]) {
181     const __m256i s0 = _mm256_add_epi16(s[0], s[6]);
182     const __m256i s1 = _mm256_add_epi16(s[1], s[5]);
183     const __m256i s2 = _mm256_add_epi16(s[2], s[4]);
184     __m256i       ss[4];
185     ss[0]              = _mm256_unpacklo_epi16(s0, s1);
186     ss[1]              = _mm256_unpacklo_epi16(s2, s[3]);
187     ss[2]              = _mm256_unpackhi_epi16(s0, s1);
188     ss[3]              = _mm256_unpackhi_epi16(s2, s[3]);
189     const __m256i res0 = convolve16_4tap_avx2(ss + 0, coeffs);
190     const __m256i res1 = convolve16_4tap_avx2(ss + 2, coeffs);
191     return round_store(res0, res1, round_v);
192 }
193 
wiener_convolve_v8x2_tap3(const __m256i coeffs[2],const __m256i round_v,__m256i s[3])194 SIMD_INLINE __m256i wiener_convolve_v8x2_tap3(const __m256i coeffs[2], const __m256i round_v,
195                                               __m256i s[3]) {
196     const __m256i dst = wiener_convolve_v_tap3_kernel(coeffs, round_v, s);
197     s[0]              = s[2];
198     return dst;
199 }
200 
wiener_convolve_v8x2_tap5(const __m256i coeffs[2],const __m256i round_v,__m256i s[5])201 SIMD_INLINE __m256i wiener_convolve_v8x2_tap5(const __m256i coeffs[2], const __m256i round_v,
202                                               __m256i s[5]) {
203     const __m256i dst = wiener_convolve_v_tap5_kernel(coeffs, round_v, s);
204     s[0]              = s[2];
205     s[1]              = s[3];
206     s[2]              = s[4];
207     return dst;
208 }
209 
wiener_convolve_v8x2_tap7(const __m256i coeffs[2],const __m256i round_v,__m256i s[7])210 static INLINE __m256i wiener_convolve_v8x2_tap7(const __m256i coeffs[2], const __m256i round_v,
211                                                 __m256i s[7]) {
212     const __m256i dst = wiener_convolve_v_tap7_kernel(coeffs, round_v, s);
213     s[0]              = s[2];
214     s[1]              = s[3];
215     s[2]              = s[4];
216     s[3]              = s[5];
217     s[4]              = s[6];
218     return dst;
219 }
220 
#endif // AOM_DSP_X86_WIENER_CONVOLVE_AVX2_H_
222