/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
 */
11
12 #ifndef AOM_DSP_X86_WIENER_CONVOLVE_AVX2_H_
13 #define AOM_DSP_X86_WIENER_CONVOLVE_AVX2_H_
14
15 #include <immintrin.h>
16
17 #include "convolve.h"
18 #include "convolve_avx2.h"
19 #include "EbDefinitions.h"
20 #include "synonyms.h"
21 #include "synonyms_avx2.h"
22
// pshufb mask that extracts the pixel under the filter's center tap and
// widens it from 8 to 16 bits: even output bytes pick the source byte at the
// listed offset, odd bytes are 255 (top bit set), which pshufb zeroes.
// Offsets 3..10 presumably correspond to the center column of the 7-tap
// horizontal window for 8 consecutive output pixels — confirm against callers.
DECLARE_ALIGNED(64, static const uint8_t, filt_center_tap7_global_avx[64]) = {
    3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, 3, 255, 4, 255, 5, 255,
    6, 255, 7, 255, 8, 255, 9, 255, 10, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255,
    9, 255, 10, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255};
27
// pshufb mask for the 5-tap case: same layout as the tap7 mask (even bytes
// select a source offset, odd bytes of 255 zero the high half of each 16-bit
// lane), with center offsets 2..9 for 8 consecutive output pixels.
DECLARE_ALIGNED(64, static const uint8_t, filt_center_tap5_global_avx[64]) = {
    2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 2, 255, 3, 255, 4, 255,
    5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255,
    8, 255, 9, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255};
32
// pshufb mask for the 3-tap case: center offsets 1..8 for 8 consecutive
// output pixels; odd bytes of 255 zero-extend each selected byte to 16 bits.
DECLARE_ALIGNED(64, static const uint8_t, filt_center_tap3_global_avx[64]) = {
    1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 1, 255, 2, 255, 3, 255,
    4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255,
    7, 255, 8, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255};
37
calc_zero_coef(const int16_t * const filter_x,const int16_t * const filter_y)38 static INLINE int calc_zero_coef(const int16_t* const filter_x, const int16_t* const filter_y) {
39 int cnt = 0;
40 if (!(filter_x[0] | filter_y[0])) {
41 cnt++;
42 if (!(filter_x[1] | filter_y[1])) {
43 cnt++;
44 if (!(filter_x[2] | filter_y[2])) {
45 cnt++;
46 }
47 }
48 }
49 return cnt;
50 }
51
wiener_clip(const __m256i s,const __m256i r,const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)52 static INLINE __m256i wiener_clip(const __m256i s, const __m256i r, const __m256i filt_center,
53 const __m256i round_h0, const __m256i round_h1,
54 const __m256i clamp_high) {
55 const int round_0 = WIENER_ROUND0_BITS;
56 const __m256i clamp_low = _mm256_setzero_si256();
57 __m256i res = _mm256_srai_epi16(_mm256_add_epi16(r, round_h0), round_0);
58 __m256i data_0 = _mm256_shuffle_epi8(s, filt_center);
59 data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - round_0);
60 res = _mm256_add_epi16(res, data_0);
61 res = _mm256_add_epi16(res, round_h1);
62 res = _mm256_max_epi16(res, clamp_low);
63 return _mm256_min_epi16(res, clamp_high);
64 }
65
wiener_convolve_tap3(const __m256i s,const __m256i coeffs[2],const __m256i filt[2],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)66 SIMD_INLINE __m256i wiener_convolve_tap3(const __m256i s, const __m256i coeffs[2],
67 const __m256i filt[2], const __m256i filt_center,
68 const __m256i round_h0, const __m256i round_h1,
69 const __m256i clamp_high) {
70 const __m256i res = x_convolve_4tap_avx2(s, coeffs, filt);
71 return wiener_clip(s, res, filt_center, round_h0, round_h1, clamp_high);
72 }
73
wiener_convolve_tap5(const __m256i s,const __m256i coeffs[3],const __m256i filt[3],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)74 static INLINE __m256i wiener_convolve_tap5(const __m256i s, const __m256i coeffs[3],
75 const __m256i filt[3], const __m256i filt_center,
76 const __m256i round_h0, const __m256i round_h1,
77 const __m256i clamp_high) {
78 const __m256i res = x_convolve_6tap_avx2(s, coeffs, filt);
79 return wiener_clip(s, res, filt_center, round_h0, round_h1, clamp_high);
80 }
81
wiener_convolve_tap7(const __m256i s,const __m256i coeffs[4],const __m256i filt[4],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)82 static INLINE __m256i wiener_convolve_tap7(const __m256i s, const __m256i coeffs[4],
83 const __m256i filt[4], const __m256i filt_center,
84 const __m256i round_h0, const __m256i round_h1,
85 const __m256i clamp_high) {
86 const __m256i res = x_convolve_8tap_avx2(s, coeffs, filt);
87 return wiener_clip(s, res, filt_center, round_h0, round_h1, clamp_high);
88 }
89
wiener_convolve_h8x2_tap3(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[2],const __m256i filt[2],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)90 static INLINE __m256i wiener_convolve_h8x2_tap3(const uint8_t* src, const ptrdiff_t stride,
91 const __m256i coeffs[2], const __m256i filt[2],
92 const __m256i filt_center, const __m256i round_h0,
93 const __m256i round_h1, const __m256i clamp_high) {
94 const __m256i s = loadu_8bit_16x2_avx2(src, stride);
95 return wiener_convolve_tap3(s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
96 }
97
wiener_convolve_h8x2_tap5(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)98 static INLINE __m256i wiener_convolve_h8x2_tap5(const uint8_t* src, const ptrdiff_t stride,
99 const __m256i coeffs[3], const __m256i filt[3],
100 const __m256i filt_center, const __m256i round_h0,
101 const __m256i round_h1, const __m256i clamp_high) {
102 const __m256i s = loadu_8bit_16x2_avx2(src, stride);
103 return wiener_convolve_tap5(s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
104 }
105
wiener_convolve_h8x2_tap7(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high)106 static INLINE __m256i wiener_convolve_h8x2_tap7(const uint8_t* src, const ptrdiff_t stride,
107 const __m256i coeffs[4], const __m256i filt[4],
108 const __m256i filt_center, const __m256i round_h0,
109 const __m256i round_h1, const __m256i clamp_high) {
110 const __m256i s = loadu_8bit_16x2_avx2(src, stride);
111 return wiener_convolve_tap7(s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
112 }
113
wiener_convolve_h16x2_tap3(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[2],const __m256i filt[2],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high,__m256i * const dst0,__m256i * const dst1)114 SIMD_INLINE void wiener_convolve_h16x2_tap3(const uint8_t* src, const ptrdiff_t stride,
115 const __m256i coeffs[2], const __m256i filt[2],
116 const __m256i filt_center, const __m256i round_h0,
117 const __m256i round_h1, const __m256i clamp_high,
118 __m256i* const dst0, __m256i* const dst1) {
119 *dst0 = wiener_convolve_h8x2_tap3(
120 src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
121 *dst1 = wiener_convolve_h8x2_tap3(
122 src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
123 }
124
wiener_convolve_h16x2_tap5(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high,__m256i * const dst0,__m256i * const dst1)125 SIMD_INLINE void wiener_convolve_h16x2_tap5(const uint8_t* src, const ptrdiff_t stride,
126 const __m256i coeffs[3], const __m256i filt[3],
127 const __m256i filt_center, const __m256i round_h0,
128 const __m256i round_h1, const __m256i clamp_high,
129 __m256i* const dst0, __m256i* const dst1) {
130 *dst0 = wiener_convolve_h8x2_tap5(
131 src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
132 *dst1 = wiener_convolve_h8x2_tap5(
133 src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
134 }
135
wiener_convolve_h16x2_tap7(const uint8_t * src,const ptrdiff_t stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i filt_center,const __m256i round_h0,const __m256i round_h1,const __m256i clamp_high,__m256i * const dst0,__m256i * const dst1)136 SIMD_INLINE void wiener_convolve_h16x2_tap7(const uint8_t* src, const ptrdiff_t stride,
137 const __m256i coeffs[4], const __m256i filt[4],
138 const __m256i filt_center, const __m256i round_h0,
139 const __m256i round_h1, const __m256i clamp_high,
140 __m256i* const dst0, __m256i* const dst1) {
141 *dst0 = wiener_convolve_h8x2_tap7(
142 src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
143 *dst1 = wiener_convolve_h8x2_tap7(
144 src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
145 }
146
round_store(const __m256i res0,const __m256i res1,const __m256i round_v)147 static INLINE __m256i round_store(const __m256i res0, const __m256i res1, const __m256i round_v) {
148 const int round_1 = 2 * FILTER_BITS - WIENER_ROUND0_BITS;
149 const __m256i r0 = _mm256_srai_epi32(_mm256_add_epi32(res0, round_v), round_1);
150 const __m256i r1 = _mm256_srai_epi32(_mm256_add_epi32(res1, round_v), round_1);
151 return _mm256_packs_epi32(r0, r1);
152 }
153
wiener_convolve_v_tap3_kernel(const __m256i coeffs[2],const __m256i round_v,const __m256i s[3])154 SIMD_INLINE __m256i wiener_convolve_v_tap3_kernel(const __m256i coeffs[2], const __m256i round_v,
155 const __m256i s[3]) {
156 const __m256i s0 = _mm256_add_epi16(s[0], s[2]);
157 __m256i ss[2];
158 ss[0] = _mm256_unpacklo_epi16(s0, s[1]);
159 ss[1] = _mm256_unpackhi_epi16(s0, s[1]);
160 const __m256i res0 = convolve16_2tap_avx2(&ss[0], coeffs);
161 const __m256i res1 = convolve16_2tap_avx2(&ss[1], coeffs);
162 return round_store(res0, res1, round_v);
163 }
164
wiener_convolve_v_tap5_kernel(const __m256i coeffs[2],const __m256i round_v,const __m256i s[5])165 SIMD_INLINE __m256i wiener_convolve_v_tap5_kernel(const __m256i coeffs[2], const __m256i round_v,
166 const __m256i s[5]) {
167 const __m256i s0 = _mm256_add_epi16(s[0], s[4]);
168 const __m256i s1 = _mm256_add_epi16(s[1], s[3]);
169 __m256i ss[4];
170 ss[0] = _mm256_unpacklo_epi16(s0, s1);
171 ss[1] = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
172 ss[2] = _mm256_unpackhi_epi16(s0, s1);
173 ss[3] = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
174 const __m256i res0 = convolve16_4tap_avx2(ss + 0, coeffs);
175 const __m256i res1 = convolve16_4tap_avx2(ss + 2, coeffs);
176 return round_store(res0, res1, round_v);
177 }
178
wiener_convolve_v_tap7_kernel(const __m256i coeffs[2],const __m256i round_v,const __m256i s[7])179 SIMD_INLINE __m256i wiener_convolve_v_tap7_kernel(const __m256i coeffs[2], const __m256i round_v,
180 const __m256i s[7]) {
181 const __m256i s0 = _mm256_add_epi16(s[0], s[6]);
182 const __m256i s1 = _mm256_add_epi16(s[1], s[5]);
183 const __m256i s2 = _mm256_add_epi16(s[2], s[4]);
184 __m256i ss[4];
185 ss[0] = _mm256_unpacklo_epi16(s0, s1);
186 ss[1] = _mm256_unpacklo_epi16(s2, s[3]);
187 ss[2] = _mm256_unpackhi_epi16(s0, s1);
188 ss[3] = _mm256_unpackhi_epi16(s2, s[3]);
189 const __m256i res0 = convolve16_4tap_avx2(ss + 0, coeffs);
190 const __m256i res1 = convolve16_4tap_avx2(ss + 2, coeffs);
191 return round_store(res0, res1, round_v);
192 }
193
wiener_convolve_v8x2_tap3(const __m256i coeffs[2],const __m256i round_v,__m256i s[3])194 SIMD_INLINE __m256i wiener_convolve_v8x2_tap3(const __m256i coeffs[2], const __m256i round_v,
195 __m256i s[3]) {
196 const __m256i dst = wiener_convolve_v_tap3_kernel(coeffs, round_v, s);
197 s[0] = s[2];
198 return dst;
199 }
200
wiener_convolve_v8x2_tap5(const __m256i coeffs[2],const __m256i round_v,__m256i s[5])201 SIMD_INLINE __m256i wiener_convolve_v8x2_tap5(const __m256i coeffs[2], const __m256i round_v,
202 __m256i s[5]) {
203 const __m256i dst = wiener_convolve_v_tap5_kernel(coeffs, round_v, s);
204 s[0] = s[2];
205 s[1] = s[3];
206 s[2] = s[4];
207 return dst;
208 }
209
wiener_convolve_v8x2_tap7(const __m256i coeffs[2],const __m256i round_v,__m256i s[7])210 static INLINE __m256i wiener_convolve_v8x2_tap7(const __m256i coeffs[2], const __m256i round_v,
211 __m256i s[7]) {
212 const __m256i dst = wiener_convolve_v_tap7_kernel(coeffs, round_v, s);
213 s[0] = s[2];
214 s[1] = s[3];
215 s[2] = s[4];
216 s[3] = s[5];
217 s[4] = s[6];
218 return dst;
219 }
220
221 #endif // !AOM_DSP_X86_WIENER_CONVOLVE_AVX2_H_
222