/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * https://www.aomedia.org/license/patent-license.
 */

#include <assert.h>
#include <immintrin.h>
#include "common_dsp_rtcd.h"
#include "EbBitstreamUnit.h"
#include "EbCdef.h"
#include "EbDefinitions.h"
#include "EbMemory_AVX2.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const_var. */
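/* Note on the implementation below: _mm256_madd_epi16(v, v) returns
   a0*a0 + a1*a1 for each pair of adjacent 16-bit lanes, so once the shuffles
   have placed each x_i next to its matching y_i, every 32-bit lane holds
   x_i^2 + y_i^2, which the final _mm256_mullo_epi32 then weights by C_i. */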
static INLINE __m256i fold_mul_and_sum(__m256i partial, __m256i const_var) {
    partial = _mm256_shuffle_epi8(partial,
                                  _mm256_set_epi32(0x0f0e0100,
                                                   0x03020504,
                                                   0x07060908,
                                                   0x0b0a0d0c,
                                                   0x0f0e0d0c,
                                                   0x0b0a0908,
                                                   0x07060504,
                                                   0x03020100));
    partial = _mm256_permutevar8x32_epi32(partial, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
    partial = _mm256_shuffle_epi8(partial,
                                  _mm256_set_epi32(0x0f0e0b0a,
                                                   0x0d0c0908,
                                                   0x07060302,
                                                   0x05040100,
                                                   0x0f0e0b0a,
                                                   0x0d0c0908,
                                                   0x07060302,
                                                   0x05040100));
    partial = _mm256_madd_epi16(partial, partial);
    partial = _mm256_mullo_epi32(partial, const_var);
    return partial;
}

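/* Transpose-and-add: returns the four horizontal sums
   { sum32(x0), sum32(x1), sum32(x2), sum32(x3) } as the four 32-bit lanes of
   the result. */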
static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
    __m128i t0, t1, t2, t3;
    t0 = _mm_unpacklo_epi32(x0, x1);
    t1 = _mm_unpacklo_epi32(x2, x3);
    t2 = _mm_unpackhi_epi32(x0, x1);
    t3 = _mm_unpackhi_epi32(x2, x3);
    x0 = _mm_unpacklo_epi64(t0, t1);
    x1 = _mm_unpackhi_epi64(t0, t1);
    x2 = _mm_unpacklo_epi64(t2, t3);
    x3 = _mm_unpackhi_epi64(t2, t3);
    return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE void compute_directions(__m128i lines[8], int32_t tmp_cost1[4]) {
    __m128i partial6;
    __m128i tmp;

    __m256i partial4;
    __m256i partial5;
    __m256i partial7;
    __m256i tmp_avx2;
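    /* Each partial sum accumulates the pixels that lie on one line of a given
       direction. For the diagonal-like directions the lines span more than 8
       columns, so every row is shifted before it is added and the running sums
       are spread across the two 128-bit halves of a 256-bit register.
       partial6 needs no shifting: it is simply the per-column sum of all
       eight rows. */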
    /* Partial sums for lines 0 and 1. */
    partial4 = _mm256_setr_m128i(_mm_slli_si128(lines[0], 14), _mm_srli_si128(lines[0], 2));
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[1], 12), _mm_srli_si128(lines[1], 4));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[0], lines[1]);
    partial5 = _mm256_setr_m128i(_mm_slli_si128(tmp, 10), _mm_srli_si128(tmp, 6));
    partial7 = _mm256_setr_m128i(_mm_slli_si128(tmp, 4), _mm_srli_si128(tmp, 12));
    partial6 = tmp;

    /* Partial sums for lines 2 and 3. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[2], 10), _mm_srli_si128(lines[2], 6));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[3], 8), _mm_srli_si128(lines[3], 8));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[2], lines[3]);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 8), _mm_srli_si128(tmp, 8));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 6), _mm_srli_si128(tmp, 10));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Partial sums for lines 4 and 5. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[4], 6), _mm_srli_si128(lines[4], 10));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[5], 4), _mm_srli_si128(lines[5], 12));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[4], lines[5]);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 6), _mm_srli_si128(tmp, 10));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 8), _mm_srli_si128(tmp, 8));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Partial sums for lines 6 and 7. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[6], 2), _mm_srli_si128(lines[6], 14));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_insertf128_si256(_mm256_setzero_si256(), lines[7], 0x0);
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp = _mm_add_epi16(lines[6], lines[7]);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 4), _mm_srli_si128(tmp, 12));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 10), _mm_srli_si128(tmp, 6));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Compute costs in terms of partial sums. */
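    /* The weights are 840 / n, where n is the number of pixels on the line
       being summed: 840, 420, 280, 210, 168, 140, 120, 105 for lines of
       length 1..8. partial6 uses the single weight 105 because every column
       contains exactly 8 pixels. */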
    partial4 = fold_mul_and_sum(partial4, _mm256_set_epi32(105, 120, 140, 168, 210, 280, 420, 840));
    partial7 = fold_mul_and_sum(partial7, _mm256_set_epi32(105, 105, 105, 140, 210, 420, 0, 0));
    partial5 = fold_mul_and_sum(partial5, _mm256_set_epi32(105, 105, 105, 140, 210, 420, 0, 0));
    partial6 = _mm_madd_epi16(partial6, partial6);
    partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
    __m128i a, b, c;
    a = _mm_add_epi32(_mm256_castsi256_si128(partial4), _mm256_extracti128_si256(partial4, 1));
    b = _mm_add_epi32(_mm256_castsi256_si128(partial5), _mm256_extracti128_si256(partial5, 1));
    c = _mm_add_epi32(_mm256_castsi256_si128(partial7), _mm256_extracti128_si256(partial7, 1));

    _mm_storeu_si128((__m128i *)tmp_cost1, hsum4(a, b, partial6, c));
}

/* Transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
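/* Concretely, res[r][c] = in[c][7 - r]. */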
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
    const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
    const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
    const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
    const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
    const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
    const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
    const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
    const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

    res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
    res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
    res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
    res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
    res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
    res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
    res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
    res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

int32_t svt_cdef_find_dir_avx2(const uint16_t *img, int32_t stride, int32_t *var,
                               int32_t coeff_shift) {
    int32_t i;
    int32_t cost[8];
    int32_t best_cost = 0;
    int32_t best_dir = 0;
    __m128i lines[8];
    __m128i const_128 = _mm_set1_epi16(128);
    for (i = 0; i < 8; i++) {
        lines[i] = _mm_lddqu_si128((__m128i *)&img[i * stride]);
        lines[i] = _mm_sub_epi16(_mm_sra_epi16(lines[i], _mm_cvtsi32_si128(coeff_shift)),
                                 const_128);
    }
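    /* Each row has been shifted down to an 8-bit range and re-centered around
       zero (the 128 subtraction) before the directional sums are computed. */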

    /* Compute "mostly vertical" directions. */
    compute_directions(lines, cost + 4);

    array_reverse_transpose_8x8(lines, lines);

    /* Compute "mostly horizontal" directions. */
    compute_directions(lines, cost);

    for (i = 0; i < 8; i++) {
        if (cost[i] > best_cost) {
            best_cost = cost[i];
            best_dir = i;
        }
    }

    /* Difference between the optimal variance and the variance along the
       orthogonal direction. Again, the sum(x^2) terms cancel out. */
    *var = best_cost - cost[(best_dir + 4) & 7];
    /* We'd normally divide by 840, but dividing by 1024 is close enough
       for what we're going to do with this. */
    *var >>= 10;
    return best_dir;
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
static INLINE __m256i constrain16(const __m256i in0, const __m256i in1, const __m256i threshold,
                                  const uint32_t adjdamp) {
    const __m256i diff = _mm256_sub_epi16(in0, in1);
    const __m256i sign = _mm256_srai_epi16(diff, 15);
    const __m256i a = _mm256_abs_epi16(diff);
    const __m256i l = _mm256_srl_epi16(a, _mm_cvtsi32_si128(adjdamp));
    const __m256i s = _mm256_subs_epu16(threshold, l);
    const __m256i m = _mm256_min_epi16(a, s);
    const __m256i d = _mm256_add_epi16(sign, m);
    return _mm256_xor_si256(d, sign);
}
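/* Scalar sketch of the sequence above, per 16-bit lane (assuming
   0 <= adjdamp < 16):
     diff  = in0 - in1;
     clamp = max(0, threshold - (abs(diff) >> adjdamp));  // subs_epu16 saturates at 0
     out   = sign(diff) * min(abs(diff), clamp);
   The trailing add/xor with the sign mask applies the multiplication by
   sign(diff). */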

static void svt_cdef_filter_block_4x4_8_avx2(uint8_t *dst, int32_t dstride, const uint16_t *in,
                                             int32_t pri_strength, int32_t sec_strength,
                                             int32_t dir, int32_t pri_damping, int32_t sec_damping,
                                             int32_t coeff_shift) {
    __m256i p0, p1, p2, p3, sum, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    int32_t po1 = eb_cdef_directions[dir][0];
    int32_t po2 = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];

    const int32_t *pri_taps = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
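    /* get_msb(x) is floor(log2(x)), so the damping (the shift applied inside
       constrain16) is reduced as the strength grows; see the CDEF constraint
       function in the AV1 spec. */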

    sum = _mm256_setzero_si256();
    row = _mm256_set_epi64x(*(uint64_t *)(in),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE));
    min = max = row;

    if (pri_strength) {
        // Primary near taps
        p0 = _mm256_set_epi64x(*(uint64_t *)(in + po1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po1));
        p1 = _mm256_set_epi64x(*(uint64_t *)(in - po1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po1));

        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
        p0 = constrain16(p0, row, pri_strength_256, pri_damping);
        p1 = constrain16(p1, row, pri_strength_256, pri_damping);

        // sum += pri_taps[0] * (p0 + p1)
        sum = _mm256_add_epi16(
            sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[0]), _mm256_add_epi16(p0, p1)));

        // Primary far taps
        p0 = _mm256_set_epi64x(*(uint64_t *)(in + po2),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po2),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po2),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po2));
        p1 = _mm256_set_epi64x(*(uint64_t *)(in - po2),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po2),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po2),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po2));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
        p0 = constrain16(p0, row, pri_strength_256, pri_damping);
        p1 = constrain16(p1, row, pri_strength_256, pri_damping);

        // sum += pri_taps[1] * (p0 + p1)
        sum = _mm256_add_epi16(
            sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[1]), _mm256_add_epi16(p0, p1)));
    }

    if (sec_strength) {
        // Secondary near taps
        p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o1));
        p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o1));
        p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o1));
        p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
        min = _mm256_min_epi16(
            _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
        p0 = constrain16(p0, row, sec_strength_256, sec_damping);
        p1 = constrain16(p1, row, sec_strength_256, sec_damping);
        p2 = constrain16(p2, row, sec_strength_256, sec_damping);
        p3 = constrain16(p3, row, sec_strength_256, sec_damping);

        // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
        sum = _mm256_add_epi16(sum,
                               _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[0]),
                                                  _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                   _mm256_add_epi16(p2, p3))));

        // Secondary far taps
        p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o2),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o2),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o2),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o2));
        p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o2),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o2),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o2),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o2));
        p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o2),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o2),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o2),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o2));
        p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o2),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o2),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o2),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o2));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
        min = _mm256_min_epi16(
            _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
        p0 = constrain16(p0, row, sec_strength_256, sec_damping);
        p1 = constrain16(p1, row, sec_strength_256, sec_damping);
        p2 = constrain16(p2, row, sec_strength_256, sec_damping);
        p3 = constrain16(p3, row, sec_strength_256, sec_damping);

        // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
        sum = _mm256_add_epi16(sum,
                               _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[1]),
                                                  _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                   _mm256_add_epi16(p2, p3))));
    }

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
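    // _mm256_cmpgt_epi16(0, sum) is -1 in lanes where sum < 0, which supplies
    // the "- (sum < 0)" term so positive and negative sums round symmetrically.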
    res = _mm256_add_epi16(sum, _mm256_set1_epi16(8));
    res = _mm256_srai_epi16(res, 4);
    res = _mm256_add_epi16(row, res);
    res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
    res = _mm256_packus_epi16(res, res);

    *(int32_t *)(dst + 0 * dstride) = _mm256_extract_epi32(res, 5);
    *(int32_t *)(dst + 1 * dstride) = _mm256_extract_epi32(res, 4);
    *(int32_t *)(dst + 2 * dstride) = _mm256_extract_epi32(res, 1);
    *(int32_t *)(dst + 3 * dstride) = _mm256_cvtsi256_si32(res);
}

static void svt_cdef_filter_block_8x8_8_avx2(uint8_t *dst, int32_t dstride, const uint16_t *in,
                                             int32_t pri_strength, int32_t sec_strength,
                                             int32_t dir, int32_t pri_damping, int32_t sec_damping,
                                             int32_t coeff_shift) {
    int32_t i;
    __m256i sum, p0, p1, p2, p3, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    int32_t po1 = eb_cdef_directions[dir][0];
    int32_t po2 = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
    // SSE CHKN
    const int32_t *pri_taps = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i pri_taps_0 = _mm256_set1_epi16(pri_taps[0]);
    __m256i pri_taps_1 = _mm256_set1_epi16(pri_taps[1]);
    __m256i sec_taps_0 = _mm256_set1_epi16(sec_taps[0]);
    __m256i sec_taps_1 = _mm256_set1_epi16(sec_taps[1]);
    __m256i duplicate_8 = _mm256_set1_epi16(8);
    __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    for (i = 0; i < 8; i += 2) {
        sum = _mm256_setzero_si256();
        row = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE)),
                                _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE)));

        min = max = row;
        if (pri_strength) {
            // Primary near taps
            p0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + po1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + po1)));
            p1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - po1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - po1)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
            p0 = constrain16(p0, row, pri_strength_256, pri_damping);
            p1 = constrain16(p1, row, pri_strength_256, pri_damping);

            // sum += pri_taps[0] * (p0 + p1)
            sum = _mm256_add_epi16(sum, _mm256_mullo_epi16(pri_taps_0, _mm256_add_epi16(p0, p1)));

            // Primary far taps
            p0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + po2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + po2)));
            p1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - po2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - po2)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
            p0 = constrain16(p0, row, pri_strength_256, pri_damping);
            p1 = constrain16(p1, row, pri_strength_256, pri_damping);

            // sum += pri_taps[1] * (p0 + p1)
            sum = _mm256_add_epi16(sum, _mm256_mullo_epi16(pri_taps_1, _mm256_add_epi16(p0, p1)));
        }

        if (sec_strength) {
            // Secondary near taps
            p0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s1o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s1o1)));
            p1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s1o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s1o1)));
            p2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s2o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s2o1)));
            p3 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s2o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s2o1)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
            min = _mm256_min_epi16(
                _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
            p0 = constrain16(p0, row, sec_strength_256, sec_damping);
            p1 = constrain16(p1, row, sec_strength_256, sec_damping);
            p2 = constrain16(p2, row, sec_strength_256, sec_damping);
            p3 = constrain16(p3, row, sec_strength_256, sec_damping);

            // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
            sum = _mm256_add_epi16(sum,
                                   _mm256_mullo_epi16(sec_taps_0,
                                                      _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                       _mm256_add_epi16(p2, p3))));

            // Secondary far taps
            p0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s1o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s1o2)));
            p1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s1o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s1o2)));
            p2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s2o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s2o2)));
            p3 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s2o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s2o2)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
            min = _mm256_min_epi16(
                _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
            p0 = constrain16(p0, row, sec_strength_256, sec_damping);
            p1 = constrain16(p1, row, sec_strength_256, sec_damping);
            p2 = constrain16(p2, row, sec_strength_256, sec_damping);
            p3 = constrain16(p3, row, sec_strength_256, sec_damping);

            // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
            sum = _mm256_add_epi16(sum,
                                   _mm256_mullo_epi16(sec_taps_1,
                                                      _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                       _mm256_add_epi16(p2, p3))));
        }

        // res = row + ((sum - (sum < 0) + 8) >> 4)
        sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
        res = _mm256_add_epi16(sum, duplicate_8);
        res = _mm256_srai_epi16(res, 4);
        res = _mm256_add_epi16(row, res);
        res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
        res = _mm256_packus_epi16(res, res);
        *(int64_t *)(dst + i * dstride) = _mm256_extract_epi64(res, 2);
        *(int64_t *)(dst + (i + 1) * dstride) = _mm256_extract_epi64(res, 0);
    }
}

static void svt_cdef_filter_block_4x4_16_avx2(uint16_t *dst, int32_t dstride, const uint16_t *in,
                                              int32_t pri_strength, int32_t sec_strength,
                                              int32_t dir, int32_t pri_damping, int32_t sec_damping,
                                              int32_t coeff_shift) {
    __m256i p0, p1, p2, p3, sum, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    int32_t po1 = eb_cdef_directions[dir][0];
    int32_t po2 = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];

    const int32_t *pri_taps = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
    sum = _mm256_setzero_si256();
    row = _mm256_set_epi64x(*(uint64_t *)(in),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE));
    min = max = row;

    // Primary near taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + po1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po1));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - po1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po1));

    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength_256, pri_damping);
    p1 = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = _mm256_add_epi16(
        sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[0]), _mm256_add_epi16(p0, p1)));

    // Primary far taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + po2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po2));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - po2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po2));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
    p0 = constrain16(p0, row, pri_strength_256, pri_damping);
    p1 = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = _mm256_add_epi16(
        sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[1]), _mm256_add_epi16(p0, p1)));

    // Secondary near taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o1));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o1));
    p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o1));
    p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    min = _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
                           p3);
    p0 = constrain16(p0, row, sec_strength_256, sec_damping);
    p1 = constrain16(p1, row, sec_strength_256, sec_damping);
    p2 = constrain16(p2, row, sec_strength_256, sec_damping);
    p3 = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    sum = _mm256_add_epi16(
        sum,
        _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[0]),
                           _mm256_add_epi16(_mm256_add_epi16(p0, p1), _mm256_add_epi16(p2, p3))));

    // Secondary far taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + s1o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o2));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - s1o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o2));
    p2 = _mm256_set_epi64x(*(uint64_t *)(in + s2o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o2));
    p3 = _mm256_set_epi64x(*(uint64_t *)(in - s2o2),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o2),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o2),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o2));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    min = _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
                           p3);
    p0 = constrain16(p0, row, sec_strength_256, sec_damping);
    p1 = constrain16(p1, row, sec_strength_256, sec_damping);
    p2 = constrain16(p2, row, sec_strength_256, sec_damping);
    p3 = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    sum = _mm256_add_epi16(
        sum,
        _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[1]),
                           _mm256_add_epi16(_mm256_add_epi16(p0, p1), _mm256_add_epi16(p2, p3))));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
    res = _mm256_add_epi16(sum, _mm256_set1_epi16(8));
    res = _mm256_srai_epi16(res, 4);
    res = _mm256_add_epi16(row, res);
    res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);

    *(uint64_t *)(dst) = _mm256_extract_epi64(res, 3);
    *(uint64_t *)(dst + 1 * dstride) = _mm256_extract_epi64(res, 2);
    *(uint64_t *)(dst + 2 * dstride) = _mm256_extract_epi64(res, 1);
    *(uint64_t *)(dst + 3 * dstride) = _mm256_extract_epi64(res, 0);
}

static INLINE void cdef_filter_block_8x8_16_pri_avx2(const uint16_t *const in,
                                                     const int32_t pri_damping, const int32_t po,
                                                     const __m256i row,
                                                     const __m256i pri_strength_256,
                                                     const __m256i pri_taps, __m256i *const max,
                                                     __m256i *const min, __m256i *const sum) {
    const __m256i large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    const __m256i p0 = loadu_u16_8x2_avx2(in + po, CDEF_BSTRIDE);
    const __m256i p1 = loadu_u16_8x2_avx2(in - po, CDEF_BSTRIDE);

    *max = _mm256_max_epi16(
        _mm256_max_epi16(*max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    *min = _mm256_min_epi16(_mm256_min_epi16(*min, p0), p1);

    const __m256i q0 = constrain16(p0, row, pri_strength_256, pri_damping);
    const __m256i q1 = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps * (q0 + q1)
    *sum = _mm256_add_epi16(*sum, _mm256_mullo_epi16(pri_taps, _mm256_add_epi16(q0, q1)));
}

static INLINE void cdef_filter_block_8x8_16_sec_avx2(const uint16_t *const in,
                                                     const int32_t sec_damping, const int32_t so1,
                                                     const int32_t so2, const __m256i row,
                                                     const __m256i sec_strength_256,
                                                     const __m256i sec_taps, __m256i *const max,
                                                     __m256i *const min, __m256i *const sum) {
    const __m256i large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    const __m256i p0 = loadu_u16_8x2_avx2(in + so1, CDEF_BSTRIDE);
    const __m256i p1 = loadu_u16_8x2_avx2(in - so1, CDEF_BSTRIDE);
    const __m256i p2 = loadu_u16_8x2_avx2(in + so2, CDEF_BSTRIDE);
    const __m256i p3 = loadu_u16_8x2_avx2(in - so2, CDEF_BSTRIDE);

    *max = _mm256_max_epi16(
        _mm256_max_epi16(*max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    *max = _mm256_max_epi16(
        _mm256_max_epi16(*max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    *min = _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(*min, p0), p1), p2),
                            p3);

    const __m256i q0 = constrain16(p0, row, sec_strength_256, sec_damping);
    const __m256i q1 = constrain16(p1, row, sec_strength_256, sec_damping);
    const __m256i q2 = constrain16(p2, row, sec_strength_256, sec_damping);
    const __m256i q3 = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps * (q0 + q1 + q2 + q3)
    *sum = _mm256_add_epi16(
        *sum,
        _mm256_mullo_epi16(sec_taps,
                           _mm256_add_epi16(_mm256_add_epi16(q0, q1), _mm256_add_epi16(q2, q3))));
}

void svt_cdef_filter_block_8x8_16_avx2(const uint16_t *const in, const int32_t pri_strength,
                                       const int32_t sec_strength, const int32_t dir,
                                       int32_t pri_damping, int32_t sec_damping,
                                       const int32_t coeff_shift, uint16_t *const dst,
                                       const int32_t dstride) {
    const int32_t po1 = eb_cdef_directions[dir][0];
    const int32_t po2 = eb_cdef_directions[dir][1];
    const int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    const int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    const int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    const int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
    // SSE CHKN
    const int32_t *pri_taps = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    int32_t i;
    const __m256i pri_taps_0 = _mm256_set1_epi16(pri_taps[0]);
    const __m256i pri_taps_1 = _mm256_set1_epi16(pri_taps[1]);
    const __m256i sec_taps_0 = _mm256_set1_epi16(sec_taps[0]);
    const __m256i sec_taps_1 = _mm256_set1_epi16(sec_taps[1]);
    const __m256i duplicate_8 = _mm256_set1_epi16(8);
    const __m256i pri_strength_256 = _mm256_set1_epi16(pri_strength);
    const __m256i sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    for (i = 0; i < 8; i += 2) {
        const __m256i row = loadu_u16_8x2_avx2(in + i * CDEF_BSTRIDE, CDEF_BSTRIDE);
        __m256i sum, res, max, min;

        min = max = row;
        sum = _mm256_setzero_si256();

        // Primary near taps
        cdef_filter_block_8x8_16_pri_avx2(in + i * CDEF_BSTRIDE,
                                          pri_damping,
                                          po1,
                                          row,
                                          pri_strength_256,
                                          pri_taps_0,
                                          &max,
                                          &min,
                                          &sum);

        // Primary far taps
        cdef_filter_block_8x8_16_pri_avx2(in + i * CDEF_BSTRIDE,
                                          pri_damping,
                                          po2,
                                          row,
                                          pri_strength_256,
                                          pri_taps_1,
                                          &max,
                                          &min,
                                          &sum);

        // Secondary near taps
        cdef_filter_block_8x8_16_sec_avx2(in + i * CDEF_BSTRIDE,
                                          sec_damping,
                                          s1o1,
                                          s2o1,
                                          row,
                                          sec_strength_256,
                                          sec_taps_0,
                                          &max,
                                          &min,
                                          &sum);

        // Secondary far taps
        cdef_filter_block_8x8_16_sec_avx2(in + i * CDEF_BSTRIDE,
                                          sec_damping,
                                          s1o2,
                                          s2o2,
                                          row,
                                          sec_strength_256,
                                          sec_taps_1,
                                          &max,
                                          &min,
                                          &sum);

        // res = row + ((sum - (sum < 0) + 8) >> 4)
        sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
        res = _mm256_add_epi16(sum, duplicate_8);
        res = _mm256_srai_epi16(res, 4);
        res = _mm256_add_epi16(row, res);
        res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
        _mm_storeu_si128((__m128i *)&dst[i * dstride], _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride], _mm256_extracti128_si256(res, 1));
    }
}

void svt_cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int32_t dstride, const uint16_t *in,
                                int32_t pri_strength, int32_t sec_strength, int32_t dir,
                                int32_t pri_damping, int32_t sec_damping, int32_t bsize,
                                int32_t coeff_shift) {
    if (dst8) {
        if (bsize == BLOCK_8X8) {
            svt_cdef_filter_block_8x8_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        } else if (bsize == BLOCK_4X8) {
            svt_cdef_filter_block_4x4_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
            svt_cdef_filter_block_4x4_8_avx2(dst8 + 4 * dstride,
                                             dstride,
                                             in + 4 * CDEF_BSTRIDE,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        } else if (bsize == BLOCK_8X4) {
            svt_cdef_filter_block_4x4_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
            svt_cdef_filter_block_4x4_8_avx2(dst8 + 4,
                                             dstride,
                                             in + 4,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        } else {
            svt_cdef_filter_block_4x4_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        }
    } else {
        if (bsize == BLOCK_8X8) {
            svt_cdef_filter_block_8x8_16(in,
                                         pri_strength,
                                         sec_strength,
                                         dir,
                                         pri_damping,
                                         sec_damping,
                                         coeff_shift,
                                         dst16,
                                         dstride);
        } else if (bsize == BLOCK_4X8) {
            svt_cdef_filter_block_4x4_16_avx2(dst16,
                                              dstride,
                                              in,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
            svt_cdef_filter_block_4x4_16_avx2(dst16 + 4 * dstride,
                                              dstride,
                                              in + 4 * CDEF_BSTRIDE,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
        } else if (bsize == BLOCK_8X4) {
            svt_cdef_filter_block_4x4_16_avx2(dst16,
                                              dstride,
                                              in,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
            svt_cdef_filter_block_4x4_16_avx2(dst16 + 4,
                                              dstride,
                                              in + 4,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
        } else {
            assert(bsize == BLOCK_4X4);
            svt_cdef_filter_block_4x4_16_avx2(dst16,
                                              dstride,
                                              in,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
        }
    }
}

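/* Copies a v x h block of 8-bit pixels into a 16-bit destination,
   zero-extending each pixel. The vector loop widens 8 pixels per iteration;
   the scalar tail handles widths that are not a multiple of 8. */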
void svt_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int32_t dstride, const uint8_t *src,
                                       int32_t sstride, int32_t v, int32_t h) {
    int32_t i, j;
    for (i = 0; i < v; i++) {
        for (j = 0; j < (h & ~0x7); j += 8) {
            __m128i row = _mm_loadl_epi64((__m128i *)&src[i * sstride + j]);
            _mm_storeu_si128((__m128i *)&dst[i * dstride + j],
                             _mm_unpacklo_epi8(row, _mm_setzero_si128()));
        }
        for (; j < h; j++) dst[i * dstride + j] = src[i * sstride + j];
    }
}