/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * https://www.aomedia.org/license/patent-license.
 */

#include <immintrin.h>
#include "common_dsp_rtcd.h"
#include "EbBitstreamUnit.h"
#include "EbCdef.h"
#include "EbDefinitions.h"
#include "EbMemory_AVX2.h"

/* The two 128-bit lanes of partial hold a pair of vectors of 16-bit partial
 sums: partial A of the form [x8 x7 x6 x5 x4 x3 x2 x1] and partial B of the
 form [0  y1 y2 y3 y4 y5 y6 y7].
 This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
 (x7^2+y7^2)*C7 + (x8^2+0^2)*C8, where the C1..C8 constants are packed in
 const_var. */
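/* An illustrative scalar sketch of the same computation (the final summation
 of the eight products happens later, in hsum4):
   for (i = 1; i <= 8; i++) out[i] = (x[i]*x[i] + y[i]*y[i]) * C[i]; // y[8] == 0
 */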
static INLINE __m256i fold_mul_and_sum(__m256i partial, __m256i const_var) {
    partial = _mm256_shuffle_epi8(partial,
                                  _mm256_set_epi32(0x0f0e0100,
                                                   0x03020504,
                                                   0x07060908,
                                                   0x0b0a0d0c,
                                                   0x0f0e0d0c,
                                                   0x0b0a0908,
                                                   0x07060504,
                                                   0x03020100));
    partial = _mm256_permutevar8x32_epi32(partial, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0));
    partial = _mm256_shuffle_epi8(partial,
                                  _mm256_set_epi32(0x0f0e0b0a,
                                                   0x0d0c0908,
                                                   0x07060302,
                                                   0x05040100,
                                                   0x0f0e0b0a,
                                                   0x0d0c0908,
                                                   0x07060302,
                                                   0x05040100));
    partial = _mm256_madd_epi16(partial, partial);
    partial = _mm256_mullo_epi32(partial, const_var);
    return partial;
}

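/* 4x4 transpose of the 32-bit lanes followed by adds: lane i of the result is
 the horizontal sum of input vector xi. */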
static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
    __m128i t0, t1, t2, t3;
    t0 = _mm_unpacklo_epi32(x0, x1);
    t1 = _mm_unpacklo_epi32(x2, x3);
    t2 = _mm_unpackhi_epi32(x0, x1);
    t3 = _mm_unpackhi_epi32(x2, x3);
    x0 = _mm_unpacklo_epi64(t0, t1);
    x1 = _mm_unpackhi_epi64(t0, t1);
    x2 = _mm_unpacklo_epi64(t2, t3);
    x3 = _mm_unpackhi_epi64(t2, t3);
    return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}

/* Computes cost for directions 4, 5, 6 and 7. We can call this function again
to compute the remaining directions. */
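/* partial4..partial7 hold the running pixel sums along the lines of each
 direction. partial6 adds the eight input lines elementwise, so it needs only
 eight 16-bit sums and fits in a 128-bit register; the slanted directions
 produce more than eight partial sums and need 256-bit registers. */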
static INLINE void compute_directions(__m128i lines[8], int32_t tmp_cost1[4]) {
    __m128i partial6;
    __m128i tmp;

    __m256i partial4;
    __m256i partial5;
    __m256i partial7;
    __m256i tmp_avx2;
    /* Partial sums for lines 0 and 1. */
    partial4 = _mm256_setr_m128i(_mm_slli_si128(lines[0], 14), _mm_srli_si128(lines[0], 2));
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[1], 12), _mm_srli_si128(lines[1], 4));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp      = _mm_add_epi16(lines[0], lines[1]);
    partial5 = _mm256_setr_m128i(_mm_slli_si128(tmp, 10), _mm_srli_si128(tmp, 6));
    partial7 = _mm256_setr_m128i(_mm_slli_si128(tmp, 4), _mm_srli_si128(tmp, 12));
    partial6 = tmp;

    /* Partial sums for lines 2 and 3. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[2], 10), _mm_srli_si128(lines[2], 6));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[3], 8), _mm_srli_si128(lines[3], 8));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp      = _mm_add_epi16(lines[2], lines[3]);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 8), _mm_srli_si128(tmp, 8));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 6), _mm_srli_si128(tmp, 10));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Partial sums for lines 4 and 5. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[4], 6), _mm_srli_si128(lines[4], 10));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[5], 4), _mm_srli_si128(lines[5], 12));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp      = _mm_add_epi16(lines[4], lines[5]);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 6), _mm_srli_si128(tmp, 10));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 8), _mm_srli_si128(tmp, 8));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Partial sums for lines 6 and 7. */
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(lines[6], 2), _mm_srli_si128(lines[6], 14));
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp_avx2 = _mm256_insertf128_si256(_mm256_setzero_si256(), lines[7], 0x0);
    partial4 = _mm256_add_epi16(partial4, tmp_avx2);
    tmp      = _mm_add_epi16(lines[6], lines[7]);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 4), _mm_srli_si128(tmp, 12));
    partial5 = _mm256_add_epi16(partial5, tmp_avx2);
    tmp_avx2 = _mm256_setr_m128i(_mm_slli_si128(tmp, 10), _mm_srli_si128(tmp, 6));
    partial7 = _mm256_add_epi16(partial7, tmp_avx2);
    partial6 = _mm_add_epi16(partial6, tmp);

    /* Compute costs in terms of partial sums. */
    partial4 = fold_mul_and_sum(partial4, _mm256_set_epi32(105, 120, 140, 168, 210, 280, 420, 840));
    partial7 = fold_mul_and_sum(partial7, _mm256_set_epi32(105, 105, 105, 140, 210, 420, 0, 0));
    partial5 = fold_mul_and_sum(partial5, _mm256_set_epi32(105, 105, 105, 140, 210, 420, 0, 0));
    partial6 = _mm_madd_epi16(partial6, partial6);
    partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
    __m128i a, b, c;
    a = _mm_add_epi32(_mm256_castsi256_si128(partial4), _mm256_extracti128_si256(partial4, 1));
    b = _mm_add_epi32(_mm256_castsi256_si128(partial5), _mm256_extracti128_si256(partial5, 1));
    c = _mm_add_epi32(_mm256_castsi256_si128(partial7), _mm256_extracti128_si256(partial7, 1));

    _mm_storeu_si128((__m128i *)tmp_cost1, hsum4(a, b, partial6, c));
}

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
counter-clockwise rotation of the pixels. */
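/* A plain 8x8 transpose would store these results in the opposite order;
 writing them to res[7]..res[0] instead is what adds the line reversal. */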
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
    const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
    const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
    const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
    const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
    const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
    const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
    const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
    const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

    res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
    res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
    res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
    res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
    res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
    res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
    res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
    res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

int32_t svt_cdef_find_dir_avx2(const uint16_t *img, int32_t stride, int32_t *var,
                               int32_t coeff_shift) {
    int32_t i;
    int32_t cost[8];
    int32_t best_cost = 0;
    int32_t best_dir  = 0;
    __m128i lines[8];
    __m128i const_128 = _mm_set1_epi16(128);
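    /* Scale each line down by 2^coeff_shift (bd - 8 for high-bit-depth input,
       0 for 8-bit) and re-center it around zero by subtracting 128, so the
       squared partial sums, once scaled by the direction constants, stay
       within 32-bit range. */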
    for (i = 0; i < 8; i++) {
        lines[i] = _mm_lddqu_si128((__m128i *)&img[i * stride]);
        lines[i] = _mm_sub_epi16(_mm_sra_epi16(lines[i], _mm_cvtsi32_si128(coeff_shift)),
                                 const_128);
    }

    /* Compute "mostly vertical" directions. */
    compute_directions(lines, cost + 4);

    array_reverse_transpose_8x8(lines, lines);

    /* Compute "mostly horizontal" directions. */
    compute_directions(lines, cost);

    for (i = 0; i < 8; i++) {
        if (cost[i] > best_cost) {
            best_cost = cost[i];
            best_dir  = i;
        }
    }

    /* Difference between the optimal variance and the variance along the
    orthogonal direction. Again, the sum(x^2) terms cancel out. */
    *var = best_cost - cost[(best_dir + 4) & 7];
    /* We'd normally divide by 840, but dividing by 1024 is close enough
    for what we're going to do with this. */
    *var >>= 10;
    return best_dir;
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
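// Branchless: sign = diff >> 15 is all-ones exactly when diff is negative,
// and (m + sign) ^ sign negates m in that case. A scalar sketch:
//   d = a - b; m = min(abs(d), max(0, t - (abs(d) >> adjdamp)));
//   return d < 0 ? -m : m;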
static INLINE __m256i constrain16(const __m256i in0, const __m256i in1, const __m256i threshold,
                                  const uint32_t adjdamp) {
    const __m256i diff = _mm256_sub_epi16(in0, in1);
    const __m256i sign = _mm256_srai_epi16(diff, 15);
    const __m256i a    = _mm256_abs_epi16(diff);
    const __m256i l    = _mm256_srl_epi16(a, _mm_cvtsi32_si128(adjdamp));
    const __m256i s    = _mm256_subs_epu16(threshold, l);
    const __m256i m    = _mm256_min_epi16(a, s);
    const __m256i d    = _mm256_add_epi16(sign, m);
    return _mm256_xor_si256(d, sign);
}

static void svt_cdef_filter_block_4x4_8_avx2(uint8_t *dst, int32_t dstride, const uint16_t *in,
                                             int32_t pri_strength, int32_t sec_strength,
                                             int32_t dir, int32_t pri_damping, int32_t sec_damping,
                                             int32_t coeff_shift) {
    __m256i p0, p1, p2, p3, sum, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    int32_t po1  = eb_cdef_directions[dir][0];
    int32_t po2  = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];

    const int32_t *pri_taps         = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps         = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i        pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i        sec_strength_256 = _mm256_set1_epi16(sec_strength);

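    /* Reduce the damping shift by log2(strength) so the constraint in
       constrain16 tapers off over a range proportional to the strength. */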
    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    sum = _mm256_setzero_si256();
    row = _mm256_set_epi64x(*(uint64_t *)(in),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE));
    min = max = row;

    if (pri_strength) {
        // Primary near taps
        p0 = _mm256_set_epi64x(*(uint64_t *)(in + po1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po1));
        p1 = _mm256_set_epi64x(*(uint64_t *)(in - po1),
                               *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po1),
                               *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po1),
                               *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po1));

        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
        p0  = constrain16(p0, row, pri_strength_256, pri_damping);
        p1  = constrain16(p1, row, pri_strength_256, pri_damping);

        // sum += pri_taps[0] * (p0 + p1)
        sum = _mm256_add_epi16(
            sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[0]), _mm256_add_epi16(p0, p1)));

        // Primary far taps
        p0  = _mm256_set_epi64x(*(uint64_t *)(in + po2),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po2),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po2),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po2));
        p1  = _mm256_set_epi64x(*(uint64_t *)(in - po2),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po2),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po2),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po2));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
        p0  = constrain16(p0, row, pri_strength_256, pri_damping);
        p1  = constrain16(p1, row, pri_strength_256, pri_damping);

        // sum += pri_taps[1] * (p0 + p1)
        sum = _mm256_add_epi16(
            sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[1]), _mm256_add_epi16(p0, p1)));
    }

    if (sec_strength) {
        // Secondary near taps
        p0  = _mm256_set_epi64x(*(uint64_t *)(in + s1o1),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o1),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o1),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o1));
        p1  = _mm256_set_epi64x(*(uint64_t *)(in - s1o1),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o1),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o1),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o1));
        p2  = _mm256_set_epi64x(*(uint64_t *)(in + s2o1),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o1),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o1),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o1));
        p3  = _mm256_set_epi64x(*(uint64_t *)(in - s2o1),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o1),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o1),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
        min = _mm256_min_epi16(
            _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
        p0 = constrain16(p0, row, sec_strength_256, sec_damping);
        p1 = constrain16(p1, row, sec_strength_256, sec_damping);
        p2 = constrain16(p2, row, sec_strength_256, sec_damping);
        p3 = constrain16(p3, row, sec_strength_256, sec_damping);

        // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
        sum = _mm256_add_epi16(sum,
                               _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[0]),
                                                  _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                   _mm256_add_epi16(p2, p3))));

        // Secondary far taps
        p0  = _mm256_set_epi64x(*(uint64_t *)(in + s1o2),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o2),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o2),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o2));
        p1  = _mm256_set_epi64x(*(uint64_t *)(in - s1o2),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o2),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o2),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o2));
        p2  = _mm256_set_epi64x(*(uint64_t *)(in + s2o2),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o2),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o2),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o2));
        p3  = _mm256_set_epi64x(*(uint64_t *)(in - s2o2),
                                *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o2),
                                *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o2),
                                *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o2));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
        max = _mm256_max_epi16(
            _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
            _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
        min = _mm256_min_epi16(
            _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
        p0 = constrain16(p0, row, sec_strength_256, sec_damping);
        p1 = constrain16(p1, row, sec_strength_256, sec_damping);
        p2 = constrain16(p2, row, sec_strength_256, sec_damping);
        p3 = constrain16(p3, row, sec_strength_256, sec_damping);

        // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
        sum = _mm256_add_epi16(sum,
                               _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[1]),
                                                  _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                   _mm256_add_epi16(p2, p3))));
    }

    // res = row + ((sum - (sum < 0) + 8) >> 4)
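    // _mm256_cmpgt_epi16(0, sum) is -1 in lanes where sum < 0, so adding it
    // implements sum - (sum < 0): the biased shift below then rounds negative
    // sums symmetrically to positive ones (nearest, ties away from zero).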
    sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
    res = _mm256_add_epi16(sum, _mm256_set1_epi16(8));
    res = _mm256_srai_epi16(res, 4);
    res = _mm256_add_epi16(row, res);
    res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
    res = _mm256_packus_epi16(res, res);

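    /* Rows were loaded bottom-up into row, and packus duplicates the packed
       bytes within each 128-bit lane, so row 0 of the block ends up in 32-bit
       element 5, row 1 in element 4, row 2 in element 1 and row 3 in
       element 0. */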
    *(int32_t *)(dst + 0 * dstride) = _mm256_extract_epi32(res, 5);
    *(int32_t *)(dst + 1 * dstride) = _mm256_extract_epi32(res, 4);
    *(int32_t *)(dst + 2 * dstride) = _mm256_extract_epi32(res, 1);
    *(int32_t *)(dst + 3 * dstride) = _mm256_cvtsi256_si32(res);
}

static void svt_cdef_filter_block_8x8_8_avx2(uint8_t *dst, int32_t dstride, const uint16_t *in,
                                             int32_t pri_strength, int32_t sec_strength,
                                             int32_t dir, int32_t pri_damping, int32_t sec_damping,
                                             int32_t coeff_shift) {
    int32_t i;
    __m256i sum, p0, p1, p2, p3, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    int32_t po1  = eb_cdef_directions[dir][0];
    int32_t po2  = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
    // SSE CHKN
    const int32_t *pri_taps         = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps         = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i        pri_taps_0       = _mm256_set1_epi16(pri_taps[0]);
    __m256i        pri_taps_1       = _mm256_set1_epi16(pri_taps[1]);
    __m256i        sec_taps_0       = _mm256_set1_epi16(sec_taps[0]);
    __m256i        sec_taps_1       = _mm256_set1_epi16(sec_taps[1]);
    __m256i        duplicate_8      = _mm256_set1_epi16(8);
    __m256i        pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i        sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    for (i = 0; i < 8; i += 2) {
        sum = _mm256_setzero_si256();
        row = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE)),
                                _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE)));

        min = max = row;
        if (pri_strength) {
            // Primary near taps
            p0  = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + po1)),
                                    _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + po1)));
            p1  = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - po1)),
                                    _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - po1)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
            p0  = constrain16(p0, row, pri_strength_256, pri_damping);
            p1  = constrain16(p1, row, pri_strength_256, pri_damping);

            // sum += pri_taps[0] * (p0 + p1)
            sum = _mm256_add_epi16(sum, _mm256_mullo_epi16(pri_taps_0, _mm256_add_epi16(p0, p1)));

            // Primary far taps
            p0  = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + po2)),
                                    _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + po2)));
            p1  = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - po2)),
                                    _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - po2)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
            p0  = constrain16(p0, row, pri_strength_256, pri_damping);
            p1  = constrain16(p1, row, pri_strength_256, pri_damping);

            // sum += pri_taps[1] * (p0 + p1)
            sum = _mm256_add_epi16(sum, _mm256_mullo_epi16(pri_taps_1, _mm256_add_epi16(p0, p1)));
        }

        if (sec_strength) {
            // Secondary near taps
            p0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s1o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s1o1)));
            p1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s1o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s1o1)));
            p2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s2o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s2o1)));
            p3 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s2o1)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s2o1)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
            min = _mm256_min_epi16(
                _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
            p0 = constrain16(p0, row, sec_strength_256, sec_damping);
            p1 = constrain16(p1, row, sec_strength_256, sec_damping);
            p2 = constrain16(p2, row, sec_strength_256, sec_damping);
            p3 = constrain16(p3, row, sec_strength_256, sec_damping);

            // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
            sum = _mm256_add_epi16(sum,
                                   _mm256_mullo_epi16(sec_taps_0,
                                                      _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                       _mm256_add_epi16(p2, p3))));

            // Secondary far taps
            p0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s1o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s1o2)));
            p1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s1o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s1o2)));
            p2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE + s2o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE + s2o2)));
            p3 = _mm256_setr_m128i(_mm_loadu_si128((__m128i *)(in + (i + 1) * CDEF_BSTRIDE - s2o2)),
                                   _mm_loadu_si128((__m128i *)(in + i * CDEF_BSTRIDE - s2o2)));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
            max = _mm256_max_epi16(
                _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
                _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
            min = _mm256_min_epi16(
                _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2), p3);
            p0 = constrain16(p0, row, sec_strength_256, sec_damping);
            p1 = constrain16(p1, row, sec_strength_256, sec_damping);
            p2 = constrain16(p2, row, sec_strength_256, sec_damping);
            p3 = constrain16(p3, row, sec_strength_256, sec_damping);

            // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
            sum = _mm256_add_epi16(sum,
                                   _mm256_mullo_epi16(sec_taps_1,
                                                      _mm256_add_epi16(_mm256_add_epi16(p0, p1),
                                                                       _mm256_add_epi16(p2, p3))));
        }

        // res = row + ((sum - (sum < 0) + 8) >> 4)
        sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
        res = _mm256_add_epi16(sum, duplicate_8);
        res = _mm256_srai_epi16(res, 4);
        res = _mm256_add_epi16(row, res);
        res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
        res = _mm256_packus_epi16(res, res);
        *(int64_t *)(dst + i * dstride)       = _mm256_extract_epi64(res, 2);
        *(int64_t *)(dst + (i + 1) * dstride) = _mm256_extract_epi64(res, 0);
    }
}

static void svt_cdef_filter_block_4x4_16_avx2(uint16_t *dst, int32_t dstride, const uint16_t *in,
                                              int32_t pri_strength, int32_t sec_strength,
                                              int32_t dir, int32_t pri_damping, int32_t sec_damping,
                                              int32_t coeff_shift) {
    __m256i p0, p1, p2, p3, sum, row, res;
    __m256i max, min, large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    int32_t po1  = eb_cdef_directions[dir][0];
    int32_t po2  = eb_cdef_directions[dir][1];
    int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];

    const int32_t *pri_taps         = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps         = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    __m256i        pri_strength_256 = _mm256_set1_epi16(pri_strength);
    __m256i        sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
    sum = _mm256_setzero_si256();
    row = _mm256_set_epi64x(*(uint64_t *)(in),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE));
    min = max = row;

    // Primary near taps
    p0 = _mm256_set_epi64x(*(uint64_t *)(in + po1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po1));
    p1 = _mm256_set_epi64x(*(uint64_t *)(in - po1),
                           *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po1),
                           *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po1),
                           *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po1));

    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
    p0  = constrain16(p0, row, pri_strength_256, pri_damping);
    p1  = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps[0] * (p0 + p1)
    sum = _mm256_add_epi16(
        sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[0]), _mm256_add_epi16(p0, p1)));

    // Primary far taps
    p0  = _mm256_set_epi64x(*(uint64_t *)(in + po2),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE + po2),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE + po2),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE + po2));
    p1  = _mm256_set_epi64x(*(uint64_t *)(in - po2),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE - po2),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE - po2),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE - po2));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    min = _mm256_min_epi16(_mm256_min_epi16(min, p0), p1);
    p0  = constrain16(p0, row, pri_strength_256, pri_damping);
    p1  = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps[1] * (p0 + p1)
    sum = _mm256_add_epi16(
        sum, _mm256_mullo_epi16(_mm256_set1_epi16(pri_taps[1]), _mm256_add_epi16(p0, p1)));

    // Secondary near taps
    p0  = _mm256_set_epi64x(*(uint64_t *)(in + s1o1),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o1),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o1),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o1));
    p1  = _mm256_set_epi64x(*(uint64_t *)(in - s1o1),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o1),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o1),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o1));
    p2  = _mm256_set_epi64x(*(uint64_t *)(in + s2o1),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o1),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o1),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o1));
    p3  = _mm256_set_epi64x(*(uint64_t *)(in - s2o1),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o1),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o1),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    min = _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
                           p3);
    p0  = constrain16(p0, row, sec_strength_256, sec_damping);
    p1  = constrain16(p1, row, sec_strength_256, sec_damping);
    p2  = constrain16(p2, row, sec_strength_256, sec_damping);
    p3  = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
    sum = _mm256_add_epi16(
        sum,
        _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[0]),
                           _mm256_add_epi16(_mm256_add_epi16(p0, p1), _mm256_add_epi16(p2, p3))));

    // Secondary far taps
    p0  = _mm256_set_epi64x(*(uint64_t *)(in + s1o2),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s1o2),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s1o2),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s1o2));
    p1  = _mm256_set_epi64x(*(uint64_t *)(in - s1o2),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s1o2),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s1o2),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s1o2));
    p2  = _mm256_set_epi64x(*(uint64_t *)(in + s2o2),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE + s2o2),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE + s2o2),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE + s2o2));
    p3  = _mm256_set_epi64x(*(uint64_t *)(in - s2o2),
                            *(uint64_t *)(in + 1 * CDEF_BSTRIDE - s2o2),
                            *(uint64_t *)(in + 2 * CDEF_BSTRIDE - s2o2),
                            *(uint64_t *)(in + 3 * CDEF_BSTRIDE - s2o2));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    max = _mm256_max_epi16(
        _mm256_max_epi16(max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    min = _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(min, p0), p1), p2),
                           p3);
    p0  = constrain16(p0, row, sec_strength_256, sec_damping);
    p1  = constrain16(p1, row, sec_strength_256, sec_damping);
    p2  = constrain16(p2, row, sec_strength_256, sec_damping);
    p3  = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
    sum = _mm256_add_epi16(
        sum,
        _mm256_mullo_epi16(_mm256_set1_epi16(sec_taps[1]),
                           _mm256_add_epi16(_mm256_add_epi16(p0, p1), _mm256_add_epi16(p2, p3))));

    // res = row + ((sum - (sum < 0) + 8) >> 4)
    sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
    res = _mm256_add_epi16(sum, _mm256_set1_epi16(8));
    res = _mm256_srai_epi16(res, 4);
    res = _mm256_add_epi16(row, res);
    res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);

    *(uint64_t *)(dst)               = _mm256_extract_epi64(res, 3);
    *(uint64_t *)(dst + 1 * dstride) = _mm256_extract_epi64(res, 2);
    *(uint64_t *)(dst + 2 * dstride) = _mm256_extract_epi64(res, 1);
    *(uint64_t *)(dst + 3 * dstride) = _mm256_extract_epi64(res, 0);
}

static INLINE void cdef_filter_block_8x8_16_pri_avx2(const uint16_t *const in,
                                                     const int32_t pri_damping, const int32_t po,
                                                     const __m256i row,
                                                     const __m256i pri_strength_256,
                                                     const __m256i pri_taps, __m256i *const max,
                                                     __m256i *const min, __m256i *const sum) {
    const __m256i large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    const __m256i p0    = loadu_u16_8x2_avx2(in + po, CDEF_BSTRIDE);
    const __m256i p1    = loadu_u16_8x2_avx2(in - po, CDEF_BSTRIDE);

    *max = _mm256_max_epi16(
        _mm256_max_epi16(*max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    *min = _mm256_min_epi16(_mm256_min_epi16(*min, p0), p1);

    const __m256i q0 = constrain16(p0, row, pri_strength_256, pri_damping);
    const __m256i q1 = constrain16(p1, row, pri_strength_256, pri_damping);

    // sum += pri_taps * (q0 + q1)
    *sum = _mm256_add_epi16(*sum, _mm256_mullo_epi16(pri_taps, _mm256_add_epi16(q0, q1)));
}

static INLINE void cdef_filter_block_8x8_16_sec_avx2(const uint16_t *const in,
                                                     const int32_t sec_damping, const int32_t so1,
                                                     const int32_t so2, const __m256i row,
                                                     const __m256i sec_strength_256,
                                                     const __m256i sec_taps, __m256i *const max,
                                                     __m256i *const min, __m256i *const sum) {
    const __m256i large = _mm256_set1_epi16(CDEF_VERY_LARGE);
    const __m256i p0    = loadu_u16_8x2_avx2(in + so1, CDEF_BSTRIDE);
    const __m256i p1    = loadu_u16_8x2_avx2(in - so1, CDEF_BSTRIDE);
    const __m256i p2    = loadu_u16_8x2_avx2(in + so2, CDEF_BSTRIDE);
    const __m256i p3    = loadu_u16_8x2_avx2(in - so2, CDEF_BSTRIDE);

    *max = _mm256_max_epi16(
        _mm256_max_epi16(*max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p0, large), p0)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p1, large), p1));
    *max = _mm256_max_epi16(
        _mm256_max_epi16(*max, _mm256_andnot_si256(_mm256_cmpeq_epi16(p2, large), p2)),
        _mm256_andnot_si256(_mm256_cmpeq_epi16(p3, large), p3));
    *min = _mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(_mm256_min_epi16(*min, p0), p1), p2),
                            p3);

    const __m256i q0 = constrain16(p0, row, sec_strength_256, sec_damping);
    const __m256i q1 = constrain16(p1, row, sec_strength_256, sec_damping);
    const __m256i q2 = constrain16(p2, row, sec_strength_256, sec_damping);
    const __m256i q3 = constrain16(p3, row, sec_strength_256, sec_damping);

    // sum += sec_taps * (q0 + q1 + q2 + q3)
    *sum = _mm256_add_epi16(
        *sum,
        _mm256_mullo_epi16(sec_taps,
                           _mm256_add_epi16(_mm256_add_epi16(q0, q1), _mm256_add_epi16(q2, q3))));
}

void svt_cdef_filter_block_8x8_16_avx2(const uint16_t *const in, const int32_t pri_strength,
                                       const int32_t sec_strength, const int32_t dir,
                                       int32_t pri_damping, int32_t sec_damping,
                                       const int32_t coeff_shift, uint16_t *const dst,
                                       const int32_t dstride) {
    const int32_t po1  = eb_cdef_directions[dir][0];
    const int32_t po2  = eb_cdef_directions[dir][1];
    const int32_t s1o1 = eb_cdef_directions[(dir + 2) & 7][0];
    const int32_t s1o2 = eb_cdef_directions[(dir + 2) & 7][1];
    const int32_t s2o1 = eb_cdef_directions[(dir + 6) & 7][0];
    const int32_t s2o2 = eb_cdef_directions[(dir + 6) & 7][1];
    // SSE CHKN
    const int32_t *pri_taps = eb_cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
    const int32_t *sec_taps = eb_cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
    int32_t        i;
    const __m256i  pri_taps_0       = _mm256_set1_epi16(pri_taps[0]);
    const __m256i  pri_taps_1       = _mm256_set1_epi16(pri_taps[1]);
    const __m256i  sec_taps_0       = _mm256_set1_epi16(sec_taps[0]);
    const __m256i  sec_taps_1       = _mm256_set1_epi16(sec_taps[1]);
    const __m256i  duplicate_8      = _mm256_set1_epi16(8);
    const __m256i  pri_strength_256 = _mm256_set1_epi16(pri_strength);
    const __m256i  sec_strength_256 = _mm256_set1_epi16(sec_strength);

    if (pri_strength)
        pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
    if (sec_strength)
        sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));

    for (i = 0; i < 8; i += 2) {
        const __m256i row = loadu_u16_8x2_avx2(in + i * CDEF_BSTRIDE, CDEF_BSTRIDE);
        __m256i       sum, res, max, min;

        min = max = row;
        sum       = _mm256_setzero_si256();

        // Primary near taps
        cdef_filter_block_8x8_16_pri_avx2(in + i * CDEF_BSTRIDE,
                                          pri_damping,
                                          po1,
                                          row,
                                          pri_strength_256,
                                          pri_taps_0,
                                          &max,
                                          &min,
                                          &sum);

        // Primary far taps
        cdef_filter_block_8x8_16_pri_avx2(in + i * CDEF_BSTRIDE,
                                          pri_damping,
                                          po2,
                                          row,
                                          pri_strength_256,
                                          pri_taps_1,
                                          &max,
                                          &min,
                                          &sum);

        // Secondary near taps
        cdef_filter_block_8x8_16_sec_avx2(in + i * CDEF_BSTRIDE,
                                          sec_damping,
                                          s1o1,
                                          s2o1,
                                          row,
                                          sec_strength_256,
                                          sec_taps_0,
                                          &max,
                                          &min,
                                          &sum);

        // Secondary far taps
        cdef_filter_block_8x8_16_sec_avx2(in + i * CDEF_BSTRIDE,
                                          sec_damping,
                                          s1o2,
                                          s2o2,
                                          row,
                                          sec_strength_256,
                                          sec_taps_1,
                                          &max,
                                          &min,
                                          &sum);

        // res = row + ((sum - (sum < 0) + 8) >> 4)
        sum = _mm256_add_epi16(sum, _mm256_cmpgt_epi16(_mm256_setzero_si256(), sum));
        res = _mm256_add_epi16(sum, duplicate_8);
        res = _mm256_srai_epi16(res, 4);
        res = _mm256_add_epi16(row, res);
        res = _mm256_min_epi16(_mm256_max_epi16(res, min), max);
        _mm_storeu_si128((__m128i *)&dst[i * dstride], _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride], _mm256_extracti128_si256(res, 1));
    }
}

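/* Dispatcher: 8-bit destinations go through the *_8 kernels above, high bit
   depth through the *_16 kernels. BLOCK_4X8 and BLOCK_8X4 are filtered as two
   4x4 blocks, offset by four rows or four columns respectively. */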
void svt_cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int32_t dstride, const uint16_t *in,
                                int32_t pri_strength, int32_t sec_strength, int32_t dir,
                                int32_t pri_damping, int32_t sec_damping, int32_t bsize,
                                int32_t coeff_shift) {
    if (dst8) {
        if (bsize == BLOCK_8X8) {
            svt_cdef_filter_block_8x8_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        } else if (bsize == BLOCK_4X8) {
            svt_cdef_filter_block_4x4_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
            svt_cdef_filter_block_4x4_8_avx2(dst8 + 4 * dstride,
                                             dstride,
                                             in + 4 * CDEF_BSTRIDE,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        } else if (bsize == BLOCK_8X4) {
            svt_cdef_filter_block_4x4_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
            svt_cdef_filter_block_4x4_8_avx2(dst8 + 4,
                                             dstride,
                                             in + 4,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        } else {
            svt_cdef_filter_block_4x4_8_avx2(dst8,
                                             dstride,
                                             in,
                                             pri_strength,
                                             sec_strength,
                                             dir,
                                             pri_damping,
                                             sec_damping,
                                             coeff_shift);
        }
    } else {
        if (bsize == BLOCK_8X8) {
            svt_cdef_filter_block_8x8_16(in,
                                         pri_strength,
                                         sec_strength,
                                         dir,
                                         pri_damping,
                                         sec_damping,
                                         coeff_shift,
                                         dst16,
                                         dstride);
        } else if (bsize == BLOCK_4X8) {
            svt_cdef_filter_block_4x4_16_avx2(dst16,
                                              dstride,
                                              in,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
            svt_cdef_filter_block_4x4_16_avx2(dst16 + 4 * dstride,
                                              dstride,
                                              in + 4 * CDEF_BSTRIDE,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
        } else if (bsize == BLOCK_8X4) {
            svt_cdef_filter_block_4x4_16_avx2(dst16,
                                              dstride,
                                              in,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
            svt_cdef_filter_block_4x4_16_avx2(dst16 + 4,
                                              dstride,
                                              in + 4,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
        } else {
            assert(bsize == BLOCK_4X4);
            svt_cdef_filter_block_4x4_16_avx2(dst16,
                                              dstride,
                                              in,
                                              pri_strength,
                                              sec_strength,
                                              dir,
                                              pri_damping,
                                              sec_damping,
                                              coeff_shift);
        }
    }
}

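/* Copies a v-by-h rectangle of 8-bit pixels into a 16-bit buffer, widening
   each sample: eight pixels per iteration, with a scalar tail for widths that
   are not a multiple of eight. */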
void svt_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int32_t dstride, const uint8_t *src,
                                       int32_t sstride, int32_t v, int32_t h) {
    int32_t i, j;
    for (i = 0; i < v; i++) {
        for (j = 0; j < (h & ~0x7); j += 8) {
            __m128i row = _mm_loadl_epi64((__m128i *)&src[i * sstride + j]);
            _mm_storeu_si128((__m128i *)&dst[i * dstride + j],
                             _mm_unpacklo_epi8(row, _mm_setzero_si128()));
        }
        for (; j < h; j++) dst[i * dstride + j] = src[i * sstride + j];
    }
}