/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "vpx_dsp_rtcd.h"
#include "mem.h"
#include "EbTranspose_AVX2.h"
#include "mem_sse2.h"

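/* Horizontal wide-flat loop filter over an 8-pixel-wide segment.
 * `s` points at the first row below the edge (q0) and `p` is the row
 * stride, so rows p7..p0 lie above the edge and q0..q7 below it. Each
 * qXpX register packs the pX row in its low 8 bytes and the qX row in
 * its high 8 bytes, so both sides of the edge are filtered with one
 * instruction stream. */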
void eb_vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
                                   const unsigned char *_blimit,
                                   const unsigned char *_limit,
                                   const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

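  /* Filter-mask computation. |a - b| for unsigned bytes is computed as
   * subs_epu8(a, b) | subs_epu8(b, a): one saturating subtraction is
   * zero, the other is the absolute difference. A byte lane of `mask`
   * ends up 0xff only when every edge-activity check (the blimit and
   * limit comparisons annotated below) passes. */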
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8((char)0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

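    /* There is no byte-wise arithmetic shift, so widen each signed byte
     * into the high byte of a 16-bit lane (unpacklo with zero) and do a
     * 16-bit arithmetic shift by 11: 8 bits to undo the widening plus
     * the 3-bit filter shift. */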
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loop_filter done

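    /* Flatness checks: `flat` enables the filter8 path where p3..q3 are
     * all within 1 of p0/q0; `flat2` additionally requires p7..q7 to be
     * that close, enabling the wide filter16 path. */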
    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

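      /* From here on each filter8/filter16 output reuses the running
       * sums: every step adds the two samples that enter the tap window
       * and subtracts the two that leave it, instead of re-summing all
       * taps. */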
      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

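    /* _mm_shuffle_epi32(x, 68) copies the low 64 bits of the mask into
     * both halves so it lines up with the pX|qX layout of each qXpX
     * register. The and/andnot/or triples below are a byte-wise select:
     * filtered value where the mask is 0xff, original value elsewhere. */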
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

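/* pshufb control used to zero-extend 16 bytes to 16 words: source byte
 * k goes to the even positions, and 0x80 (bit 7 set) at the odd
 * positions makes _mm256_shuffle_epi8 emit a zero byte there. */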
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

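/* Per-byte |a - b| via two saturating subtractions (one is zero). */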
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

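/* Sliding-window step shared by the flat filters: add the two samples
 * entering the tap window (a1, a2) and subtract the two leaving it
 * (s1, s2) from the running 16-bit sum. */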
static INLINE __m256i filter_add2_sub2_avx2(const __m256i total,
                                            const __m256i a1, const __m256i a2,
                                            const __m256i s1,
                                            const __m256i s2) {
  __m256i x = _mm256_add_epi16(a1, total);
  x = _mm256_add_epi16(_mm256_sub_epi16(x, _mm256_add_epi16(s1, s2)), a2);
  return x;
}

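/* Duplicate the 16-byte input into both 128-bit lanes, then use the
 * control above to zero-extend it to 16 unsigned words. */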
static INLINE __m256i unpack_8bit_avx2(const __m128i in) {
  const __m256i mask = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
  const __m256i d = _mm256_inserti128_si256(_mm256_castsi128_si256(in), in, 1);
  return _mm256_shuffle_epi8(d, mask);
}

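/* f holds a filter8 sum plus rounding bias in 16-bit lanes; shift by 3,
 * narrow to bytes, and select it over other_filt where flat is set. */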
static INLINE __m128i filter8_mask_avx2(const __m128i flat,
                                        const __m128i other_filt,
                                        const __m256i f) {
  const __m256i ff = _mm256_srli_epi16(f, 3);
  const __m128i f8 = _mm_packus_epi16(_mm256_extracti128_si256(ff, 0),
                                      _mm256_extracti128_si256(ff, 1));
  const __m128i result = _mm_and_si128(flat, f8);
  return _mm_or_si128(_mm_andnot_si128(flat, other_filt), result);
}

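/* Same selection as filter8_mask_avx2, but with the 4-bit shift of the
 * wide (filter16) sums. */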
static INLINE __m128i filter16_mask_avx2(const __m128i flat,
                                         const __m128i other_filt,
                                         const __m256i f) {
  const __m256i ff = _mm256_srli_epi16(f, 4);
  const __m128i f16 = _mm_packus_epi16(_mm256_extracti128_si256(ff, 0),
                                       _mm256_extracti128_si256(ff, 1));
  const __m128i result = _mm_and_si128(flat, f16);
  return _mm_or_si128(_mm_andnot_si128(flat, other_filt), result);
}

static INLINE __m128i lpf_filter8_avx2(const __m128i o, const __m128i flat2,
                                       const __m256i a0, const __m256i a1,
                                       const __m256i s0, const __m256i s1,
                                       __m256i *const total) {
  *total = filter_add2_sub2_avx2(*total, a0, a1, s0, s1);
  return filter8_mask_avx2(flat2, o, *total);
}

static INLINE __m128i lpf_filter16_avx2(const __m128i o, const __m128i flat2,
                                        const __m256i a0, const __m256i a1,
                                        const __m256i s0, const __m256i s1,
                                        __m256i *const total) {
  *total = filter_add2_sub2_avx2(*total, a0, a1, s0, s1);
  return filter16_mask_avx2(flat2, o, *total);
}

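/* Core of the dual (16-wide) filter. io[] holds the 16 rows p7..p0,
 * q0..q7 of a 16-pixel segment; on return io[1]..io[14] carry the
 * filtered rows. Unlike the functions above, _blimit/_limit/_thresh
 * must each point to a 16-byte aligned, byte-broadcast threshold
 * vector, as the aligned 128-bit loads below assume. */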
static INLINE void lpf_horizontal_16_dual_avx2(const unsigned char *_blimit,
                                               const unsigned char *_limit,
                                               const unsigned char *_thresh,
                                               __m128i *const io) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p[8], q[8];
  __m128i op2, op1, op0, oq0, oq1, oq2;
  __m128i max_abs_p1p0q1q0;
  __m256i p0, p1, p2, p3, p4, p5, p6, p7;
  __m256i q0, q1, q2, q3, q4, q5, q6, q7;

  p[7] = io[0];
  p[6] = io[1];
  p[5] = io[2];
  p[4] = io[3];
  p[3] = io[4];
  p[2] = io[5];
  p[1] = io[6];
  p[0] = io[7];
  q[0] = io[8];
  q[1] = io[9];
  q[2] = io[10];
  q[3] = io[11];
  q[4] = io[12];
  q[5] = io[13];
  q[6] = io[14];
  q[7] = io[15];

  p7 = unpack_8bit_avx2(p[7]);
  p6 = unpack_8bit_avx2(p[6]);
  p5 = unpack_8bit_avx2(p[5]);
  p4 = unpack_8bit_avx2(p[4]);
  p3 = unpack_8bit_avx2(p[3]);
  p2 = unpack_8bit_avx2(p[2]);
  p1 = unpack_8bit_avx2(p[1]);
  p0 = unpack_8bit_avx2(p[0]);
  q0 = unpack_8bit_avx2(q[0]);
  q1 = unpack_8bit_avx2(q[1]);
  q2 = unpack_8bit_avx2(q[2]);
  q3 = unpack_8bit_avx2(q[3]);
  q4 = unpack_8bit_avx2(q[4]);
  q5 = unpack_8bit_avx2(q[5]);
  q6 = unpack_8bit_avx2(q[6]);
  q7 = unpack_8bit_avx2(q[7]);

  {
    const __m128i abs_p1p0 = abs_diff(p[1], p[0]);
    const __m128i abs_q1q0 = abs_diff(q[1], q[0]);
    const __m128i fe = _mm_set1_epi8((char)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p[0], q[0]);
    __m128i abs_p1q1 = abs_diff(p[1], q[1]);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p[2], p[1]), abs_diff(p[3], p[2]));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q[2], q[1]), abs_diff(q[3], q[2]));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p[2], p[0]), abs_diff(q[2], q[0]));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p[3], p[0]), abs_diff(q[3], q[0]));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p[4], p[0]), abs_diff(q[4], q[0]));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p[5], p[0]), abs_diff(q[5], q[0]));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p[6], p[0]), abs_diff(q[6], q[0]));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p[7], p[0]), abs_diff(q[7], q[0]));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i te0 = _mm_set1_epi8((char)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p[1], t80);
    op0 = _mm_xor_si128(p[0], t80);
    oq0 = _mm_xor_si128(q[0], t80);
    oq1 = _mm_xor_si128(q[1], t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loop_filter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
#if 1
    {
      const __m256i four = _mm256_set1_epi16(4);
      __m256i f8;

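      // Scalar equivalent of the first tap:
      //   op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
      // Later taps update f8 with filter_add2_sub2_avx2 instead of
      // re-summing.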
      f8 = _mm256_add_epi16(_mm256_add_epi16(p3, four),
                            _mm256_add_epi16(p3, p2));
      f8 = _mm256_add_epi16(_mm256_add_epi16(p3, f8), _mm256_add_epi16(p2, p1));
      f8 = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f8);

      op2 = filter8_mask_avx2(flat, p[2], f8);
      op1 = lpf_filter8_avx2(op1, flat, q1, p1, p2, p3, &f8);
      op0 = lpf_filter8_avx2(op0, flat, q2, p0, p1, p3, &f8);
      oq0 = lpf_filter8_avx2(oq0, flat, q3, q0, p0, p3, &f8);
      oq1 = lpf_filter8_avx2(oq1, flat, q3, q1, q0, p2, &f8);
      oq2 = lpf_filter8_avx2(q[2], flat, q3, q2, q1, p1, &f8);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      __m256i f;

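      // Scalar equivalent of the first wide tap:
      //   op6 = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
      // (p7 * 7 is built as (p7 << 3) - p7); the remaining taps again
      // slide the window.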
      f = _mm256_sub_epi16(_mm256_slli_epi16(p7, 3), p7); // p7 * 7
      f = _mm256_add_epi16(_mm256_slli_epi16(p6, 1), _mm256_add_epi16(p4, f));
      f = _mm256_add_epi16(_mm256_add_epi16(p3, f), _mm256_add_epi16(p2, p1));
      f = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f);
      f = _mm256_add_epi16(_mm256_add_epi16(p5, eight), f);

      io[1] = filter16_mask_avx2(flat2, p[6], f);
      io[2] = lpf_filter16_avx2(p[5], flat2, q1, p5, p6, p7, &f);
      io[3] = lpf_filter16_avx2(p[4], flat2, q2, p4, p5, p7, &f);
      io[4] = lpf_filter16_avx2(p[3], flat2, q3, p3, p4, p7, &f);
      io[5] = lpf_filter16_avx2(op2, flat2, q4, p2, p3, p7, &f);
      io[6] = lpf_filter16_avx2(op1, flat2, q5, p1, p2, p7, &f);
      io[7] = lpf_filter16_avx2(op0, flat2, q6, p0, p1, p7, &f);
      io[8] = lpf_filter16_avx2(oq0, flat2, q7, q0, p0, p7, &f);
      io[9] = lpf_filter16_avx2(oq1, flat2, q7, q1, p6, q0, &f);
      io[10] = lpf_filter16_avx2(oq2, flat2, q7, q2, p5, q1, &f);
      io[11] = lpf_filter16_avx2(q[3], flat2, q7, q3, p4, q2, &f);
      io[12] = lpf_filter16_avx2(q[4], flat2, q7, q4, p3, q3, &f);
      io[13] = lpf_filter16_avx2(q[5], flat2, q7, q5, p2, q4, &f);
      io[14] = lpf_filter16_avx2(q[6], flat2, q7, q6, p1, q5, &f);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#else
    // Note: this variant is even slower; the code is kept so the same
    // idea is not tried again.
    __m256i op2_op1, op0_oq0, oq1_oq2;
    {
      const __m256i four = _mm256_set1_epi16(4);
      const __m256i flatx = _mm256_setr_m128i(flat, flat);
      __m256i f0, f1;

      f0 = _mm256_add_epi16(_mm256_add_epi16(p3, four),
                            _mm256_add_epi16(p3, p2));
      f0 = _mm256_add_epi16(_mm256_add_epi16(p3, f0), _mm256_add_epi16(p2, p1));
      f0 = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f0);

      f1 = filter_add2_sub2_avx2(f0, q1, p1, p2, p3);
      op2_op1 =
          dual_filter8_mask_avx2(flatx, _mm256_setr_m128i(p[2], op1), f0, f1);
      op0_oq0 = dual_filter8_avx2(q2, p0, p1, p3, q3, q0, p0, p3, flatx,
                                  _mm256_setr_m128i(op0, oq0), &f1);
      oq1_oq2 = dual_filter8_avx2(q3, q1, q0, p2, q3, q2, q1, p1, flatx,
                                  _mm256_setr_m128i(oq1, q[2]), &f1);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i flat2x = _mm256_setr_m128i(flat2, flat2);
      __m256i f0, f1, xx;

      f0 = _mm256_sub_epi16(_mm256_slli_epi16(p7, 3), p7); // p7 * 7
      f0 = _mm256_add_epi16(_mm256_slli_epi16(p6, 1), _mm256_add_epi16(p4, f0));
      f0 = _mm256_add_epi16(_mm256_add_epi16(p3, f0), _mm256_add_epi16(p2, p1));
      f0 = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f0);
      f0 = _mm256_add_epi16(_mm256_add_epi16(p5, eight), f0);

      f1 = filter_add2_sub2_avx2(f0, q1, p5, p6, p7);
      xx = dual_filter16_mask_avx2(flat2x, _mm256_setr_m128i(p[6], p[5]), f0,
                                   f1);
      io[1] = _mm256_extracti128_si256(xx, 0);
      io[2] = _mm256_extracti128_si256(xx, 1);

      xx = dual_filter16_avx2(q2, p4, p5, p7, q3, p3, p4, p7, flat2x,
                              _mm256_setr_m128i(p[4], p[3]), &f1);
      io[3] = _mm256_extracti128_si256(xx, 0);
      io[4] = _mm256_extracti128_si256(xx, 1);

      xx = dual_filter16_avx2(q4, p2, p3, p7, q5, p1, p2, p7, flat2x, op2_op1,
                              &f1);
      io[5] = _mm256_extracti128_si256(xx, 0);
      io[6] = _mm256_extracti128_si256(xx, 1);

      xx = dual_filter16_avx2(q6, p0, p1, p7, q7, q0, p0, p7, flat2x, op0_oq0,
                              &f1);
      io[7] = _mm256_extracti128_si256(xx, 0);
      io[8] = _mm256_extracti128_si256(xx, 1);

      xx = dual_filter16_avx2(q7, q1, p6, q0, q7, q2, p5, q1, flat2x, oq1_oq2,
                              &f1);
      io[9] = _mm256_extracti128_si256(xx, 0);
      io[10] = _mm256_extracti128_si256(xx, 1);

      xx = dual_filter16_avx2(q7, q3, p4, q2, q7, q4, p3, q3, flat2x,
                              _mm256_setr_m128i(q[3], q[4]), &f1);
      io[11] = _mm256_extracti128_si256(xx, 0);
      io[12] = _mm256_extracti128_si256(xx, 1);

      xx = dual_filter16_avx2(q7, q5, p2, q4, q7, q6, p1, q5, flat2x,
                              _mm256_setr_m128i(q[5], q[6]), &f1);
      io[13] = _mm256_extracti128_si256(xx, 0);
      io[14] = _mm256_extracti128_si256(xx, 1);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#endif
  }
}
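
/* Horizontal wide-flat filter over a 16-pixel segment. Same algorithm
 * as the 8-wide version above, but the 16-bit arithmetic for all 16
 * columns is kept in single ymm registers: each row is broadcast into
 * both 128-bit lanes and later zero-extended with filt_loopfilter_avx2,
 * leaving columns 0-7 in the low lane and 8-15 in the high lane. */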
void eb_vp9_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                        const unsigned char *_blimit,
                                        const unsigned char *_limit,
                                        const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8((char)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i te0 = _mm_set1_epi8((char)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loop_filter done

    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);

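      /* Narrow 16 words spread over two lanes back to 16 bytes: packus
       * duplicates each lane's 8 results, and permute4x64 with 168
       * (64-bit quads 0, 2, 2, 2) gathers both halves into the low 128
       * bits for _mm256_castsi256_si128. */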
      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);

      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(p256_3, p256_0)),
                            3);

      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(q256_3, q256_0)),
                            3);

      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);

      sum_q7 = _mm256_add_epi16(q256_7, q256_7);

      sum_p3 = _mm256_add_epi16(p256_3, p256_3);

      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);

      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);

      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_1)),
                            3);

      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_1)),
                            3);

      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);

      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);

      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_2)),
                            3);

      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_2)),
                            3);

      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);

      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);

      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);

      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);

      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);

      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);

      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);

      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);

      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

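    /* Byte-wise select between the filter4 outputs and the flat /
     * wide-flat outputs, then store all sixteen filtered rows. */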
    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}
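
/* Vertical wide-flat filter: load the 16x16 block straddling the edge,
 * transpose it so the columns become rows, run the dual horizontal
 * filter, and transpose the result back. */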
void vpx_lpf_vertical_16_dual_avx2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  __m128i io[16];

  // Transpose 16x16
  loadu_8bit_16x16(s - 8, p, io);
  transpose_8bit_16x16_avx2(io, io);

  // Loop filtering
  lpf_horizontal_16_dual_avx2(blimit, limit, thresh, io);

  // Transpose back
  transpose_8bit_16x16_avx2(io, io);
  storeu_8bit_16x16(io, s - 8, p);
}