/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "vpx_dsp_rtcd.h"
#include "mem.h"
#include "EbTranspose_AVX2.h"
#include "mem_sse2.h"

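// Applies the VP9 16-wide ("mb") loop filter to a horizontal edge that is
// 8 pixels wide. s points at the first row below the edge (q0), p is the
// row stride, and _blimit/_limit/_thresh are the per-edge thresholds.
// Rows p7..q7 around the edge are read; rows p6..q6 may be modified.
// Each qNpN register packs the pN row in its low 8 bytes and the qN row in
// its high 8 bytes so both sides of the edge are filtered at once.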
void eb_vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

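  // Build the filter mask and the high-edge-variance (hev) mask, following
  // the scalar reference: a column is filtered only when
  //   2 * |p0 - q0| + |p1 - q1| / 2 <= blimit
  // and |p1 - p0|, |q1 - q0|, |p2 - p1|, |p3 - p2| (and the q-side
  // equivalents) are all <= limit. hev is set where |p1 - p0| or |q1 - q0|
  // exceeds thresh.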
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8((char)0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loop_filter done

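    // Flatness checks: flat is set where p3..p1 and q1..q3 all lie within 1
    // of p0/q0 (those columns take the 7-tap filter8 output), and flat2
    // additionally requires p7..p4 and q4..q7 to be within 1 of p0/q0
    // (those columns take the 15-tap filter16 output). Both masks are ANDed
    // with the basic filter mask.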
    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
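    // pixelFilter_p/q accumulate the 15-tap neighborhood sum (plus the
    // rounding constant 8) used by the wide filter; pixetFilter_p2p1p0 and
    // pixetFilter_q2q1q0 accumulate the 7-tap sum (plus 4) used by filter8.
    // Each subsequent output tap reuses the previous sum, adding the samples
    // that enter the window and subtracting the ones that leave it instead
    // of recomputing the full sum.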
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

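    // _mm_shuffle_epi32(x, 68) copies the low 64 bits of each mask into both
    // halves so it covers the p row (low) and q row (high) of every qNpN
    // register. Each output row is then selected per byte: wide-flat columns
    // take the 15-tap result, flat columns the 7-tap result, and the rest
    // keep the 4-tap (or unfiltered) value before being stored back.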
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

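// Shuffle control for _mm256_shuffle_epi8: within each 128-bit lane, byte
// indices with bit 7 set (128) produce zero, so shuffling a register whose
// two lanes hold the same 16 pixels zero-extends those 16 bytes into 16
// unsigned 16-bit words spread across the 256-bit register.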
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2,  128, 3,  128, 4,  128, 5,  128, 6,  128, 7,  128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

static INLINE __m128i abs_diff(__m128i a, __m128i b) {
    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

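// Sliding-window update of a running filter sum: total + a1 + a2 - s1 - s2.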
static INLINE __m256i filter_add2_sub2_avx2(const __m256i total,
    const __m256i a1, const __m256i a2,
    const __m256i s1,
    const __m256i s2) {
    __m256i x = _mm256_add_epi16(a1, total);
    x = _mm256_add_epi16(_mm256_sub_epi16(x, _mm256_add_epi16(s1, s2)), a2);
    return x;
}

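// Broadcasts the 16 input bytes into both 128-bit lanes and zero-extends
// them to 16-bit words using filt_loopfilter_avx2.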
static INLINE __m256i unpack_8bit_avx2(const __m128i in) {
    const __m256i mask = _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
    const __m256i d = _mm256_inserti128_si256(_mm256_castsi128_si256(in), in, 1);
    return _mm256_shuffle_epi8(d, mask);
}

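// Rounds the accumulated 7-tap sum down to pixels (>> 3), packs back to
// bytes and blends with other_filt under the flat mask. filter16_mask_avx2
// below is the 15-tap equivalent (>> 4) blended under the wide-flat mask.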
static INLINE __m128i filter8_mask_avx2(const __m128i flat,
    const __m128i other_filt,
    const __m256i f) {
    const __m256i ff = _mm256_srli_epi16(f, 3);
    const __m128i f8 = _mm_packus_epi16(_mm256_extracti128_si256(ff, 0),
        _mm256_extracti128_si256(ff, 1));
    const __m128i result = _mm_and_si128(flat, f8);
    return _mm_or_si128(_mm_andnot_si128(flat, other_filt), result);
}

static INLINE __m128i filter16_mask_avx2(const __m128i flat,
    const __m128i other_filt,
    const __m256i f) {
    const __m256i ff = _mm256_srli_epi16(f, 4);
    const __m128i f16 = _mm_packus_epi16(_mm256_extracti128_si256(ff, 0),
        _mm256_extracti128_si256(ff, 1));
    const __m128i result = _mm_and_si128(flat, f16);
    return _mm_or_si128(_mm_andnot_si128(flat, other_filt), result);
}

static INLINE __m128i lpf_filter8_avx2(const __m128i o, const __m128i flat2,
    const __m256i a0, const __m256i a1,
    const __m256i s0, const __m256i s1,
    __m256i *const total) {
    *total = filter_add2_sub2_avx2(*total, a0, a1, s0, s1);
    return filter8_mask_avx2(flat2, o, *total);
}

static INLINE __m128i lpf_filter16_avx2(const __m128i o, const __m128i flat2,
    const __m256i a0, const __m256i a1,
    const __m256i s0, const __m256i s1,
    __m256i *const total) {
    *total = filter_add2_sub2_avx2(*total, a0, a1, s0, s1);
    return filter16_mask_avx2(flat2, o, *total);
}

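// Filters a horizontal edge 16 pixels wide, operating in place on io[]:
// io[0]..io[7] hold rows p7..p0 and io[8]..io[15] hold rows q0..q7.
// Filtered rows are written back to io[1]..io[14] (p6..q6).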
static INLINE void lpf_horizontal_16_dual_avx2(const unsigned char *_blimit,
    const unsigned char *_limit,
    const unsigned char *_thresh,
    __m128i *const io) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i one = _mm_set1_epi8(1);
    const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
    const __m128i limit = _mm_load_si128((const __m128i *)_limit);
    const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
    __m128i mask, hev, flat, flat2;
    __m128i p[8], q[8];
    __m128i op2, op1, op0, oq0, oq1, oq2;
    __m128i max_abs_p1p0q1q0;
    __m256i p0, p1, p2, p3, p4, p5, p6, p7;
    __m256i q0, q1, q2, q3, q4, q5, q6, q7;

    p[7] = io[0];
    p[6] = io[1];
    p[5] = io[2];
    p[4] = io[3];
    p[3] = io[4];
    p[2] = io[5];
    p[1] = io[6];
    p[0] = io[7];
    q[0] = io[8];
    q[1] = io[9];
    q[2] = io[10];
    q[3] = io[11];
    q[4] = io[12];
    q[5] = io[13];
    q[6] = io[14];
    q[7] = io[15];

    p7 = unpack_8bit_avx2(p[7]);
    p6 = unpack_8bit_avx2(p[6]);
    p5 = unpack_8bit_avx2(p[5]);
    p4 = unpack_8bit_avx2(p[4]);
    p3 = unpack_8bit_avx2(p[3]);
    p2 = unpack_8bit_avx2(p[2]);
    p1 = unpack_8bit_avx2(p[1]);
    p0 = unpack_8bit_avx2(p[0]);
    q0 = unpack_8bit_avx2(q[0]);
    q1 = unpack_8bit_avx2(q[1]);
    q2 = unpack_8bit_avx2(q[2]);
    q3 = unpack_8bit_avx2(q[3]);
    q4 = unpack_8bit_avx2(q[4]);
    q5 = unpack_8bit_avx2(q[5]);
    q6 = unpack_8bit_avx2(q[6]);
    q7 = unpack_8bit_avx2(q[7]);

    {
        const __m128i abs_p1p0 = abs_diff(p[1], p[0]);
        const __m128i abs_q1q0 = abs_diff(q[1], q[0]);
        const __m128i fe = _mm_set1_epi8((char)0xfe);
        const __m128i ff = _mm_cmpeq_epi8(zero, zero);
        __m128i abs_p0q0 = abs_diff(p[0], q[0]);
        __m128i abs_p1q1 = abs_diff(p[1], q[1]);
        __m128i work;
        max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;
        work = _mm_max_epu8(abs_diff(p[2], p[1]), abs_diff(p[3], p[2]));
        mask = _mm_max_epu8(work, mask);
        work = _mm_max_epu8(abs_diff(q[2], q[1]), abs_diff(q[3], q[2]));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }

    {
        __m128i work;
        work = _mm_max_epu8(abs_diff(p[2], p[0]), abs_diff(q[2], q[0]));
        flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
        work = _mm_max_epu8(abs_diff(p[3], p[0]), abs_diff(q[3], q[0]));
        flat = _mm_max_epu8(work, flat);
        work = _mm_max_epu8(abs_diff(p[4], p[0]), abs_diff(q[4], q[0]));
        flat = _mm_subs_epu8(flat, one);
        flat = _mm_cmpeq_epi8(flat, zero);
        flat = _mm_and_si128(flat, mask);
        flat2 = _mm_max_epu8(abs_diff(p[5], p[0]), abs_diff(q[5], q[0]));
        flat2 = _mm_max_epu8(work, flat2);
        work = _mm_max_epu8(abs_diff(p[6], p[0]), abs_diff(q[6], q[0]));
        flat2 = _mm_max_epu8(work, flat2);
        work = _mm_max_epu8(abs_diff(p[7], p[0]), abs_diff(q[7], q[0]));
        flat2 = _mm_max_epu8(work, flat2);
        flat2 = _mm_subs_epu8(flat2, one);
        flat2 = _mm_cmpeq_epi8(flat2, zero);
        flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter4
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8((char)0x80);
        const __m128i te0 = _mm_set1_epi8((char)0xe0);
        const __m128i t1f = _mm_set1_epi8(0x1f);
        const __m128i t1 = _mm_set1_epi8(0x1);
        const __m128i t7f = _mm_set1_epi8(0x7f);
        const __m128i ff = _mm_cmpeq_epi8(t4, t4);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;

        op1 = _mm_xor_si128(p[1], t80);
        op0 = _mm_xor_si128(p[0], t80);
        oq0 = _mm_xor_si128(q[0], t80);
        oq1 = _mm_xor_si128(q[1], t80);

        hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
        filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

        work_a = _mm_subs_epi8(oq0, op0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        // (vpx_filter + 3 * (qs0 - ps0)) & mask
        filt = _mm_and_si128(filt, mask);
        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

        // Filter1 >> 3
        work_a = _mm_cmpgt_epi8(zero, filter1);
        filter1 = _mm_srli_epi16(filter1, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter1 = _mm_and_si128(filter1, t1f);
        filter1 = _mm_or_si128(filter1, work_a);
        oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

        // Filter2 >> 3
        work_a = _mm_cmpgt_epi8(zero, filter2);
        filter2 = _mm_srli_epi16(filter2, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter2 = _mm_and_si128(filter2, t1f);
        filter2 = _mm_or_si128(filter2, work_a);
        op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

        // filt >> 1
        filt = _mm_adds_epi8(filter1, t1);
        work_a = _mm_cmpgt_epi8(zero, filt);
        filt = _mm_srli_epi16(filt, 1);
        work_a = _mm_and_si128(work_a, t80);
        filt = _mm_and_si128(filt, t7f);
        filt = _mm_or_si128(filt, work_a);
        filt = _mm_andnot_si128(hev, filt);
        op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
        oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
        // loop_filter done

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // filter8
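        // f8 starts as the rounded 7-tap sum for op2:
        //   3 * p3 + 2 * p2 + p1 + p0 + q0 + 4.
        // Each later tap is produced by lpf_filter8_avx2, which slides the
        // window with one add/sub update before applying filter8_mask_avx2.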
#if 1
        {
            const __m256i four = _mm256_set1_epi16(4);
            __m256i f8;

            f8 = _mm256_add_epi16(_mm256_add_epi16(p3, four),
                _mm256_add_epi16(p3, p2));
            f8 = _mm256_add_epi16(_mm256_add_epi16(p3, f8), _mm256_add_epi16(p2, p1));
            f8 = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f8);

            op2 = filter8_mask_avx2(flat, p[2], f8);
            op1 = lpf_filter8_avx2(op1, flat, q1, p1, p2, p3, &f8);
            op0 = lpf_filter8_avx2(op0, flat, q2, p0, p1, p3, &f8);
            oq0 = lpf_filter8_avx2(oq0, flat, q3, q0, p0, p3, &f8);
            oq1 = lpf_filter8_avx2(oq1, flat, q3, q1, q0, p2, &f8);
            oq2 = lpf_filter8_avx2(q[2], flat, q3, q2, q1, p1, &f8);
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // wide flat calculations
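        // f starts as the rounded 15-tap sum for the p6 output:
        //   7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8.
        // lpf_filter16_avx2 then slides the window for each remaining tap.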
        {
            const __m256i eight = _mm256_set1_epi16(8);
            __m256i f;

            f = _mm256_sub_epi16(_mm256_slli_epi16(p7, 3), p7); // p7 * 7
            f = _mm256_add_epi16(_mm256_slli_epi16(p6, 1), _mm256_add_epi16(p4, f));
            f = _mm256_add_epi16(_mm256_add_epi16(p3, f), _mm256_add_epi16(p2, p1));
            f = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f);
            f = _mm256_add_epi16(_mm256_add_epi16(p5, eight), f);

            io[1] = filter16_mask_avx2(flat2, p[6], f);
            io[2] = lpf_filter16_avx2(p[5], flat2, q1, p5, p6, p7, &f);
            io[3] = lpf_filter16_avx2(p[4], flat2, q2, p4, p5, p7, &f);
            io[4] = lpf_filter16_avx2(p[3], flat2, q3, p3, p4, p7, &f);
            io[5] = lpf_filter16_avx2(op2, flat2, q4, p2, p3, p7, &f);
            io[6] = lpf_filter16_avx2(op1, flat2, q5, p1, p2, p7, &f);
            io[7] = lpf_filter16_avx2(op0, flat2, q6, p0, p1, p7, &f);
            io[8] = lpf_filter16_avx2(oq0, flat2, q7, q0, p0, p7, &f);
            io[9] = lpf_filter16_avx2(oq1, flat2, q7, q1, p6, q0, &f);
            io[10] = lpf_filter16_avx2(oq2, flat2, q7, q2, p5, q1, &f);
            io[11] = lpf_filter16_avx2(q[3], flat2, q7, q3, p4, q2, &f);
            io[12] = lpf_filter16_avx2(q[4], flat2, q7, q4, p3, q3, &f);
            io[13] = lpf_filter16_avx2(q[5], flat2, q7, q5, p2, q4, &f);
            io[14] = lpf_filter16_avx2(q[6], flat2, q7, q6, p1, q5, &f);
        }
        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#else
    // Note: this is even slower. Keep the code so that the same idea is not
    // tried again.
        __m256i op2_op1, op0_oq0, oq1_oq2;
        {
            const __m256i four = _mm256_set1_epi16(4);
            const __m256i flatx = _mm256_setr_m128i(flat, flat);
            __m256i f0, f1;

            f0 = _mm256_add_epi16(_mm256_add_epi16(p3, four),
                _mm256_add_epi16(p3, p2));
            f0 = _mm256_add_epi16(_mm256_add_epi16(p3, f0), _mm256_add_epi16(p2, p1));
            f0 = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f0);

            f1 = filter_add2_sub2_avx2(f0, q1, p1, p2, p3);
            op2_op1 =
                dual_filter8_mask_avx2(flatx, _mm256_setr_m128i(p[2], op1), f0, f1);
            op0_oq0 = dual_filter8_avx2(q2, p0, p1, p3, q3, q0, p0, p3, flatx,
                _mm256_setr_m128i(op0, oq0), &f1);
            oq1_oq2 = dual_filter8_avx2(q3, q1, q0, p2, q3, q2, q1, p1, flatx,
                _mm256_setr_m128i(oq1, q[2]), &f1);
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // wide flat calculations
        {
            const __m256i eight = _mm256_set1_epi16(8);
            const __m256i flat2x = _mm256_setr_m128i(flat2, flat2);
            __m256i f0, f1, xx;

            f0 = _mm256_sub_epi16(_mm256_slli_epi16(p7, 3), p7); // p7 * 7
            f0 = _mm256_add_epi16(_mm256_slli_epi16(p6, 1), _mm256_add_epi16(p4, f0));
            f0 = _mm256_add_epi16(_mm256_add_epi16(p3, f0), _mm256_add_epi16(p2, p1));
            f0 = _mm256_add_epi16(_mm256_add_epi16(p0, q0), f0);
            f0 = _mm256_add_epi16(_mm256_add_epi16(p5, eight), f0);

            f1 = filter_add2_sub2_avx2(f0, q1, p5, p6, p7);
            xx = dual_filter16_mask_avx2(flat2x, _mm256_setr_m128i(p[6], p[5]), f0,
                f1);
            io[1] = _mm256_extracti128_si256(xx, 0);
            io[2] = _mm256_extracti128_si256(xx, 1);

            xx = dual_filter16_avx2(q2, p4, p5, p7, q3, p3, p4, p7, flat2x,
                _mm256_setr_m128i(p[4], p[3]), &f1);
            io[3] = _mm256_extracti128_si256(xx, 0);
            io[4] = _mm256_extracti128_si256(xx, 1);

            xx = dual_filter16_avx2(q4, p2, p3, p7, q5, p1, p2, p7, flat2x, op2_op1,
                &f1);
            io[5] = _mm256_extracti128_si256(xx, 0);
            io[6] = _mm256_extracti128_si256(xx, 1);

            xx = dual_filter16_avx2(q6, p0, p1, p7, q7, q0, p0, p7, flat2x, op0_oq0,
                &f1);
            io[7] = _mm256_extracti128_si256(xx, 0);
            io[8] = _mm256_extracti128_si256(xx, 1);

            xx = dual_filter16_avx2(q7, q1, p6, q0, q7, q2, p5, q1, flat2x, oq1_oq2,
                &f1);
            io[9] = _mm256_extracti128_si256(xx, 0);
            io[10] = _mm256_extracti128_si256(xx, 1);

            xx = dual_filter16_avx2(q7, q3, p4, q2, q7, q4, p3, q3, flat2x,
                _mm256_setr_m128i(q[3], q[4]), &f1);
            io[11] = _mm256_extracti128_si256(xx, 0);
            io[12] = _mm256_extracti128_si256(xx, 1);

            xx = dual_filter16_avx2(q7, q5, p2, q4, q7, q6, p1, q5, flat2x,
                _mm256_setr_m128i(q[5], q[6]), &f1);
            io[13] = _mm256_extracti128_si256(xx, 0);
            io[14] = _mm256_extracti128_si256(xx, 1);
        }
        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#endif
    }
}
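
// Same as eb_vp9_lpf_horizontal_16_avx2 but filters a horizontal edge that
// is 16 pixels wide, using full 128-bit rows and 256-bit registers for the
// 16-bit intermediate sums of the flat and wide-flat filters.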
void eb_vp9_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8((char)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i te0 = _mm_set1_epi8((char)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loop_filter done

    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);

      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);

      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(p256_3, p256_0)),
                            3);

      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(q256_3, q256_0)),
                            3);

      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);

      sum_q7 = _mm256_add_epi16(q256_7, q256_7);

      sum_p3 = _mm256_add_epi16(p256_3, p256_3);

      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);

      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);

      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_1)),
                            3);

      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_1)),
                            3);

      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);

      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);

      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_2)),
                            3);

      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_2)),
                            3);

      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);

      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);

      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);

      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);

      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);

      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);

      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);

      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);

      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}
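
// Vertical-edge variant: transposes a 16x16 block around the edge, reuses
// the horizontal dual filter on the transposed rows, then transposes back.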
void vpx_lpf_vertical_16_dual_avx2(unsigned char *s, int p,
    const uint8_t *blimit, const uint8_t *limit,
    const uint8_t *thresh) {
    __m128i io[16];

    // Transpose 16x16
    loadu_8bit_16x16(s - 8, p, io);
    transpose_8bit_16x16_avx2(io, io);

    // Loop filtering
    lpf_horizontal_16_dual_avx2(blimit, limit, thresh, io);

    // Transpose back
    transpose_8bit_16x16_avx2(io, io);
    storeu_8bit_16x16(io, s - 8, p);
}