/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2

#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"

static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
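// abs_diff() is branch-free: _mm_subs_epu8() saturates at zero, so
// whichever of (a - b) or (b - a) would go negative contributes nothing and
// OR-ing the two results yields |a - b| in every byte lane. A scalar sketch
// of the same trick (illustrative only, not part of the library):
//
//   static uint8_t abs_diff_scalar(uint8_t a, uint8_t b) {
//     const uint8_t d0 = (a > b) ? (uint8_t)(a - b) : 0;  // subs_epu8(a, b)
//     const uint8_t d1 = (b > a) ? (uint8_t)(b - a) : 0;  // subs_epu8(b, a)
//     return d0 | d1;  // at most one term is nonzero
//   }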
#if CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask
#define FILTER_HEV_MASK4                                                      \
  do {                                                                        \
    /* abs(q1 - q0), abs(p1 - p0) */                                          \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1;                                               \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
    /*                                  p1, p0, q0, q1); */                   \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
#endif  // CONFIG_PARALLEL_DEBLOCKING

// filter_mask and hev_mask
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* abs(q1 - q0), abs(p1 - p0) */                                          \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
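// Both FILTER_HEV_MASK variants vectorize the scalar mask logic of the C
// loop filter (the filter_mask()/hev_mask() helpers in aom_dsp/loopfilter.c).
// Per pixel column, roughly:
//
//   hev  = (max(abs(p1 - p0), abs(q1 - q0)) > thresh) ? 0xff : 0;
//   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit &&
//           all neighboring-tap differences <= limit) ? 0xff : 0;
//
// The halving of abs(p1 - q1) relies on a widening trick: unpacking a byte
// x against itself forms the 16-bit value x * 257, and (x * 257) >> 9 equals
// x >> 1 for all x in [0, 255].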
#define FILTER4                                                             \
  do {                                                                      \
    const __m128i t3t4 =                                                    \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
    const __m128i t80 = _mm_set1_epi8(0x80);                                \
    __m128i filter, filter2filter1, work;                                   \
                                                                            \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
                                                                            \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
    filter = _mm_and_si128(filter, mask); /* & mask */                      \
    filter = _mm_unpacklo_epi64(filter, filter);                            \
                                                                            \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
                                                                            \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
    filter = _mm_unpacklo_epi8(filter, filter);                             \
    filter = _mm_srai_epi16(filter, 9); /* round */                         \
    filter = _mm_packs_epi16(filter, filter);                               \
    filter = _mm_andnot_si128(hev, filter);                                 \
                                                                            \
    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
                                                                            \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
  } while (0)
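// FILTER4 is the SIMD form of the standard 4-tap filter update, operating on
// pixels XORed with 0x80 so they behave as signed values. Per column, in
// scalar terms (a sketch; the canonical C version is filter4() in
// aom_dsp/loopfilter.c):
//
//   filter  = clamp(ps1 - qs1) & hev;
//   filter  = clamp(filter + 3 * (qs0 - ps0)) & mask;
//   filter1 = clamp(filter + 4) >> 3;  oq0 = clamp(qs0 - filter1) ^ 0x80;
//   filter2 = clamp(filter + 3) >> 3;  op0 = clamp(ps0 + filter2) ^ 0x80;
//   filter  = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
//   oq1 = clamp(qs1 - filter) ^ 0x80;  op1 = clamp(ps1 + filter) ^ 0x80;
//
// where clamp() is signed_char_clamp(). The >> 3 steps are implemented as
// 16-bit arithmetic shifts by 11 on words whose high byte holds the filter
// value.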
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;
#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
#if !CONFIG_PARALLEL_DEBLOCKING
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

#if CONFIG_PARALLEL_DEBLOCKING
  *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 8);
  *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0);

  *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 8);
  *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0);
#else
  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
#endif
}
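// Usage sketch for aom_lpf_horizontal_4_sse2() above (hypothetical caller;
// field names are illustrative): to filter the horizontal block edge between
// rows y - 1 and y, eight columns at a time, on an 8-bit frame with row
// stride `stride`:
//
//   aom_lpf_horizontal_4_sse2(frame + y * stride + x, stride,
//                             lf->blim, lf->lim, lf->hev_thr);
//
// s points at the first pixel on the q (below-edge) side, and the three
// threshold arguments are the per-lane vectors prepared by the loop-filter
// setup code. aom_lpf_vertical_4_sse2() below handles a vertical edge by
// transposing an 8x8 tile, running the same kernel, and transposing back.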
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i x0, x1, x2, x3;
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
#endif
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
#if !CONFIG_PARALLEL_DEBLOCKING
  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
#endif
}
static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num,
                                       uint8_t *s) {
#if CONFIG_PARALLEL_DEBLOCKING
  *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x);
  const __m128i hi = _mm_srli_si128(*x, 8);
  *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi);
#else
  _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x);
  _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x));
#endif
}
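// store_buffer_horz_8() writes one combined qNpN register back to the frame:
// the low half holds the p-side row and goes to s - (num + 1) * p, the high
// half holds the q-side row and goes to s + num * p. Under
// CONFIG_PARALLEL_DEBLOCKING only the first four pixels of each row are
// written.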
void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit,
                                    const unsigned char *_limit,
                                    const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    store_buffer_horz_8(&q6p6, p, 6, s);

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    store_buffer_horz_8(&q5p5, p, 5, s);

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    store_buffer_horz_8(&q4p4, p, 4, s);

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    store_buffer_horz_8(&q3p3, p, 3, s);

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    store_buffer_horz_8(&q2p2, p, 2, s);

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    store_buffer_horz_8(&q1p1, p, 1, s);

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    store_buffer_horz_8(&q0p0, p, 0, s);
  }
}
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}
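// filter_add2_sub2() advances a running tap sum by one output position:
// total' = total + a1 + a2 - s1 - s2. The filter8/filter16 code below uses
// it to slide each filter window with two adds and two subtracts instead of
// re-summing every tap for every output pixel.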
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i f8 =
      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i f =
      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat, f);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
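// filter8_mask() and filter16_mask() shift a filtered sum down, pack it to
// bytes, and then select per byte between it and the fallback value: SSE2
// has no byte-blend instruction, so (flat & f) | andnot(flat, other_filt)
// emulates one, with *flat being a 0x00/0xff lane mask from the flatness
// tests.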
typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput;

static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x,
                                        int p, int offset, uint8_t *s) {
  int i;
  if (pixel_num == FOUR_PIXELS) {
    for (i = 13; i >= 0; i--) {
      *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]);
    }
  }
  if (pixel_num == EIGHT_PIXELS) {
    for (i = 13; i >= 0; i--) {
      _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]);
    }
  }
  if (pixel_num == SIXTEEN_PIXELS) {
    for (i = 13; i >= 0; i--) {
      _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]);
    }
  }
}
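// With offset == 6 (as used below), store_buffer_horz_16() writes the
// fourteen filtered rows x[13] .. x[0] to s - 7 * p .. s + 6 * p, i.e. p6
// down to q6; pixel_num selects 4-, 8-, or 16-byte stores per row.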
static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num,
                                             unsigned char *s, int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

  __m128i op2, op1, op0, oq0, oq1, oq2;

  __m128i max_abs_p1p0q1q0;

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loopfilter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
    {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;

      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
      f_lo =
          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
                           _mm_add_epi16(p2_lo, p1_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);

      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
      f_hi =
          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
                           _mm_add_epi16(p2_hi, p1_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

      __m128i x[14];
      x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
      x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
      x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
      x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
      x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
      x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
      x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
      x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
      x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
      x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
      x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
      x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
      x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
      x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi);

      store_buffer_horz_16(pixel_num, x, p, 6, s);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  }
}
aom_lpf_horizontal_8_sse2(unsigned char * s,int p,const unsigned char * _blimit,const unsigned char * _limit,const unsigned char * _thresh)1016 void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
1017                                const unsigned char *_blimit,
1018                                const unsigned char *_limit,
1019                                const unsigned char *_thresh) {
1020   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
1021   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
1022   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
1023   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
1024   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
1025   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
1026   const __m128i zero = _mm_set1_epi16(0);
1027   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
1028   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
1029   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
1030   __m128i mask, hev, flat;
1031   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1032   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
1033 
1034   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
1035                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
1036   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
1037                             _mm_loadl_epi64((__m128i *)(s + 2 * p)));
1038   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
1039                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
1040   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
1041                             _mm_loadl_epi64((__m128i *)(s - 0 * p)));
1042   p1q1 = _mm_shuffle_epi32(q1p1, 78);
1043   p0q0 = _mm_shuffle_epi32(q0p0, 78);
1044 
1045   {
1046     // filter_mask and hev_mask
1047     const __m128i one = _mm_set1_epi8(1);
1048     const __m128i fe = _mm_set1_epi8(0xfe);
1049     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
1050     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
1051     abs_p1p0 = abs_diff(q1p1, q0p0);
1052     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
1053 
1054     abs_p0q0 = abs_diff(q0p0, p0q0);
1055     abs_p1q1 = abs_diff(q1p1, p1q1);
1056     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1057     hev = _mm_subs_epu8(flat, thresh);
1058     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1059 
1060     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1061     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1062     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1063     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1064     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1065     mask = _mm_max_epu8(abs_p1p0, mask);
1066     // mask |= (abs(p1 - p0) > limit) * -1;
1067     // mask |= (abs(q1 - q0) > limit) * -1;
1068 
1069     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
1070     mask = _mm_max_epu8(work, mask);
1071     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
1072     mask = _mm_subs_epu8(mask, limit);
1073     mask = _mm_cmpeq_epi8(mask, zero);
1074 
1075     // flat_mask4
1076 
1077     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
1078     flat = _mm_max_epu8(abs_p1p0, flat);
1079     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
1080     flat = _mm_subs_epu8(flat, one);
1081     flat = _mm_cmpeq_epi8(flat, zero);
1082     flat = _mm_and_si128(flat, mask);
1083   }
1084 
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
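  // The block below is the SIMD form of the scalar filter4, roughly:
  //   filt    = clamp(ps1 - qs1) & hev;
  //   filt    = clamp(filt + 3 * (qs0 - ps0)) & mask;
  //   Filter1 = clamp(filt + 4) >> 3;  oq0 = clamp(qs0 - Filter1);
  //   Filter2 = clamp(filt + 3) >> 3;  op0 = clamp(ps0 + Filter2);
  //   filt    = (Filter1 + 1) >> 1;    applied to p1/q1 only where !hev
  // where clamp() is signed 8-bit saturation; the XORs with 0x80 move the
  // unsigned pixels into signed range first.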
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

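    // SSE2 has no 8-bit arithmetic shift, so each signed byte shift below
    // widens the bytes into the high halves of 16-bit lanes (unpack with
    // zero), shifts arithmetically by 8 + n (11 for >> 3, 9 for >> 1), and
    // packs back down with signed saturation.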
    // Filter1 >> 3
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 11);
    filter1 = _mm_packs_epi16(filter1, filter1);

    // Filter2 >> 3
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 11);
    filter2 = _mm_packs_epi16(filter2, zero);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    filt = _mm_unpacklo_epi8(zero, filt);
    filt = _mm_srai_epi16(filt, 9);
    filt = _mm_packs_epi16(filt, zero);

    filt = _mm_andnot_si128(hev, filt);

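    // Per-pixel select between the filter4 and filter8 results:
    // out = (flat & flat8_out) | (~flat & filter4_out).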
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

#if CONFIG_PARALLEL_DEBLOCKING
    *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2);
    *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1);
    *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0);
    *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0);
    *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1);
    *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2);
#else
    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
#endif
  }
}

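// With CONFIG_PARALLEL_DEBLOCKING enabled only four pixels are filtered per
// call; otherwise the full sixteen-pixel edge is handled in one pass.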
void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
#if CONFIG_PARALLEL_DEBLOCKING
  lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh);
#else
  lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh);
#endif
}

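// Dual horizontal 8: the two blimit/limit/thresh triples are packed into the
// low and high 64-bit halves of single registers, so both eight-pixel
// segments are filtered with their own thresholds in one 16-wide pass.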
void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));

  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    // filter_mask and hev_mask
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

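    // Here the signed byte shifts are emulated in place: shift the 16-bit
    // lanes logically, mask off bits that crossed a byte boundary (t1f/t7f),
    // and re-insert sign bits (te0/t80) where the input byte was negative.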
    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}

void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3, p2, q2, q3;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i p1, p0, q0, q1;
  __m128i mask, hev, flat;
#if !CONFIG_PARALLEL_DEBLOCKING
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
#if !CONFIG_PARALLEL_DEBLOCKING
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // filter_mask and hev_mask
  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
#if !CONFIG_PARALLEL_DEBLOCKING
    __m128i work;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
#if !CONFIG_PARALLEL_DEBLOCKING
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}

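// Transpose a 16x8 tile (eight bytes from each of the eight rows at in0 and
// the eight rows at in1) into an 8x16 tile at out, hoisting the
// 8/16/32/64-bit unpack ladder between loads to keep all sixteen rows in
// flight.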
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
  x5 = _mm_unpacklo_epi16(x2, x3);                // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

#if CONFIG_PARALLEL_DEBLOCKING
#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
enum { ROTATE_DWORD_RIGHT = 0x39 };
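// Transpose a 4x16 tile into 16x4: after each 4x4 sub-block is gathered with
// unpacks, pshufd with ROTATE_DWORD_RIGHT (0x39) rotates the next transposed
// dword into lane 0 so movd can peel off the output rows one at a time.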
static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
                                 const uint8_t *pSrc,
                                 const ptrdiff_t srcStride) {
  for (uint32_t idx = 0; idx < 2; idx += 1) {
    __m128i r0, r1, r2, r3;
    // load data
    r0 = movq(pSrc);
    r1 = movq(pSrc + srcStride);
    r2 = movq(pSrc + srcStride * 2);
    r3 = movq(pSrc + srcStride * 3);
    // transpose
    r0 = punpcklbw(r0, r1);
    r2 = punpcklbw(r2, r3);
    r1 = punpckhwd(r0, r2);
    r0 = punpcklwd(r0, r2);
    // store data
    movd(pDst, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 2, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 3, r0);
    movd(pDst + dstStride * 4, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 5, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 6, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 7, r1);
    // advance the pointers
    pDst += dstStride * 8;
    pSrc += 8;
  }
}

#endif  // CONFIG_PARALLEL_DEBLOCKING
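// Generic helper: transpose num_8x8_to_transpose independent 8x8 byte
// blocks, with per-block source and destination pointers in src[]/dst[].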
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

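// Vertical edges reuse the horizontal kernels via a transpose /
// filter / transpose-back pattern: columns are transposed into rows in
// t_dst, the horizontal filter runs there, and the result is transposed
// back into place.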
void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
#if !CONFIG_PARALLEL_DEBLOCKING
  unsigned char *src[2];
  unsigned char *dst[2];
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
#if !CONFIG_PARALLEL_DEBLOCKING
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
#else  // CONFIG_PARALLEL_DEBLOCKING
  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
}

void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
  unsigned char *src[1];
  unsigned char *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  transpose(src, p, dst, 8, 1);

  // Loop filtering
  aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  transpose(src, 8, dst, p, 1);
}

void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
  unsigned char *src[2];
  unsigned char *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  transpose(src, p, dst, 8, 2);

  // Loop filtering
  aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  transpose(src, 8, dst, p, 2);
}

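// 16x16 case: two 8x16 transposes gather the eight columns on each side of
// the edge into t_dst before the 16-wide horizontal kernel runs, and two
// more restore them afterwards.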
void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  // Loop filtering
  aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);

  // Transpose back
  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}