1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h> // SSE2
12
13 #include "./vp9_rtcd.h"
14 #include "vpx_ports/mem.h"
15 #include "vp9/common/vp9_loopfilter.h"
16 #include "vpx_ports/emmintrin_compat.h"
17
// Clamps each signed 16-bit lane of value to the signed range of a bd-bit
// sample that has had its mid-point removed:
//   bd == 8  -> [-0x80,  0x7f]
//   bd == 10 -> [-0x200, 0x1ff]
//   otherwise (treated as 12) -> [-0x800, 0x7ff]
// Lanes already inside the range are returned unchanged.
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  // Half of the sample range, i.e. 1 << (bd - 1) for the supported depths.
  const int half_range = (bd == 8) ? 0x80 : ((bd == 10) ? 0x200 : 0x800);
  const __m128i max = _mm_set1_epi16((short)(half_range - 1));
  const __m128i min = _mm_set1_epi16((short)(-half_range));

  // min < max, so applying the upper bound first and the lower bound second
  // gives the same result as selecting between value, max and min by
  // comparison masks.
  return _mm_max_epi16(_mm_min_epi16(value, max), min);
}
52
53 // TODO(debargha, peter): Break up large functions into smaller ones
54 // in this file.
highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t * s,int p,const uint8_t * _blimit,const uint8_t * _limit,const uint8_t * _thresh,int bd)55 static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
56 int p,
57 const uint8_t *_blimit,
58 const uint8_t *_limit,
59 const uint8_t *_thresh,
60 int bd) {
61 const __m128i zero = _mm_set1_epi16(0);
62 const __m128i one = _mm_set1_epi16(1);
63 __m128i blimit, limit, thresh;
64 __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
65 __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
66 __m128i ps1, qs1, ps0, qs0;
67 __m128i abs_p0q0, abs_p1q1, ffff, work;
68 __m128i filt, work_a, filter1, filter2;
69 __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
70 __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
71 __m128i flat2_q0, flat2_p0;
72 __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
73 __m128i pixelFilter_p, pixelFilter_q;
74 __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
75 __m128i sum_p7, sum_q7, sum_p3, sum_q3;
76 __m128i t4, t3, t80, t1;
77 __m128i eight, four;
78
79 if (bd == 8) {
80 blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
81 limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
82 thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
83 } else if (bd == 10) {
84 blimit = _mm_slli_epi16(
85 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
86 limit = _mm_slli_epi16(
87 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
88 thresh = _mm_slli_epi16(
89 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
90 } else { // bd == 12
91 blimit = _mm_slli_epi16(
92 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
93 limit = _mm_slli_epi16(
94 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
95 thresh = _mm_slli_epi16(
96 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
97 }
98
99 q4 = _mm_load_si128((__m128i *)(s + 4 * p));
100 p4 = _mm_load_si128((__m128i *)(s - 5 * p));
101 q3 = _mm_load_si128((__m128i *)(s + 3 * p));
102 p3 = _mm_load_si128((__m128i *)(s - 4 * p));
103 q2 = _mm_load_si128((__m128i *)(s + 2 * p));
104 p2 = _mm_load_si128((__m128i *)(s - 3 * p));
105 q1 = _mm_load_si128((__m128i *)(s + 1 * p));
106 p1 = _mm_load_si128((__m128i *)(s - 2 * p));
107 q0 = _mm_load_si128((__m128i *)(s + 0 * p));
108 p0 = _mm_load_si128((__m128i *)(s - 1 * p));
109
110 // highbd_filter_mask
111 abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
112 abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
113
114 ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
115
116 abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
117 abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
118
119 // highbd_hev_mask (in C code this is actually called from highbd_filter4)
120 flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
121 hev = _mm_subs_epu16(flat, thresh);
122 hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
123
124 abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
125 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
126 mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
127 mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
128 mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
129 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p1, p0),
130 _mm_subs_epu16(p0, p1)),
131 _mm_or_si128(_mm_subs_epu16(q1, q0),
132 _mm_subs_epu16(q0, q1)));
133 mask = _mm_max_epi16(work, mask);
134 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
135 _mm_subs_epu16(p1, p2)),
136 _mm_or_si128(_mm_subs_epu16(q2, q1),
137 _mm_subs_epu16(q1, q2)));
138 mask = _mm_max_epi16(work, mask);
139 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
140 _mm_subs_epu16(p2, p3)),
141 _mm_or_si128(_mm_subs_epu16(q3, q2),
142 _mm_subs_epu16(q2, q3)));
143 mask = _mm_max_epi16(work, mask);
144
145 mask = _mm_subs_epu16(mask, limit);
146 mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
147
148 // lp filter
149 // highbd_filter4
150 t4 = _mm_set1_epi16(4);
151 t3 = _mm_set1_epi16(3);
152 if (bd == 8)
153 t80 = _mm_set1_epi16(0x80);
154 else if (bd == 10)
155 t80 = _mm_set1_epi16(0x200);
156 else // bd == 12
157 t80 = _mm_set1_epi16(0x800);
158
159 t1 = _mm_set1_epi16(0x1);
160
161 ps1 = _mm_subs_epi16(p1, t80);
162 qs1 = _mm_subs_epi16(q1, t80);
163 ps0 = _mm_subs_epi16(p0, t80);
164 qs0 = _mm_subs_epi16(q0, t80);
165
166 filt = _mm_and_si128(
167 signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), hev);
168 work_a = _mm_subs_epi16(qs0, ps0);
169 filt = _mm_adds_epi16(filt, work_a);
170 filt = _mm_adds_epi16(filt, work_a);
171 filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
172 filt = _mm_and_si128(filt, mask);
173 filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
174 filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
175
176 // Filter1 >> 3
177 filter1 = _mm_srai_epi16(filter1, 0x3);
178 filter2 = _mm_srai_epi16(filter2, 0x3);
179
180 qs0 = _mm_adds_epi16(
181 signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd),
182 t80);
183 ps0 = _mm_adds_epi16(
184 signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd),
185 t80);
186 filt = _mm_adds_epi16(filter1, t1);
187 filt = _mm_srai_epi16(filt, 1);
188 filt = _mm_andnot_si128(hev, filt);
189 qs1 = _mm_adds_epi16(
190 signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
191 t80);
192 ps1 = _mm_adds_epi16(
193 signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
194 t80);
195
196 // end highbd_filter4
197 // loopfilter done
198
199 // highbd_flat_mask4
200 flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
201 _mm_subs_epu16(p0, p2)),
202 _mm_or_si128(_mm_subs_epu16(p3, p0),
203 _mm_subs_epu16(p0, p3)));
204 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q0),
205 _mm_subs_epu16(q0, q2)),
206 _mm_or_si128(_mm_subs_epu16(q3, q0),
207 _mm_subs_epu16(q0, q3)));
208 flat = _mm_max_epi16(work, flat);
209 work = _mm_max_epi16(abs_p1p0, abs_q1q0);
210 flat = _mm_max_epi16(work, flat);
211
212 if (bd == 8)
213 flat = _mm_subs_epu16(flat, one);
214 else if (bd == 10)
215 flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
216 else // bd == 12
217 flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
218
219 flat = _mm_cmpeq_epi16(flat, zero);
220 // end flat_mask4
221
222 // flat & mask = flat && mask (as used in filter8)
223 // (because, in both vars, each block of 16 either all 1s or all 0s)
224 flat = _mm_and_si128(flat, mask);
225
226 p5 = _mm_load_si128((__m128i *)(s - 6 * p));
227 q5 = _mm_load_si128((__m128i *)(s + 5 * p));
228 p6 = _mm_load_si128((__m128i *)(s - 7 * p));
229 q6 = _mm_load_si128((__m128i *)(s + 6 * p));
230 p7 = _mm_load_si128((__m128i *)(s - 8 * p));
231 q7 = _mm_load_si128((__m128i *)(s + 7 * p));
232
233 // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
234 // but referred to as p0-p4 & q0-q4 in fn)
235 flat2 = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p4, p0),
236 _mm_subs_epu16(p0, p4)),
237 _mm_or_si128(_mm_subs_epu16(q4, q0),
238 _mm_subs_epu16(q0, q4)));
239
240 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p5, p0),
241 _mm_subs_epu16(p0, p5)),
242 _mm_or_si128(_mm_subs_epu16(q5, q0),
243 _mm_subs_epu16(q0, q5)));
244 flat2 = _mm_max_epi16(work, flat2);
245
246 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p6, p0),
247 _mm_subs_epu16(p0, p6)),
248 _mm_or_si128(_mm_subs_epu16(q6, q0),
249 _mm_subs_epu16(q0, q6)));
250 flat2 = _mm_max_epi16(work, flat2);
251
252 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p7, p0),
253 _mm_subs_epu16(p0, p7)),
254 _mm_or_si128(_mm_subs_epu16(q7, q0),
255 _mm_subs_epu16(q0, q7)));
256 flat2 = _mm_max_epi16(work, flat2);
257
258 if (bd == 8)
259 flat2 = _mm_subs_epu16(flat2, one);
260 else if (bd == 10)
261 flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
262 else // bd == 12
263 flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
264
265 flat2 = _mm_cmpeq_epi16(flat2, zero);
266 flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
267 // end highbd_flat_mask5
268
269 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
270 // flat and wide flat calculations
271 eight = _mm_set1_epi16(8);
272 four = _mm_set1_epi16(4);
273
274 pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5),
275 _mm_add_epi16(p4, p3));
276 pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5),
277 _mm_add_epi16(q4, q3));
278
279 pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
280 pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
281
282 pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
283 pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
284 pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
285 pixelFilter_q));
286 pixetFilter_p2p1p0 = _mm_add_epi16(four,
287 _mm_add_epi16(pixetFilter_p2p1p0,
288 pixetFilter_q2q1q0));
289 flat2_p0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
290 _mm_add_epi16(p7, p0)), 4);
291 flat2_q0 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
292 _mm_add_epi16(q7, q0)), 4);
293 flat_p0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
294 _mm_add_epi16(p3, p0)), 3);
295 flat_q0 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
296 _mm_add_epi16(q3, q0)), 3);
297
298 sum_p7 = _mm_add_epi16(p7, p7);
299 sum_q7 = _mm_add_epi16(q7, q7);
300 sum_p3 = _mm_add_epi16(p3, p3);
301 sum_q3 = _mm_add_epi16(q3, q3);
302
303 pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
304 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
305 flat2_p1 = _mm_srli_epi16(
306 _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
307 flat2_q1 = _mm_srli_epi16(
308 _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
309
310 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
311 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
312 flat_p1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
313 _mm_add_epi16(sum_p3, p1)), 3);
314 flat_q1 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
315 _mm_add_epi16(sum_q3, q1)), 3);
316
317 sum_p7 = _mm_add_epi16(sum_p7, p7);
318 sum_q7 = _mm_add_epi16(sum_q7, q7);
319 sum_p3 = _mm_add_epi16(sum_p3, p3);
320 sum_q3 = _mm_add_epi16(sum_q3, q3);
321
322 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
323 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
324 flat2_p2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
325 _mm_add_epi16(sum_p7, p2)), 4);
326 flat2_q2 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
327 _mm_add_epi16(sum_q7, q2)), 4);
328
329 pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
330 pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
331 flat_p2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
332 _mm_add_epi16(sum_p3, p2)), 3);
333 flat_q2 = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
334 _mm_add_epi16(sum_q3, q2)), 3);
335
336 sum_p7 = _mm_add_epi16(sum_p7, p7);
337 sum_q7 = _mm_add_epi16(sum_q7, q7);
338 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
339 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
340 flat2_p3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
341 _mm_add_epi16(sum_p7, p3)), 4);
342 flat2_q3 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
343 _mm_add_epi16(sum_q7, q3)), 4);
344
345 sum_p7 = _mm_add_epi16(sum_p7, p7);
346 sum_q7 = _mm_add_epi16(sum_q7, q7);
347 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
348 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
349 flat2_p4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
350 _mm_add_epi16(sum_p7, p4)), 4);
351 flat2_q4 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
352 _mm_add_epi16(sum_q7, q4)), 4);
353
354 sum_p7 = _mm_add_epi16(sum_p7, p7);
355 sum_q7 = _mm_add_epi16(sum_q7, q7);
356 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
357 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
358 flat2_p5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
359 _mm_add_epi16(sum_p7, p5)), 4);
360 flat2_q5 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
361 _mm_add_epi16(sum_q7, q5)), 4);
362
363 sum_p7 = _mm_add_epi16(sum_p7, p7);
364 sum_q7 = _mm_add_epi16(sum_q7, q7);
365 pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
366 pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
367 flat2_p6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
368 _mm_add_epi16(sum_p7, p6)), 4);
369 flat2_q6 = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
370 _mm_add_epi16(sum_q7, q6)), 4);
371
372 // wide flat
373 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
374
375 // highbd_filter8
376 p2 = _mm_andnot_si128(flat, p2);
377 // p2 remains unchanged if !(flat && mask)
378 flat_p2 = _mm_and_si128(flat, flat_p2);
379 // when (flat && mask)
380 p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
381 q2 = _mm_andnot_si128(flat, q2);
382 flat_q2 = _mm_and_si128(flat, flat_q2);
383 q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
384
385 ps1 = _mm_andnot_si128(flat, ps1);
386 // p1 takes the value assigned to in in filter4 if !(flat && mask)
387 flat_p1 = _mm_and_si128(flat, flat_p1);
388 // when (flat && mask)
389 p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
390 qs1 = _mm_andnot_si128(flat, qs1);
391 flat_q1 = _mm_and_si128(flat, flat_q1);
392 q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
393
394 ps0 = _mm_andnot_si128(flat, ps0);
395 // p0 takes the value assigned to in in filter4 if !(flat && mask)
396 flat_p0 = _mm_and_si128(flat, flat_p0);
397 // when (flat && mask)
398 p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
399 qs0 = _mm_andnot_si128(flat, qs0);
400 flat_q0 = _mm_and_si128(flat, flat_q0);
401 q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
402 // end highbd_filter8
403
404 // highbd_filter16
405 p6 = _mm_andnot_si128(flat2, p6);
406 // p6 remains unchanged if !(flat2 && flat && mask)
407 flat2_p6 = _mm_and_si128(flat2, flat2_p6);
408 // get values for when (flat2 && flat && mask)
409 p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
410 q6 = _mm_andnot_si128(flat2, q6);
411 // q6 remains unchanged if !(flat2 && flat && mask)
412 flat2_q6 = _mm_and_si128(flat2, flat2_q6);
413 // get values for when (flat2 && flat && mask)
414 q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
415 _mm_store_si128((__m128i *)(s - 7 * p), p6);
416 _mm_store_si128((__m128i *)(s + 6 * p), q6);
417
418 p5 = _mm_andnot_si128(flat2, p5);
419 // p5 remains unchanged if !(flat2 && flat && mask)
420 flat2_p5 = _mm_and_si128(flat2, flat2_p5);
421 // get values for when (flat2 && flat && mask)
422 p5 = _mm_or_si128(p5, flat2_p5);
423 // full list of p5 values
424 q5 = _mm_andnot_si128(flat2, q5);
425 // q5 remains unchanged if !(flat2 && flat && mask)
426 flat2_q5 = _mm_and_si128(flat2, flat2_q5);
427 // get values for when (flat2 && flat && mask)
428 q5 = _mm_or_si128(q5, flat2_q5);
429 // full list of q5 values
430 _mm_store_si128((__m128i *)(s - 6 * p), p5);
431 _mm_store_si128((__m128i *)(s + 5 * p), q5);
432
433 p4 = _mm_andnot_si128(flat2, p4);
434 // p4 remains unchanged if !(flat2 && flat && mask)
435 flat2_p4 = _mm_and_si128(flat2, flat2_p4);
436 // get values for when (flat2 && flat && mask)
437 p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
438 q4 = _mm_andnot_si128(flat2, q4);
439 // q4 remains unchanged if !(flat2 && flat && mask)
440 flat2_q4 = _mm_and_si128(flat2, flat2_q4);
441 // get values for when (flat2 && flat && mask)
442 q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
443 _mm_store_si128((__m128i *)(s - 5 * p), p4);
444 _mm_store_si128((__m128i *)(s + 4 * p), q4);
445
446 p3 = _mm_andnot_si128(flat2, p3);
447 // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
448 flat2_p3 = _mm_and_si128(flat2, flat2_p3);
449 // get values for when (flat2 && flat && mask)
450 p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
451 q3 = _mm_andnot_si128(flat2, q3);
452 // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
453 flat2_q3 = _mm_and_si128(flat2, flat2_q3);
454 // get values for when (flat2 && flat && mask)
455 q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
456 _mm_store_si128((__m128i *)(s - 4 * p), p3);
457 _mm_store_si128((__m128i *)(s + 3 * p), q3);
458
459 p2 = _mm_andnot_si128(flat2, p2);
460 // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
461 flat2_p2 = _mm_and_si128(flat2, flat2_p2);
462 // get values for when (flat2 && flat && mask)
463 p2 = _mm_or_si128(p2, flat2_p2);
464 // full list of p2 values
465 q2 = _mm_andnot_si128(flat2, q2);
466 // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
467 flat2_q2 = _mm_and_si128(flat2, flat2_q2);
468 // get values for when (flat2 && flat && mask)
469 q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
470 _mm_store_si128((__m128i *)(s - 3 * p), p2);
471 _mm_store_si128((__m128i *)(s + 2 * p), q2);
472
473 p1 = _mm_andnot_si128(flat2, p1);
474 // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
475 flat2_p1 = _mm_and_si128(flat2, flat2_p1);
476 // get values for when (flat2 && flat && mask)
477 p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
478 q1 = _mm_andnot_si128(flat2, q1);
479 // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
480 flat2_q1 = _mm_and_si128(flat2, flat2_q1);
481 // get values for when (flat2 && flat && mask)
482 q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
483 _mm_store_si128((__m128i *)(s - 2 * p), p1);
484 _mm_store_si128((__m128i *)(s + 1 * p), q1);
485
486 p0 = _mm_andnot_si128(flat2, p0);
487 // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
488 flat2_p0 = _mm_and_si128(flat2, flat2_p0);
489 // get values for when (flat2 && flat && mask)
490 p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
491 q0 = _mm_andnot_si128(flat2, q0);
492 // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
493 flat2_q0 = _mm_and_si128(flat2, flat2_q0);
494 // get values for when (flat2 && flat && mask)
495 q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
496 _mm_store_si128((__m128i *)(s - 1 * p), p0);
497 _mm_store_si128((__m128i *)(s - 0 * p), q0);
498 }
499
// 16-column variant of the wide horizontal edge filter: runs the 8-column
// filter on columns 0..7 and then on columns 8..15, using the same
// thresholds and bit depth for both halves.
static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
                                                    int p,
                                                    const uint8_t *_blimit,
                                                    const uint8_t *_limit,
                                                    const uint8_t *_thresh,
                                                    int bd) {
  int col;

  for (col = 0; col < 16; col += 8) {
    highbd_mb_lpf_horizontal_edge_w_sse2_8(s + col, p, _blimit, _limit,
                                           _thresh, bd);
  }
}
510
511 // TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
// Public entry point for the wide horizontal edge filter. A count of 1
// filters eight columns starting at s; any other count filters sixteen.
void vp9_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
                                       int count, int bd) {
  if (count != 1) {
    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh,
                                            bd);
    return;
  }

  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
}
522
vp9_highbd_lpf_horizontal_8_sse2(uint16_t * s,int p,const uint8_t * _blimit,const uint8_t * _limit,const uint8_t * _thresh,int count,int bd)523 void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
524 const uint8_t *_blimit,
525 const uint8_t *_limit,
526 const uint8_t *_thresh,
527 int count, int bd) {
528 DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
529 DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
530 DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
531 DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
532 DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
533 DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
534 const __m128i zero = _mm_set1_epi16(0);
535 __m128i blimit, limit, thresh;
536 __m128i mask, hev, flat;
537 __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
538 __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
539 __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
540 __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
541 __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
542 __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
543 __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
544 __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
545 const __m128i one = _mm_set1_epi16(1);
546 const __m128i ffff = _mm_cmpeq_epi16(one, one);
547 __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
548 const __m128i four = _mm_set1_epi16(4);
549 __m128i workp_a, workp_b, workp_shft;
550
551 const __m128i t4 = _mm_set1_epi16(4);
552 const __m128i t3 = _mm_set1_epi16(3);
553 __m128i t80;
554 const __m128i t1 = _mm_set1_epi16(0x1);
555 __m128i ps1, ps0, qs0, qs1;
556 __m128i filt;
557 __m128i work_a;
558 __m128i filter1, filter2;
559
560 (void)count;
561
562 if (bd == 8) {
563 blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
564 limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
565 thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
566 t80 = _mm_set1_epi16(0x80);
567 } else if (bd == 10) {
568 blimit = _mm_slli_epi16(
569 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
570 limit = _mm_slli_epi16(
571 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
572 thresh = _mm_slli_epi16(
573 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
574 t80 = _mm_set1_epi16(0x200);
575 } else { // bd == 12
576 blimit = _mm_slli_epi16(
577 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
578 limit = _mm_slli_epi16(
579 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
580 thresh = _mm_slli_epi16(
581 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
582 t80 = _mm_set1_epi16(0x800);
583 }
584
585 ps1 = _mm_subs_epi16(p1, t80);
586 ps0 = _mm_subs_epi16(p0, t80);
587 qs0 = _mm_subs_epi16(q0, t80);
588 qs1 = _mm_subs_epi16(q1, t80);
589
590 // filter_mask and hev_mask
591 abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
592 _mm_subs_epu16(p0, p1));
593 abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
594 _mm_subs_epu16(q0, q1));
595
596 abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
597 _mm_subs_epu16(q0, p0));
598 abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
599 _mm_subs_epu16(q1, p1));
600 flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
601 hev = _mm_subs_epu16(flat, thresh);
602 hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
603
604 abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
605 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
606 mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
607 mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
608 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
609 // So taking maximums continues to work:
610 mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
611 mask = _mm_max_epi16(abs_p1p0, mask);
612 // mask |= (abs(p1 - p0) > limit) * -1;
613 mask = _mm_max_epi16(abs_q1q0, mask);
614 // mask |= (abs(q1 - q0) > limit) * -1;
615
616 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
617 _mm_subs_epu16(p1, p2)),
618 _mm_or_si128(_mm_subs_epu16(q2, q1),
619 _mm_subs_epu16(q1, q2)));
620 mask = _mm_max_epi16(work, mask);
621 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p2),
622 _mm_subs_epu16(p2, p3)),
623 _mm_or_si128(_mm_subs_epu16(q3, q2),
624 _mm_subs_epu16(q2, q3)));
625 mask = _mm_max_epi16(work, mask);
626 mask = _mm_subs_epu16(mask, limit);
627 mask = _mm_cmpeq_epi16(mask, zero);
628
629 // flat_mask4
630 flat = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p0),
631 _mm_subs_epu16(p0, p2)),
632 _mm_or_si128(_mm_subs_epu16(q2, q0),
633 _mm_subs_epu16(q0, q2)));
634 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p3, p0),
635 _mm_subs_epu16(p0, p3)),
636 _mm_or_si128(_mm_subs_epu16(q3, q0),
637 _mm_subs_epu16(q0, q3)));
638 flat = _mm_max_epi16(work, flat);
639 flat = _mm_max_epi16(abs_p1p0, flat);
640 flat = _mm_max_epi16(abs_q1q0, flat);
641
642 if (bd == 8)
643 flat = _mm_subs_epu16(flat, one);
644 else if (bd == 10)
645 flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
646 else // bd == 12
647 flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
648
649 flat = _mm_cmpeq_epi16(flat, zero);
650 flat = _mm_and_si128(flat, mask); // flat & mask
651
652 // Added before shift for rounding part of ROUND_POWER_OF_TWO
653
654 workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
655 workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
656 workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
657 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
658 _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
659
660 workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
661 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
662 _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
663
664 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
665 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
666 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
667 _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
668
669 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
670 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
671 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
672 _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
673
674 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
675 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
676 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
677 _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
678
679 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
680 workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
681 workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
682 _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
683
684 // lp filter
685 filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
686 filt = _mm_and_si128(filt, hev);
687 work_a = _mm_subs_epi16(qs0, ps0);
688 filt = _mm_adds_epi16(filt, work_a);
689 filt = _mm_adds_epi16(filt, work_a);
690 filt = _mm_adds_epi16(filt, work_a);
691 // (vp9_filter + 3 * (qs0 - ps0)) & mask
692 filt = signed_char_clamp_bd_sse2(filt, bd);
693 filt = _mm_and_si128(filt, mask);
694
695 filter1 = _mm_adds_epi16(filt, t4);
696 filter2 = _mm_adds_epi16(filt, t3);
697
698 // Filter1 >> 3
699 filter1 = signed_char_clamp_bd_sse2(filter1, bd);
700 filter1 = _mm_srai_epi16(filter1, 3);
701
702 // Filter2 >> 3
703 filter2 = signed_char_clamp_bd_sse2(filter2, bd);
704 filter2 = _mm_srai_epi16(filter2, 3);
705
706 // filt >> 1
707 filt = _mm_adds_epi16(filter1, t1);
708 filt = _mm_srai_epi16(filt, 1);
709 // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
710 filt = _mm_andnot_si128(hev, filt);
711
712 work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
713 work_a = _mm_adds_epi16(work_a, t80);
714 q0 = _mm_load_si128((__m128i *)flat_oq0);
715 work_a = _mm_andnot_si128(flat, work_a);
716 q0 = _mm_and_si128(flat, q0);
717 q0 = _mm_or_si128(work_a, q0);
718
719 work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
720 work_a = _mm_adds_epi16(work_a, t80);
721 q1 = _mm_load_si128((__m128i *)flat_oq1);
722 work_a = _mm_andnot_si128(flat, work_a);
723 q1 = _mm_and_si128(flat, q1);
724 q1 = _mm_or_si128(work_a, q1);
725
726 work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
727 q2 = _mm_load_si128((__m128i *)flat_oq2);
728 work_a = _mm_andnot_si128(flat, work_a);
729 q2 = _mm_and_si128(flat, q2);
730 q2 = _mm_or_si128(work_a, q2);
731
732 work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
733 work_a = _mm_adds_epi16(work_a, t80);
734 p0 = _mm_load_si128((__m128i *)flat_op0);
735 work_a = _mm_andnot_si128(flat, work_a);
736 p0 = _mm_and_si128(flat, p0);
737 p0 = _mm_or_si128(work_a, p0);
738
739 work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
740 work_a = _mm_adds_epi16(work_a, t80);
741 p1 = _mm_load_si128((__m128i *)flat_op1);
742 work_a = _mm_andnot_si128(flat, work_a);
743 p1 = _mm_and_si128(flat, p1);
744 p1 = _mm_or_si128(work_a, p1);
745
746 work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
747 p2 = _mm_load_si128((__m128i *)flat_op2);
748 work_a = _mm_andnot_si128(flat, work_a);
749 p2 = _mm_and_si128(flat, p2);
750 p2 = _mm_or_si128(work_a, p2);
751
752 _mm_store_si128((__m128i *)(s - 3 * p), p2);
753 _mm_store_si128((__m128i *)(s - 2 * p), p1);
754 _mm_store_si128((__m128i *)(s - 1 * p), p0);
755 _mm_store_si128((__m128i *)(s + 0 * p), q0);
756 _mm_store_si128((__m128i *)(s + 1 * p), q1);
757 _mm_store_si128((__m128i *)(s + 2 * p), q2);
758 }
759
// Filters two adjacent 8-column groups across the same horizontal edge,
// each with its own thresholds: columns 0..7 use the *0 set and columns
// 8..15 use the *1 set. Both calls pass a count of 1.
void vp9_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
                                           const uint8_t *_blimit0,
                                           const uint8_t *_limit0,
                                           const uint8_t *_thresh0,
                                           const uint8_t *_blimit1,
                                           const uint8_t *_limit1,
                                           const uint8_t *_thresh1,
                                           int bd) {
  uint16_t *const second_half = s + 8;

  vp9_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
  vp9_highbd_lpf_horizontal_8_sse2(second_half, p, _blimit1, _limit1,
                                   _thresh1, 1, bd);
}
772
vp9_highbd_lpf_horizontal_4_sse2(uint16_t * s,int p,const uint8_t * _blimit,const uint8_t * _limit,const uint8_t * _thresh,int count,int bd)773 void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
774 const uint8_t *_blimit,
775 const uint8_t *_limit,
776 const uint8_t *_thresh,
777 int count, int bd) {
778 const __m128i zero = _mm_set1_epi16(0);
779 __m128i blimit, limit, thresh;
780 __m128i mask, hev, flat;
781 __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
782 __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
783 __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
784 __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
785 __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
786 __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
787 __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
788 __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
789 const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
790 _mm_subs_epu16(p0, p1));
791 const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0),
792 _mm_subs_epu16(q0, q1));
793 const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
794 const __m128i one = _mm_set1_epi16(1);
795 __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0),
796 _mm_subs_epu16(q0, p0));
797 __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1),
798 _mm_subs_epu16(q1, p1));
799 __m128i work;
800 const __m128i t4 = _mm_set1_epi16(4);
801 const __m128i t3 = _mm_set1_epi16(3);
802 __m128i t80;
803 __m128i tff80;
804 __m128i tffe0;
805 __m128i t1f;
806 // equivalent to shifting 0x1f left by bitdepth - 8
807 // and setting new bits to 1
808 const __m128i t1 = _mm_set1_epi16(0x1);
809 __m128i t7f;
810 // equivalent to shifting 0x7f left by bitdepth - 8
811 // and setting new bits to 1
812 __m128i ps1, ps0, qs0, qs1;
813 __m128i filt;
814 __m128i work_a;
815 __m128i filter1, filter2;
816
817 (void)count;
818
819 if (bd == 8) {
820 blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
821 limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
822 thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
823 t80 = _mm_set1_epi16(0x80);
824 tff80 = _mm_set1_epi16(0xff80);
825 tffe0 = _mm_set1_epi16(0xffe0);
826 t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
827 t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
828 } else if (bd == 10) {
829 blimit = _mm_slli_epi16(
830 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
831 limit = _mm_slli_epi16(
832 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
833 thresh = _mm_slli_epi16(
834 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
835 t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
836 tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
837 tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
838 t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
839 t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
840 } else { // bd == 12
841 blimit = _mm_slli_epi16(
842 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
843 limit = _mm_slli_epi16(
844 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
845 thresh = _mm_slli_epi16(
846 _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
847 t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
848 tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
849 tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
850 t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
851 t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
852 }
853
854 ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
855 ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
856 qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
857 qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
858
859 // filter_mask and hev_mask
860 flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
861 hev = _mm_subs_epu16(flat, thresh);
862 hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
863
864 abs_p0q0 =_mm_adds_epu16(abs_p0q0, abs_p0q0);
865 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
866 mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
867 mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
868 // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
869 // So taking maximums continues to work:
870 mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
871 mask = _mm_max_epi16(flat, mask);
872 // mask |= (abs(p1 - p0) > limit) * -1;
873 // mask |= (abs(q1 - q0) > limit) * -1;
874 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(p2, p1),
875 _mm_subs_epu16(p1, p2)),
876 _mm_or_si128(_mm_subs_epu16(p3, p2),
877 _mm_subs_epu16(p2, p3)));
878 mask = _mm_max_epi16(work, mask);
879 work = _mm_max_epi16(_mm_or_si128(_mm_subs_epu16(q2, q1),
880 _mm_subs_epu16(q1, q2)),
881 _mm_or_si128(_mm_subs_epu16(q3, q2),
882 _mm_subs_epu16(q2, q3)));
883 mask = _mm_max_epi16(work, mask);
884 mask = _mm_subs_epu16(mask, limit);
885 mask = _mm_cmpeq_epi16(mask, zero);
886
887 // filter4
888 filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
889 filt = _mm_and_si128(filt, hev);
890 work_a = _mm_subs_epi16(qs0, ps0);
891 filt = _mm_adds_epi16(filt, work_a);
892 filt = _mm_adds_epi16(filt, work_a);
893 filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
894
895 // (vp9_filter + 3 * (qs0 - ps0)) & mask
896 filt = _mm_and_si128(filt, mask);
897
898 filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
899 filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
900
901 // Filter1 >> 3
902 work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
903 filter1 = _mm_srli_epi16(filter1, 3);
904 work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
905 filter1 = _mm_and_si128(filter1, t1f); // clamp the range
906 filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
907
908 // Filter2 >> 3
909 work_a = _mm_cmpgt_epi16(zero, filter2);
910 filter2 = _mm_srli_epi16(filter2, 3);
911 work_a = _mm_and_si128(work_a, tffe0);
912 filter2 = _mm_and_si128(filter2, t1f);
913 filter2 = _mm_or_si128(filter2, work_a);
914
915 // filt >> 1
916 filt = _mm_adds_epi16(filter1, t1);
917 work_a = _mm_cmpgt_epi16(zero, filt);
918 filt = _mm_srli_epi16(filt, 1);
919 work_a = _mm_and_si128(work_a, tff80);
920 filt = _mm_and_si128(filt, t7f);
921 filt = _mm_or_si128(filt, work_a);
922
923 filt = _mm_andnot_si128(hev, filt);
924
925 q0 = _mm_adds_epi16(
926 signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
927 q1 = _mm_adds_epi16(
928 signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
929 p0 = _mm_adds_epi16(
930 signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
931 p1 = _mm_adds_epi16(
932 signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);
933
934 _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
935 _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
936 _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
937 _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
938 }
939
// Runs vp9_highbd_lpf_horizontal_4_sse2 on two side-by-side segments of a
// horizontal edge: the segment starting at s with the first threshold set,
// and the segment starting eight samples further right with the second set.
// Both calls pass a count of 1 and the same pitch and bit depth.
void vp9_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
                                           const uint8_t *_blimit0,
                                           const uint8_t *_limit0,
                                           const uint8_t *_thresh0,
                                           const uint8_t *_blimit1,
                                           const uint8_t *_limit1,
                                           const uint8_t *_thresh1,
                                           int bd) {
  uint16_t *const second_segment = s + 8;

  // First segment: (_blimit0, _limit0, _thresh0).
  vp9_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);

  // Second segment: (_blimit1, _limit1, _thresh1).
  vp9_highbd_lpf_horizontal_4_sse2(second_segment, p, _blimit1, _limit1,
                                   _thresh1, 1, bd);
}
952
// Transposes one or more 8x8 blocks of 16-bit samples.
//
// For each index i in [0, num_8x8_to_transpose), the 8x8 block at src[i]
// (row pitch in_p) is written transposed to dst[i] (row pitch out_p). Pitches
// are in uint16_t units and all accesses are unaligned. The loop body always
// runs at least once, so src[0] and dst[0] must be valid even if the count is
// not positive. Every row of a block is loaded before any row is stored.
static INLINE void highbd_transpose(uint16_t *src[], int in_p,
                                    uint16_t *dst[], int out_p,
                                    int num_8x8_to_transpose) {
  int blk = 0;
  do {
    const uint16_t *const in = src[blk];
    uint16_t *const out = dst[blk];

    // Input rows; "rc" below names the sample at row r, column c.
    const __m128i r0 = _mm_loadu_si128((const __m128i *)(in + 0 * in_p));
    const __m128i r1 = _mm_loadu_si128((const __m128i *)(in + 1 * in_p));
    const __m128i r2 = _mm_loadu_si128((const __m128i *)(in + 2 * in_p));
    const __m128i r3 = _mm_loadu_si128((const __m128i *)(in + 3 * in_p));
    const __m128i r4 = _mm_loadu_si128((const __m128i *)(in + 4 * in_p));
    const __m128i r5 = _mm_loadu_si128((const __m128i *)(in + 5 * in_p));
    const __m128i r6 = _mm_loadu_si128((const __m128i *)(in + 6 * in_p));
    const __m128i r7 = _mm_loadu_si128((const __m128i *)(in + 7 * in_p));

    // Stage 1: interleave 16-bit samples of adjacent rows.
    // Columns 0..3:
    const __m128i lo01 = _mm_unpacklo_epi16(r0, r1);  // 00 10 01 11 02 12 03 13
    const __m128i lo23 = _mm_unpacklo_epi16(r2, r3);  // 20 30 21 31 22 32 23 33
    const __m128i lo45 = _mm_unpacklo_epi16(r4, r5);  // 40 50 41 51 42 52 43 53
    const __m128i lo67 = _mm_unpacklo_epi16(r6, r7);  // 60 70 61 71 62 72 63 73
    // Columns 4..7:
    const __m128i hi01 = _mm_unpackhi_epi16(r0, r1);  // 04 14 05 15 06 16 07 17
    const __m128i hi23 = _mm_unpackhi_epi16(r2, r3);  // 24 34 25 35 26 36 27 37
    const __m128i hi45 = _mm_unpackhi_epi16(r4, r5);  // 44 54 45 55 46 56 47 57
    const __m128i hi67 = _mm_unpackhi_epi16(r6, r7);  // 64 74 65 75 66 76 67 77

    // Stage 2: interleave 32-bit pairs; "cAB_t"/"cAB_b" hold columns A and B
    // for rows 0..3 and rows 4..7 respectively.
    const __m128i c01_t = _mm_unpacklo_epi32(lo01, lo23);  // 00 10 20 30 01 ..
    const __m128i c01_b = _mm_unpacklo_epi32(lo45, lo67);  // 40 50 60 70 41 ..
    const __m128i c23_t = _mm_unpackhi_epi32(lo01, lo23);  // 02 12 22 32 03 ..
    const __m128i c23_b = _mm_unpackhi_epi32(lo45, lo67);  // 42 52 62 72 43 ..
    const __m128i c45_t = _mm_unpacklo_epi32(hi01, hi23);  // 04 14 24 34 05 ..
    const __m128i c45_b = _mm_unpacklo_epi32(hi45, hi67);  // 44 54 64 74 45 ..
    const __m128i c67_t = _mm_unpackhi_epi32(hi01, hi23);  // 06 16 26 36 07 ..
    const __m128i c67_b = _mm_unpackhi_epi32(hi45, hi67);  // 46 56 66 76 47 ..

    // Stage 3: join 64-bit halves into whole columns and store output row k
    // = input column k, in increasing row order.
    _mm_storeu_si128((__m128i *)(out + 0 * out_p),
                     _mm_unpacklo_epi64(c01_t, c01_b));  // 00 10 .. 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p),
                     _mm_unpackhi_epi64(c01_t, c01_b));  // 01 11 .. 71
    _mm_storeu_si128((__m128i *)(out + 2 * out_p),
                     _mm_unpacklo_epi64(c23_t, c23_b));  // 02 12 .. 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p),
                     _mm_unpackhi_epi64(c23_t, c23_b));  // 03 13 .. 73
    _mm_storeu_si128((__m128i *)(out + 4 * out_p),
                     _mm_unpacklo_epi64(c45_t, c45_b));  // 04 14 .. 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p),
                     _mm_unpackhi_epi64(c45_t, c45_b));  // 05 15 .. 75
    _mm_storeu_si128((__m128i *)(out + 6 * out_p),
                     _mm_unpacklo_epi64(c67_t, c67_b));  // 06 16 .. 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p),
                     _mm_unpackhi_epi64(c67_t, c67_b));  // 07 17 .. 77
  } while (++blk < num_8x8_to_transpose);
}
1043
highbd_transpose8x16(uint16_t * in0,uint16_t * in1,int in_p,uint16_t * out,int out_p)1044 static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1,
1045 int in_p, uint16_t *out, int out_p) {
1046 uint16_t *src0[1];
1047 uint16_t *src1[1];
1048 uint16_t *dest0[1];
1049 uint16_t *dest1[1];
1050 src0[0] = in0;
1051 src1[0] = in1;
1052 dest0[0] = out;
1053 dest1[0] = out + 8;
1054 highbd_transpose(src0, in_p, dest0, out_p, 1);
1055 highbd_transpose(src1, in_p, dest1, out_p, 1);
1056 }
1057
vp9_highbd_lpf_vertical_4_sse2(uint16_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh,int count,int bd)1058 void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
1059 const uint8_t *blimit,
1060 const uint8_t *limit,
1061 const uint8_t *thresh,
1062 int count, int bd) {
1063 DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
1064 uint16_t *src[1];
1065 uint16_t *dst[1];
1066 (void)count;
1067
1068 // Transpose 8x8
1069 src[0] = s - 4;
1070 dst[0] = t_dst;
1071
1072 highbd_transpose(src, p, dst, 8, 1);
1073
1074 // Loop filtering
1075 vp9_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
1076 bd);
1077
1078 src[0] = t_dst;
1079 dst[0] = s - 4;
1080
1081 // Transpose back
1082 highbd_transpose(src, 8, dst, p, 1);
1083 }
1084
vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t * s,int p,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1,int bd)1085 void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
1086 const uint8_t *blimit0,
1087 const uint8_t *limit0,
1088 const uint8_t *thresh0,
1089 const uint8_t *blimit1,
1090 const uint8_t *limit1,
1091 const uint8_t *thresh1,
1092 int bd) {
1093 DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
1094 uint16_t *src[2];
1095 uint16_t *dst[2];
1096
1097 // Transpose 8x16
1098 highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1099
1100 // Loop filtering
1101 vp9_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
1102 thresh0, blimit1, limit1, thresh1, bd);
1103 src[0] = t_dst;
1104 src[1] = t_dst + 8;
1105 dst[0] = s - 4;
1106 dst[1] = s - 4 + p * 8;
1107
1108 // Transpose back
1109 highbd_transpose(src, 16, dst, p, 2);
1110 }
1111
vp9_highbd_lpf_vertical_8_sse2(uint16_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh,int count,int bd)1112 void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
1113 const uint8_t *blimit,
1114 const uint8_t *limit,
1115 const uint8_t *thresh,
1116 int count, int bd) {
1117 DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
1118 uint16_t *src[1];
1119 uint16_t *dst[1];
1120 (void)count;
1121
1122 // Transpose 8x8
1123 src[0] = s - 4;
1124 dst[0] = t_dst;
1125
1126 highbd_transpose(src, p, dst, 8, 1);
1127
1128 // Loop filtering
1129 vp9_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
1130 bd);
1131
1132 src[0] = t_dst;
1133 dst[0] = s - 4;
1134
1135 // Transpose back
1136 highbd_transpose(src, 8, dst, p, 1);
1137 }
1138
vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t * s,int p,const uint8_t * blimit0,const uint8_t * limit0,const uint8_t * thresh0,const uint8_t * blimit1,const uint8_t * limit1,const uint8_t * thresh1,int bd)1139 void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
1140 const uint8_t *blimit0,
1141 const uint8_t *limit0,
1142 const uint8_t *thresh0,
1143 const uint8_t *blimit1,
1144 const uint8_t *limit1,
1145 const uint8_t *thresh1,
1146 int bd) {
1147 DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
1148 uint16_t *src[2];
1149 uint16_t *dst[2];
1150
1151 // Transpose 8x16
1152 highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1153
1154 // Loop filtering
1155 vp9_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
1156 thresh0, blimit1, limit1, thresh1, bd);
1157 src[0] = t_dst;
1158 src[1] = t_dst + 8;
1159
1160 dst[0] = s - 4;
1161 dst[1] = s - 4 + p * 8;
1162
1163 // Transpose back
1164 highbd_transpose(src, 16, dst, p, 2);
1165 }
1166
vp9_highbd_lpf_vertical_16_sse2(uint16_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh,int bd)1167 void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
1168 const uint8_t *blimit,
1169 const uint8_t *limit,
1170 const uint8_t *thresh,
1171 int bd) {
1172 DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
1173 uint16_t *src[2];
1174 uint16_t *dst[2];
1175
1176 src[0] = s - 8;
1177 src[1] = s;
1178 dst[0] = t_dst;
1179 dst[1] = t_dst + 8 * 8;
1180
1181 // Transpose 16x8
1182 highbd_transpose(src, p, dst, 8, 2);
1183
1184 // Loop filtering
1185 highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
1186 thresh, bd);
1187 src[0] = t_dst;
1188 src[1] = t_dst + 8 * 8;
1189 dst[0] = s - 8;
1190 dst[1] = s;
1191
1192 // Transpose back
1193 highbd_transpose(src, 8, dst, p, 2);
1194 }
1195
vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t * s,int p,const uint8_t * blimit,const uint8_t * limit,const uint8_t * thresh,int bd)1196 void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
1197 int p,
1198 const uint8_t *blimit,
1199 const uint8_t *limit,
1200 const uint8_t *thresh,
1201 int bd) {
1202 DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
1203
1204 // Transpose 16x16
1205 highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1206 highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1207
1208 // Loop filtering
1209 highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
1210 thresh, bd);
1211
1212 // Transpose back
1213 highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1214 highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1215 }
1216