1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <stdlib.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/vpx_dsp_common.h"
16 #include "vpx_ports/mem.h"
17 
signed_char_clamp(int t)18 static INLINE int8_t signed_char_clamp(int t) {
19   return (int8_t)clamp(t, -128, 127);
20 }
21 
22 #if CONFIG_VP9_HIGHBITDEPTH
signed_char_clamp_high(int t,int bd)23 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
24   switch (bd) {
25     case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
26     case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
27     case 8:
28     default: return (int16_t)clamp(t, -128, 128 - 1);
29   }
30 }
31 #endif
32 
33 // Should we apply any filter at all: 11111111 yes, 00000000 no
filter_mask(uint8_t limit,uint8_t blimit,uint8_t p3,uint8_t p2,uint8_t p1,uint8_t p0,uint8_t q0,uint8_t q1,uint8_t q2,uint8_t q3)34 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
35                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
36                                  uint8_t q1, uint8_t q2, uint8_t q3) {
37   int8_t mask = 0;
38   mask |= (abs(p3 - p2) > limit) * -1;
39   mask |= (abs(p2 - p1) > limit) * -1;
40   mask |= (abs(p1 - p0) > limit) * -1;
41   mask |= (abs(q1 - q0) > limit) * -1;
42   mask |= (abs(q2 - q1) > limit) * -1;
43   mask |= (abs(q3 - q2) > limit) * -1;
44   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
45   return ~mask;
46 }
47 
flat_mask4(uint8_t thresh,uint8_t p3,uint8_t p2,uint8_t p1,uint8_t p0,uint8_t q0,uint8_t q1,uint8_t q2,uint8_t q3)48 static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
49                                 uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
50                                 uint8_t q2, uint8_t q3) {
51   int8_t mask = 0;
52   mask |= (abs(p1 - p0) > thresh) * -1;
53   mask |= (abs(q1 - q0) > thresh) * -1;
54   mask |= (abs(p2 - p0) > thresh) * -1;
55   mask |= (abs(q2 - q0) > thresh) * -1;
56   mask |= (abs(p3 - p0) > thresh) * -1;
57   mask |= (abs(q3 - q0) > thresh) * -1;
58   return ~mask;
59 }
60 
flat_mask5(uint8_t thresh,uint8_t p4,uint8_t p3,uint8_t p2,uint8_t p1,uint8_t p0,uint8_t q0,uint8_t q1,uint8_t q2,uint8_t q3,uint8_t q4)61 static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
62                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
63                                 uint8_t q1, uint8_t q2, uint8_t q3,
64                                 uint8_t q4) {
65   int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
66   mask |= (abs(p4 - p0) > thresh) * -1;
67   mask |= (abs(q4 - q0) > thresh) * -1;
68   return ~mask;
69 }
70 
71 // Is there high edge variance internal edge: 11111111 yes, 00000000 no
hev_mask(uint8_t thresh,uint8_t p1,uint8_t p0,uint8_t q0,uint8_t q1)72 static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
73                               uint8_t q0, uint8_t q1) {
74   int8_t hev = 0;
75   hev |= (abs(p1 - p0) > thresh) * -1;
76   hev |= (abs(q1 - q0) > thresh) * -1;
77   return hev;
78 }
79 
filter4(int8_t mask,uint8_t thresh,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1)80 static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
81                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
82   int8_t filter1, filter2;
83 
84   const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
85   const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
86   const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
87   const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
88   const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
89 
90   // add outer taps if we have high edge variance
91   int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
92 
93   // inner taps
94   filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
95 
96   // save bottom 3 bits so that we round one side +4 and the other +3
97   // if it equals 4 we'll set it to adjust by -1 to account for the fact
98   // we'd round it by 3 the other way
99   filter1 = signed_char_clamp(filter + 4) >> 3;
100   filter2 = signed_char_clamp(filter + 3) >> 3;
101 
102   *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
103   *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
104 
105   // outer tap adjustments
106   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
107 
108   *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
109   *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
110 }
111 
// Applies filter4 at 8 consecutive positions starting at s. At each position
// the taps p3..q3 are read at offsets -4 * pitch .. 3 * pitch, and the four
// pixels at -2 * pitch .. pitch may be modified.
void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    const int8_t mask =
        filter_mask(*limit, *blimit, s[-4 * pitch], s[-3 * pitch],
                    s[-2 * pitch], s[-pitch], s[0], s[pitch], s[2 * pitch],
                    s[3 * pitch]);

    filter4(mask, *thresh, s - 2 * pitch, s - pitch, s, s + pitch);
  }
}
129 
// Runs vpx_lpf_horizontal_4_c twice: at s with (blimit0, limit0, thresh0) and
// at s + 8 with (blimit1, limit1, thresh1).
void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second = s + 8;

  vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_c(second, pitch, blimit1, limit1, thresh1);
}
137 
// Applies filter4 at 8 positions spaced pitch apart, starting at s. At each
// position the taps p3..q3 are read at offsets -4 .. 3, and the four pixels
// at -2 .. 1 may be modified.
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int rows_left = 8;

  while (rows_left-- > 0) {
    const int8_t mask = filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
                                    s[-1], s[0], s[1], s[2], s[3]);

    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
    s += pitch;
  }
}
153 
// Runs vpx_lpf_vertical_4_c twice: at s with (blimit0, limit0, thresh0) and
// at s + 8 * pitch with (blimit1, limit1, thresh1).
void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second = s + 8 * pitch;

  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_c(second, pitch, blimit1, limit1, thresh1);
}
161 
filter8(int8_t mask,uint8_t thresh,uint8_t flat,uint8_t * op3,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2,uint8_t * oq3)162 static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
163                            uint8_t *op3, uint8_t *op2, uint8_t *op1,
164                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
165                            uint8_t *oq2, uint8_t *oq3) {
166   if (flat && mask) {
167     const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
168     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
169 
170     // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
171     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
172     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
173     *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
174     *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
175     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
176     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
177   } else {
178     filter4(mask, thresh, op1, op0, oq0, oq1);
179   }
180 }
181 
// Applies filter8 at 8 consecutive positions starting at s. At each position
// the taps p3..q3 are read at offsets -4 * pitch .. 3 * pitch; the flatness
// test uses a fixed threshold of 1.
void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    const uint8_t p3 = s[-4 * pitch];
    const uint8_t p2 = s[-3 * pitch];
    const uint8_t p1 = s[-2 * pitch];
    const uint8_t p0 = s[-pitch];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[pitch];
    const uint8_t q2 = s[2 * pitch];
    const uint8_t q3 = s[3 * pitch];
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
            s - pitch, s, s + pitch, s + 2 * pitch, s + 3 * pitch);
  }
}
202 
// Runs vpx_lpf_horizontal_8_c twice: at s with (blimit0, limit0, thresh0) and
// at s + 8 with (blimit1, limit1, thresh1).
void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second = s + 8;

  vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_c(second, pitch, blimit1, limit1, thresh1);
}
210 
// Applies filter8 at 8 positions spaced pitch apart, starting at s. At each
// position the taps p3..q3 are read at offsets -4 .. 3; the flatness test
// uses a fixed threshold of 1.
void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int rows_left = 8;

  while (rows_left-- > 0) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
            s + 3);
    s += pitch;
  }
}
226 
// Runs vpx_lpf_vertical_8_c twice: at s with (blimit0, limit0, thresh0) and
// at s + 8 * pitch with (blimit1, limit1, thresh1).
void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const second = s + 8 * pitch;

  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_c(second, pitch, blimit1, limit1, thresh1);
}
234 
filter16(int8_t mask,uint8_t thresh,uint8_t flat,uint8_t flat2,uint8_t * op7,uint8_t * op6,uint8_t * op5,uint8_t * op4,uint8_t * op3,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2,uint8_t * oq3,uint8_t * oq4,uint8_t * oq5,uint8_t * oq6,uint8_t * oq7)235 static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
236                             uint8_t flat2, uint8_t *op7, uint8_t *op6,
237                             uint8_t *op5, uint8_t *op4, uint8_t *op3,
238                             uint8_t *op2, uint8_t *op1, uint8_t *op0,
239                             uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
240                             uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
241                             uint8_t *oq6, uint8_t *oq7) {
242   if (flat2 && flat && mask) {
243     const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
244                   p2 = *op2, p1 = *op1, p0 = *op0;
245 
246     const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
247                   q5 = *oq5, q6 = *oq6, q7 = *oq7;
248 
249     // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
250     *op6 = ROUND_POWER_OF_TWO(
251         p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
252     *op5 = ROUND_POWER_OF_TWO(
253         p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
254     *op4 = ROUND_POWER_OF_TWO(
255         p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
256     *op3 = ROUND_POWER_OF_TWO(
257         p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
258     *op2 = ROUND_POWER_OF_TWO(
259         p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
260         4);
261     *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
262                                   q0 + q1 + q2 + q3 + q4 + q5,
263                               4);
264     *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
265                                   q1 + q2 + q3 + q4 + q5 + q6,
266                               4);
267     *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
268                                   q2 + q3 + q4 + q5 + q6 + q7,
269                               4);
270     *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
271                                   q3 + q4 + q5 + q6 + q7 * 2,
272                               4);
273     *oq2 = ROUND_POWER_OF_TWO(
274         p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
275         4);
276     *oq3 = ROUND_POWER_OF_TWO(
277         p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
278     *oq4 = ROUND_POWER_OF_TWO(
279         p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
280     *oq5 = ROUND_POWER_OF_TWO(
281         p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
282     *oq6 = ROUND_POWER_OF_TWO(
283         p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
284   } else {
285     filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
286   }
287 }
288 
// Applies filter16 at 8 * count consecutive positions starting at s. At each
// position sixteen taps are addressed at offsets -8 * pitch .. 7 * pitch.
// Both flatness tests use a fixed threshold of 1; the wide test (flat2)
// compares the four outermost taps on each side against p0 / q0.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  int col;

  for (col = 0; col < 8 * count; ++col, ++s) {
    const uint8_t p3 = s[-4 * pitch];
    const uint8_t p2 = s[-3 * pitch];
    const uint8_t p1 = s[-2 * pitch];
    const uint8_t p0 = s[-pitch];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[pitch];
    const uint8_t q2 = s[2 * pitch];
    const uint8_t q3 = s[3 * pitch];
    const int8_t flat2 = flat_mask5(
        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

    filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
             s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
             s - 2 * pitch, s - pitch, s, s + pitch, s + 2 * pitch,
             s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,
             s + 7 * pitch);
  }
}
317 
// Wide filter over a single 8-position segment: forwards to
// mb_lpf_horizontal_edge_w with count = 1.
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const int segments = 1;

  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, segments);
}
322 
// Wide filter over two 8-position segments (16 positions) sharing one set of
// thresholds: forwards to mb_lpf_horizontal_edge_w with count = 2.
void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
  const int segments = 2;

  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, segments);
}
327 
// Applies filter16 at count positions spaced pitch apart, starting at s. At
// each position sixteen taps are addressed at offsets -8 .. 7. Both flatness
// tests use a fixed threshold of 1; the wide test (flat2) compares the four
// outermost taps on each side against p0 / q0.
static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int count) {
  int row;

  for (row = 0; row < count; ++row, s += pitch) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4],
                                    s[5], s[6], s[7]);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);

    filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
             s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
             s + 7);
  }
}
348 
// Wide filter over 8 positions: forwards to mb_lpf_vertical_edge_w with
// count = 8.
void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  const int rows = 8;

  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, rows);
}
353 
// Wide filter over 16 positions sharing one set of thresholds: forwards to
// mb_lpf_vertical_edge_w with count = 16.
void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  const int rows = 16;

  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, rows);
}
358 
359 #if CONFIG_VP9_HIGHBITDEPTH
360 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
highbd_filter_mask(uint8_t limit,uint8_t blimit,uint16_t p3,uint16_t p2,uint16_t p1,uint16_t p0,uint16_t q0,uint16_t q1,uint16_t q2,uint16_t q3,int bd)361 static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
362                                         uint16_t p3, uint16_t p2, uint16_t p1,
363                                         uint16_t p0, uint16_t q0, uint16_t q1,
364                                         uint16_t q2, uint16_t q3, int bd) {
365   int8_t mask = 0;
366   int16_t limit16 = (uint16_t)limit << (bd - 8);
367   int16_t blimit16 = (uint16_t)blimit << (bd - 8);
368   mask |= (abs(p3 - p2) > limit16) * -1;
369   mask |= (abs(p2 - p1) > limit16) * -1;
370   mask |= (abs(p1 - p0) > limit16) * -1;
371   mask |= (abs(q1 - q0) > limit16) * -1;
372   mask |= (abs(q2 - q1) > limit16) * -1;
373   mask |= (abs(q3 - q2) > limit16) * -1;
374   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
375   return ~mask;
376 }
377 
highbd_flat_mask4(uint8_t thresh,uint16_t p3,uint16_t p2,uint16_t p1,uint16_t p0,uint16_t q0,uint16_t q1,uint16_t q2,uint16_t q3,int bd)378 static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
379                                        uint16_t p1, uint16_t p0, uint16_t q0,
380                                        uint16_t q1, uint16_t q2, uint16_t q3,
381                                        int bd) {
382   int8_t mask = 0;
383   int16_t thresh16 = (uint16_t)thresh << (bd - 8);
384   mask |= (abs(p1 - p0) > thresh16) * -1;
385   mask |= (abs(q1 - q0) > thresh16) * -1;
386   mask |= (abs(p2 - p0) > thresh16) * -1;
387   mask |= (abs(q2 - q0) > thresh16) * -1;
388   mask |= (abs(p3 - p0) > thresh16) * -1;
389   mask |= (abs(q3 - q0) > thresh16) * -1;
390   return ~mask;
391 }
392 
highbd_flat_mask5(uint8_t thresh,uint16_t p4,uint16_t p3,uint16_t p2,uint16_t p1,uint16_t p0,uint16_t q0,uint16_t q1,uint16_t q2,uint16_t q3,uint16_t q4,int bd)393 static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
394                                        uint16_t p2, uint16_t p1, uint16_t p0,
395                                        uint16_t q0, uint16_t q1, uint16_t q2,
396                                        uint16_t q3, uint16_t q4, int bd) {
397   int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
398   int16_t thresh16 = (uint16_t)thresh << (bd - 8);
399   mask |= (abs(p4 - p0) > thresh16) * -1;
400   mask |= (abs(q4 - q0) > thresh16) * -1;
401   return ~mask;
402 }
403 
404 // Is there high edge variance internal edge:
405 // 11111111_11111111 yes, 00000000_00000000 no ?
highbd_hev_mask(uint8_t thresh,uint16_t p1,uint16_t p0,uint16_t q0,uint16_t q1,int bd)406 static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
407                                       uint16_t q0, uint16_t q1, int bd) {
408   int16_t hev = 0;
409   int16_t thresh16 = (uint16_t)thresh << (bd - 8);
410   hev |= (abs(p1 - p0) > thresh16) * -1;
411   hev |= (abs(q1 - q0) > thresh16) * -1;
412   return hev;
413 }
414 
highbd_filter4(int8_t mask,uint8_t thresh,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,int bd)415 static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
416                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
417                                   int bd) {
418   int16_t filter1, filter2;
419   // ^0x80 equivalent to subtracting 0x80 from the values to turn them
420   // into -128 to +127 instead of 0 to 255.
421   int shift = bd - 8;
422   const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
423   const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
424   const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
425   const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
426   const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
427 
428   // Add outer taps if we have high edge variance.
429   int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
430 
431   // Inner taps.
432   filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
433 
434   // Save bottom 3 bits so that we round one side +4 and the other +3
435   // if it equals 4 we'll set it to adjust by -1 to account for the fact
436   // we'd round it by 3 the other way.
437   filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
438   filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
439 
440   *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
441   *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
442 
443   // Outer tap adjustments.
444   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
445 
446   *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
447   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
448 }
449 
// Applies highbd_filter4 at 8 consecutive positions starting at s. At each
// position the taps p3..q3 are read at offsets -4 * pitch .. 3 * pitch, and
// the four pixels at -2 * pitch .. pitch may be modified.
void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    const int8_t mask = highbd_filter_mask(
        *limit, *blimit, s[-4 * pitch], s[-3 * pitch], s[-2 * pitch],
        s[-pitch], s[0], s[pitch], s[2 * pitch], s[3 * pitch], bd);

    highbd_filter4(mask, *thresh, s - 2 * pitch, s - pitch, s, s + pitch, bd);
  }
}
473 
// Runs vpx_highbd_lpf_horizontal_4_c twice: at s with (blimit0, limit0,
// thresh0) and at s + 8 with (blimit1, limit1, thresh1), both at depth bd.
void vpx_highbd_lpf_horizontal_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 8;

  vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_c(second, pitch, blimit1, limit1, thresh1, bd);
}
481 
// Applies highbd_filter4 at 8 positions spaced pitch apart, starting at s.
// At each position the taps p3..q3 are read at offsets -4 .. 3, and the four
// pixels at -2 .. 1 may be modified.
void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int rows_left = 8;

  while (rows_left-- > 0) {
    const int8_t mask = highbd_filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
                                           s[-1], s[0], s[1], s[2], s[3], bd);

    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
    s += pitch;
  }
}
498 
// Runs vpx_highbd_lpf_vertical_4_c twice: at s with (blimit0, limit0,
// thresh0) and at s + 8 * pitch with (blimit1, limit1, thresh1), both at
// depth bd.
void vpx_highbd_lpf_vertical_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 8 * pitch;

  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_c(second, pitch, blimit1, limit1, thresh1, bd);
}
507 
highbd_filter8(int8_t mask,uint8_t thresh,uint8_t flat,uint16_t * op3,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,uint16_t * oq3,int bd)508 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
509                                   uint16_t *op3, uint16_t *op2, uint16_t *op1,
510                                   uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
511                                   uint16_t *oq2, uint16_t *oq3, int bd) {
512   if (flat && mask) {
513     const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
514     const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
515 
516     // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
517     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
518     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
519     *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
520     *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
521     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
522     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
523   } else {
524     highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
525   }
526 }
527 
// Applies highbd_filter8 at 8 consecutive positions starting at s. At each
// position the taps p3..q3 are read at offsets -4 * pitch .. 3 * pitch; the
// flatness test is called with a threshold of 1 (scaled by bit depth inside
// highbd_flat_mask4).
void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    const uint16_t p3 = s[-4 * pitch];
    const uint16_t p2 = s[-3 * pitch];
    const uint16_t p1 = s[-2 * pitch];
    const uint16_t p0 = s[-pitch];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[pitch];
    const uint16_t q2 = s[2 * pitch];
    const uint16_t q3 = s[3 * pitch];
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,
                   s - 2 * pitch, s - pitch, s, s + pitch, s + 2 * pitch,
                   s + 3 * pitch, bd);
  }
}
551 
// Runs vpx_highbd_lpf_horizontal_8_c twice: at s with (blimit0, limit0,
// thresh0) and at s + 8 with (blimit1, limit1, thresh1), both at depth bd.
void vpx_highbd_lpf_horizontal_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 8;

  vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_c(second, pitch, blimit1, limit1, thresh1, bd);
}
559 
// Applies highbd_filter8 at 8 positions spaced pitch apart, starting at s.
// At each position the taps p3..q3 are read at offsets -4 .. 3; the flatness
// test is called with a threshold of 1 (scaled by bit depth inside
// highbd_flat_mask4).
void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int rows_left = 8;

  while (rows_left-- > 0) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
                   s + 2, s + 3, bd);
    s += pitch;
  }
}
577 
// Runs vpx_highbd_lpf_vertical_8_c twice: at s with (blimit0, limit0,
// thresh0) and at s + 8 * pitch with (blimit1, limit1, thresh1), both at
// depth bd.
void vpx_highbd_lpf_vertical_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second = s + 8 * pitch;

  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_c(second, pitch, blimit1, limit1, thresh1, bd);
}
586 
highbd_filter16(int8_t mask,uint8_t thresh,uint8_t flat,uint8_t flat2,uint16_t * op7,uint16_t * op6,uint16_t * op5,uint16_t * op4,uint16_t * op3,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,uint16_t * oq3,uint16_t * oq4,uint16_t * oq5,uint16_t * oq6,uint16_t * oq7,int bd)587 static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
588                                    uint8_t flat2, uint16_t *op7, uint16_t *op6,
589                                    uint16_t *op5, uint16_t *op4, uint16_t *op3,
590                                    uint16_t *op2, uint16_t *op1, uint16_t *op0,
591                                    uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
592                                    uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
593                                    uint16_t *oq6, uint16_t *oq7, int bd) {
594   if (flat2 && flat && mask) {
595     const uint16_t p7 = *op7;
596     const uint16_t p6 = *op6;
597     const uint16_t p5 = *op5;
598     const uint16_t p4 = *op4;
599     const uint16_t p3 = *op3;
600     const uint16_t p2 = *op2;
601     const uint16_t p1 = *op1;
602     const uint16_t p0 = *op0;
603     const uint16_t q0 = *oq0;
604     const uint16_t q1 = *oq1;
605     const uint16_t q2 = *oq2;
606     const uint16_t q3 = *oq3;
607     const uint16_t q4 = *oq4;
608     const uint16_t q5 = *oq5;
609     const uint16_t q6 = *oq6;
610     const uint16_t q7 = *oq7;
611 
612     // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
613     *op6 = ROUND_POWER_OF_TWO(
614         p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
615     *op5 = ROUND_POWER_OF_TWO(
616         p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
617     *op4 = ROUND_POWER_OF_TWO(
618         p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
619     *op3 = ROUND_POWER_OF_TWO(
620         p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
621     *op2 = ROUND_POWER_OF_TWO(
622         p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
623         4);
624     *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
625                                   q0 + q1 + q2 + q3 + q4 + q5,
626                               4);
627     *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
628                                   q1 + q2 + q3 + q4 + q5 + q6,
629                               4);
630     *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
631                                   q2 + q3 + q4 + q5 + q6 + q7,
632                               4);
633     *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
634                                   q3 + q4 + q5 + q6 + q7 * 2,
635                               4);
636     *oq2 = ROUND_POWER_OF_TWO(
637         p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
638         4);
639     *oq3 = ROUND_POWER_OF_TWO(
640         p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
641     *oq4 = ROUND_POWER_OF_TWO(
642         p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
643     *oq5 = ROUND_POWER_OF_TWO(
644         p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
645     *oq6 = ROUND_POWER_OF_TWO(
646         p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
647   } else {
648     highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
649                    bd);
650   }
651 }
652 
// Applies highbd_filter16() to 8 * count consecutive columns, starting at s
// and moving one sample to the right per column. For each column the 16
// samples at row offsets -8 * pitch .. +7 * pitch relative to s are the
// filter taps (rows before s form the "p" side, rows from s onwards the "q"
// side).
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int count,
                                            int bd) {
  const int num_cols = 8 * count;
  int col;

  for (col = 0; col < num_cols; ++col, ++s) {
    // Addresses of the 16 taps for this column.
    uint16_t *const op7 = s - 8 * pitch;
    uint16_t *const op6 = s - 7 * pitch;
    uint16_t *const op5 = s - 6 * pitch;
    uint16_t *const op4 = s - 5 * pitch;
    uint16_t *const op3 = s - 4 * pitch;
    uint16_t *const op2 = s - 3 * pitch;
    uint16_t *const op1 = s - 2 * pitch;
    uint16_t *const op0 = s - 1 * pitch;
    uint16_t *const oq0 = s;
    uint16_t *const oq1 = s + 1 * pitch;
    uint16_t *const oq2 = s + 2 * pitch;
    uint16_t *const oq3 = s + 3 * pitch;
    uint16_t *const oq4 = s + 4 * pitch;
    uint16_t *const oq5 = s + 5 * pitch;
    uint16_t *const oq6 = s + 6 * pitch;
    uint16_t *const oq7 = s + 7 * pitch;

    // mask and flat look at the inner eight samples only.
    const int8_t mask = highbd_filter_mask(*limit, *blimit, *op3, *op2, *op1,
                                           *op0, *oq0, *oq1, *oq2, *oq3, bd);
    const int8_t flat = highbd_flat_mask4(1, *op3, *op2, *op1, *op0, *oq0,
                                          *oq1, *oq2, *oq3, bd);
    // flat2 looks at the outer four samples on each side, plus p0 and q0.
    const int8_t flat2 = highbd_flat_mask5(1, *op7, *op6, *op5, *op4, *op0,
                                           *oq0, *oq4, *oq5, *oq6, *oq7, bd);

    highbd_filter16(mask, *thresh, flat, flat2, op7, op6, op5, op4, op3, op2,
                    op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6, oq7, bd);
  }
}
687 
// Runs highbd_mb_lpf_horizontal_edge_w() with a count of 1, i.e. over
// 8 * 1 = 8 columns starting at s.
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
  const int count = 1;

  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, count, bd);
}
693 
// Runs highbd_mb_lpf_horizontal_edge_w() with a count of 2, i.e. over
// 8 * 2 = 16 columns starting at s, all sharing one set of thresholds.
void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh, int bd) {
  const int count = 2;

  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, count, bd);
}
700 
// Applies highbd_filter16() to count consecutive rows, starting at s and
// advancing by pitch per row. For each row the 16 samples s[-8]..s[7] are the
// filter taps (samples before s form the "p" side, samples from s onwards
// the "q" side).
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int count,
                                          int bd) {
  int row;

  for (row = 0; row < count; ++row, s += pitch) {
    // Addresses of the 16 taps for this row.
    uint16_t *const op7 = s - 8;
    uint16_t *const op6 = s - 7;
    uint16_t *const op5 = s - 6;
    uint16_t *const op4 = s - 5;
    uint16_t *const op3 = s - 4;
    uint16_t *const op2 = s - 3;
    uint16_t *const op1 = s - 2;
    uint16_t *const op0 = s - 1;
    uint16_t *const oq0 = s;
    uint16_t *const oq1 = s + 1;
    uint16_t *const oq2 = s + 2;
    uint16_t *const oq3 = s + 3;
    uint16_t *const oq4 = s + 4;
    uint16_t *const oq5 = s + 5;
    uint16_t *const oq6 = s + 6;
    uint16_t *const oq7 = s + 7;

    // mask and flat look at the inner eight samples only.
    const int8_t mask = highbd_filter_mask(*limit, *blimit, *op3, *op2, *op1,
                                           *op0, *oq0, *oq1, *oq2, *oq3, bd);
    const int8_t flat = highbd_flat_mask4(1, *op3, *op2, *op1, *op0, *oq0,
                                          *oq1, *oq2, *oq3, bd);
    // flat2 looks at the outer four samples on each side, plus p0 and q0.
    const int8_t flat2 = highbd_flat_mask5(1, *op7, *op6, *op5, *op4, *op0,
                                           *oq0, *oq4, *oq5, *oq6, *oq7, bd);

    highbd_filter16(mask, *thresh, flat, flat2, op7, op6, op5, op4, op3, op2,
                    op1, op0, oq0, oq1, oq2, oq3, oq4, oq5, oq6, oq7, bd);
  }
}
730 
// Runs highbd_mb_lpf_vertical_edge_w() over 8 rows starting at s.
void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  const int num_rows = 8;

  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, num_rows, bd);
}
736 
// Runs highbd_mb_lpf_vertical_edge_w() over 16 rows starting at s, all
// sharing one set of thresholds.
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
  const int num_rows = 16;

  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, num_rows, bd);
}
743 #endif  // CONFIG_VP9_HIGHBITDEPTH
744