1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vp8/encoder/denoising.h"
12 #include "vp8/common/reconinter.h"
13 #include "vpx/vpx_integer.h"
14 #include "vpx_mem/vpx_mem.h"
15 #include "vp8_rtcd.h"
16 
17 #include <emmintrin.h>
18 #include "vpx_ports/emmintrin_compat.h"
19 
/*
 * Returns the absolute value of the signed sum of the sixteen int8 lanes of
 * acc_diff.
 *
 * The callers in this file keep one signed, saturating accumulator of the
 * applied adjustments per lane, so the result is the magnitude of the net
 * adjustment over the whole block. The sum of sixteen int8 values lies in
 * [-2048, 2032], so abs() cannot overflow.
 */
static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  /* Sign-extend each byte to 16 bits: interleaving the register with itself
   * puts the byte in both halves of a word, and the arithmetic shift by 8
   * then leaves the byte's signed value. */
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  /* Multiply by 1 and add adjacent 16-bit lanes: four 32-bit partial sums. */
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  /* Fold the upper half onto the lower half, then the remaining pair. */
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));

  return (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba));
}
37 
vp8_denoiser_filter_sse2(unsigned char * mc_running_avg_y,int mc_avg_y_stride,unsigned char * running_avg_y,int avg_y_stride,unsigned char * sig,int sig_stride,unsigned int motion_magnitude,int increase_denoising)38 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
39                              int mc_avg_y_stride, unsigned char *running_avg_y,
40                              int avg_y_stride, unsigned char *sig,
41                              int sig_stride, unsigned int motion_magnitude,
42                              int increase_denoising) {
43   unsigned char *running_avg_y_start = running_avg_y;
44   unsigned char *sig_start = sig;
45   unsigned int sum_diff_thresh;
46   int r;
47   int shift_inc =
48       (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
49           ? 1
50           : 0;
51   __m128i acc_diff = _mm_setzero_si128();
52   const __m128i k_0 = _mm_setzero_si128();
53   const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
54   const __m128i k_8 = _mm_set1_epi8(8);
55   const __m128i k_16 = _mm_set1_epi8(16);
56   /* Modify each level's adjustment according to motion_magnitude. */
57   const __m128i l3 = _mm_set1_epi8(
58       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
59   /* Difference between level 3 and level 2 is 2. */
60   const __m128i l32 = _mm_set1_epi8(2);
61   /* Difference between level 2 and level 1 is 1. */
62   const __m128i l21 = _mm_set1_epi8(1);
63 
64   for (r = 0; r < 16; ++r) {
65     /* Calculate differences */
66     const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
67     const __m128i v_mc_running_avg_y =
68         _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
69     __m128i v_running_avg_y;
70     const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
71     const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
72     /* Obtain the sign. FF if diff is negative. */
73     const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
74     /* Clamp absolute difference to 16 to be used to get mask. Doing this
75      * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
76     const __m128i clamped_absdiff =
77         _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
78     /* Get masks for l2 l1 and l0 adjustments */
79     const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
80     const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
81     const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
82     /* Get adjustments for l2, l1, and l0 */
83     __m128i adj2 = _mm_and_si128(mask2, l32);
84     const __m128i adj1 = _mm_and_si128(mask1, l21);
85     const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
86     __m128i adj, padj, nadj;
87 
88     /* Combine the adjustments and get absolute adjustments. */
89     adj2 = _mm_add_epi8(adj2, adj1);
90     adj = _mm_sub_epi8(l3, adj2);
91     adj = _mm_andnot_si128(mask0, adj);
92     adj = _mm_or_si128(adj, adj0);
93 
94     /* Restore the sign and get positive and negative adjustments. */
95     padj = _mm_andnot_si128(diff_sign, adj);
96     nadj = _mm_and_si128(diff_sign, adj);
97 
98     /* Calculate filtered value. */
99     v_running_avg_y = _mm_adds_epu8(v_sig, padj);
100     v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
101     _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
102 
103     /* Adjustments <=7, and each element in acc_diff can fit in signed
104      * char.
105      */
106     acc_diff = _mm_adds_epi8(acc_diff, padj);
107     acc_diff = _mm_subs_epi8(acc_diff, nadj);
108 
109     /* Update pointers for next iteration. */
110     sig += sig_stride;
111     mc_running_avg_y += mc_avg_y_stride;
112     running_avg_y += avg_y_stride;
113   }
114 
115   {
116     /* Compute the sum of all pixel differences of this MB. */
117     unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
118     sum_diff_thresh = SUM_DIFF_THRESHOLD;
119     if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
120     if (abs_sum_diff > sum_diff_thresh) {
121       // Before returning to copy the block (i.e., apply no denoising),
122       // check if we can still apply some (weaker) temporal filtering to
123       // this block, that would otherwise not be denoised at all. Simplest
124       // is to apply an additional adjustment to running_avg_y to bring it
125       // closer to sig. The adjustment is capped by a maximum delta, and
126       // chosen such that in most cases the resulting sum_diff will be
127       // within the acceptable range given by sum_diff_thresh.
128 
129       // The delta is set by the excess of absolute pixel diff over the
130       // threshold.
131       int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
132       // Only apply the adjustment for max delta up to 3.
133       if (delta < 4) {
134         const __m128i k_delta = _mm_set1_epi8(delta);
135         sig -= sig_stride * 16;
136         mc_running_avg_y -= mc_avg_y_stride * 16;
137         running_avg_y -= avg_y_stride * 16;
138         for (r = 0; r < 16; ++r) {
139           __m128i v_running_avg_y =
140               _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
141           // Calculate differences.
142           const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
143           const __m128i v_mc_running_avg_y =
144               _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
145           const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
146           const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
147           // Obtain the sign. FF if diff is negative.
148           const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
149           // Clamp absolute difference to delta to get the adjustment.
150           const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
151           // Restore the sign and get positive and negative adjustments.
152           __m128i padj, nadj;
153           padj = _mm_andnot_si128(diff_sign, adj);
154           nadj = _mm_and_si128(diff_sign, adj);
155           // Calculate filtered value.
156           v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
157           v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
158           _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
159 
160           // Accumulate the adjustments.
161           acc_diff = _mm_subs_epi8(acc_diff, padj);
162           acc_diff = _mm_adds_epi8(acc_diff, nadj);
163 
164           // Update pointers for next iteration.
165           sig += sig_stride;
166           mc_running_avg_y += mc_avg_y_stride;
167           running_avg_y += avg_y_stride;
168         }
169         abs_sum_diff = abs_sum_diff_16x1(acc_diff);
170         if (abs_sum_diff > sum_diff_thresh) {
171           return COPY_BLOCK;
172         }
173       } else {
174         return COPY_BLOCK;
175       }
176     }
177   }
178 
179   vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
180   return FILTER_BLOCK;
181 }
182 
vp8_denoiser_filter_uv_sse2(unsigned char * mc_running_avg,int mc_avg_stride,unsigned char * running_avg,int avg_stride,unsigned char * sig,int sig_stride,unsigned int motion_magnitude,int increase_denoising)183 int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
184                                 int mc_avg_stride, unsigned char *running_avg,
185                                 int avg_stride, unsigned char *sig,
186                                 int sig_stride, unsigned int motion_magnitude,
187                                 int increase_denoising) {
188   unsigned char *running_avg_start = running_avg;
189   unsigned char *sig_start = sig;
190   unsigned int sum_diff_thresh;
191   int r;
192   int shift_inc =
193       (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV)
194           ? 1
195           : 0;
196   __m128i acc_diff = _mm_setzero_si128();
197   const __m128i k_0 = _mm_setzero_si128();
198   const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
199   const __m128i k_8 = _mm_set1_epi8(8);
200   const __m128i k_16 = _mm_set1_epi8(16);
201   /* Modify each level's adjustment according to motion_magnitude. */
202   const __m128i l3 = _mm_set1_epi8(
203       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 7 + shift_inc : 6);
204   /* Difference between level 3 and level 2 is 2. */
205   const __m128i l32 = _mm_set1_epi8(2);
206   /* Difference between level 2 and level 1 is 1. */
207   const __m128i l21 = _mm_set1_epi8(1);
208 
209   {
210     const __m128i k_1 = _mm_set1_epi16(1);
211     __m128i vec_sum_block = _mm_setzero_si128();
212 
213     // Avoid denoising color signal if its close to average level.
214     for (r = 0; r < 8; ++r) {
215       const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
216       const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
217       vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
218       sig += sig_stride;
219     }
220     sig -= sig_stride * 8;
221     {
222       const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
223       const __m128i hgfe_dcba =
224           _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
225       const __m128i hgfedcba =
226           _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
227       const int sum_block = _mm_cvtsi128_si32(hgfedcba);
228       if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
229         return COPY_BLOCK;
230       }
231     }
232   }
233 
234   for (r = 0; r < 4; ++r) {
235     /* Calculate differences */
236     const __m128i v_sig_low =
237         _mm_castpd_si128(_mm_load_sd((double *)(&sig[0])));
238     const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd(
239         _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride])));
240     const __m128i v_mc_running_avg_low =
241         _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0])));
242     const __m128i v_mc_running_avg = _mm_castpd_si128(
243         _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
244                      (double *)(&mc_running_avg[mc_avg_stride])));
245     const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
246     const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
247     /* Obtain the sign. FF if diff is negative. */
248     const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
249     /* Clamp absolute difference to 16 to be used to get mask. Doing this
250      * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
251     const __m128i clamped_absdiff =
252         _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
253     /* Get masks for l2 l1 and l0 adjustments */
254     const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
255     const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
256     const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
257     /* Get adjustments for l2, l1, and l0 */
258     __m128i adj2 = _mm_and_si128(mask2, l32);
259     const __m128i adj1 = _mm_and_si128(mask1, l21);
260     const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
261     __m128i adj, padj, nadj;
262     __m128i v_running_avg;
263 
264     /* Combine the adjustments and get absolute adjustments. */
265     adj2 = _mm_add_epi8(adj2, adj1);
266     adj = _mm_sub_epi8(l3, adj2);
267     adj = _mm_andnot_si128(mask0, adj);
268     adj = _mm_or_si128(adj, adj0);
269 
270     /* Restore the sign and get positive and negative adjustments. */
271     padj = _mm_andnot_si128(diff_sign, adj);
272     nadj = _mm_and_si128(diff_sign, adj);
273 
274     /* Calculate filtered value. */
275     v_running_avg = _mm_adds_epu8(v_sig, padj);
276     v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
277 
278     _mm_storel_pd((double *)&running_avg[0], _mm_castsi128_pd(v_running_avg));
279     _mm_storeh_pd((double *)&running_avg[avg_stride],
280                   _mm_castsi128_pd(v_running_avg));
281 
282     /* Adjustments <=7, and each element in acc_diff can fit in signed
283      * char.
284      */
285     acc_diff = _mm_adds_epi8(acc_diff, padj);
286     acc_diff = _mm_subs_epi8(acc_diff, nadj);
287 
288     /* Update pointers for next iteration. */
289     sig += sig_stride * 2;
290     mc_running_avg += mc_avg_stride * 2;
291     running_avg += avg_stride * 2;
292   }
293 
294   {
295     unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
296     sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
297     if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
298     if (abs_sum_diff > sum_diff_thresh) {
299       // Before returning to copy the block (i.e., apply no denoising),
300       // check if we can still apply some (weaker) temporal filtering to
301       // this block, that would otherwise not be denoised at all. Simplest
302       // is to apply an additional adjustment to running_avg_y to bring it
303       // closer to sig. The adjustment is capped by a maximum delta, and
304       // chosen such that in most cases the resulting sum_diff will be
305       // within the acceptable range given by sum_diff_thresh.
306 
307       // The delta is set by the excess of absolute pixel diff over the
308       // threshold.
309       int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
310       // Only apply the adjustment for max delta up to 3.
311       if (delta < 4) {
312         const __m128i k_delta = _mm_set1_epi8(delta);
313         sig -= sig_stride * 8;
314         mc_running_avg -= mc_avg_stride * 8;
315         running_avg -= avg_stride * 8;
316         for (r = 0; r < 4; ++r) {
317           // Calculate differences.
318           const __m128i v_sig_low =
319               _mm_castpd_si128(_mm_load_sd((double *)(&sig[0])));
320           const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd(
321               _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride])));
322           const __m128i v_mc_running_avg_low =
323               _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0])));
324           const __m128i v_mc_running_avg = _mm_castpd_si128(
325               _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
326                            (double *)(&mc_running_avg[mc_avg_stride])));
327           const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
328           const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
329           // Obtain the sign. FF if diff is negative.
330           const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
331           // Clamp absolute difference to delta to get the adjustment.
332           const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
333           // Restore the sign and get positive and negative adjustments.
334           __m128i padj, nadj;
335           const __m128i v_running_avg_low =
336               _mm_castpd_si128(_mm_load_sd((double *)(&running_avg[0])));
337           __m128i v_running_avg = _mm_castpd_si128(
338               _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
339                            (double *)(&running_avg[avg_stride])));
340           padj = _mm_andnot_si128(diff_sign, adj);
341           nadj = _mm_and_si128(diff_sign, adj);
342           // Calculate filtered value.
343           v_running_avg = _mm_subs_epu8(v_running_avg, padj);
344           v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
345 
346           _mm_storel_pd((double *)&running_avg[0],
347                         _mm_castsi128_pd(v_running_avg));
348           _mm_storeh_pd((double *)&running_avg[avg_stride],
349                         _mm_castsi128_pd(v_running_avg));
350 
351           // Accumulate the adjustments.
352           acc_diff = _mm_subs_epi8(acc_diff, padj);
353           acc_diff = _mm_adds_epi8(acc_diff, nadj);
354 
355           // Update pointers for next iteration.
356           sig += sig_stride * 2;
357           mc_running_avg += mc_avg_stride * 2;
358           running_avg += avg_stride * 2;
359         }
360         abs_sum_diff = abs_sum_diff_16x1(acc_diff);
361         if (abs_sum_diff > sum_diff_thresh) {
362           return COPY_BLOCK;
363         }
364       } else {
365         return COPY_BLOCK;
366       }
367     }
368   }
369 
370   vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
371   return FILTER_BLOCK;
372 }
373