1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #ifndef LIB_JXL_BASE_ROBUST_STATISTICS_H_
7 #define LIB_JXL_BASE_ROBUST_STATISTICS_H_
8 
9 // Robust statistics: Mode, Median, MedianAbsoluteDeviation.
10 
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <stdlib.h>
14 
15 #include <algorithm>
16 #include <cmath>
17 #include <hwy/base.h>
18 #include <limits>
19 #include <type_traits>
20 #include <utility>
21 #include <vector>
22 namespace jxl {
23 
24 template <typename T>
Geomean(const T * items,size_t count)25 T Geomean(const T* items, size_t count) {
26   double product = 1.0;
27   for (size_t i = 0; i < count; ++i) {
28     product *= items[i];
29   }
30   return static_cast<T>(std::pow(product, 1.0 / count));
31 }
32 
33 // Round up for integers
34 template <class T, typename std::enable_if<
35                        std::numeric_limits<T>::is_integer>::type* = nullptr>
Half(T x)36 inline T Half(T x) {
37   return (x + 1) / 2;
38 }
39 
40 // Mul is faster than div.
41 template <class T, typename std::enable_if<
42                        !std::numeric_limits<T>::is_integer>::type* = nullptr>
Half(T x)43 inline T Half(T x) {
44   return x * T(0.5);
45 }
46 
47 // Returns the median value. Side effect: values <= median will appear before,
48 // values >= median after the middle index.
49 // Guarantees average speed O(num_values).
50 template <typename T>
Median(T * samples,const size_t num_samples)51 T Median(T* samples, const size_t num_samples) {
52   HWY_ASSERT(num_samples != 0);
53   std::nth_element(samples, samples + num_samples / 2, samples + num_samples);
54   T result = samples[num_samples / 2];
55   // If even size, find largest element in the partially sorted vector to
56   // use as second element to average with
57   if ((num_samples & 1) == 0) {
58     T biggest = *std::max_element(samples, samples + num_samples / 2);
59     result = Half(result + biggest);
60   }
61   return result;
62 }
63 
64 template <typename T>
Median(std::vector<T> * samples)65 T Median(std::vector<T>* samples) {
66   return Median(samples->data(), samples->size());
67 }
68 
69 template <typename T>
Median3(const T a,const T b,const T c)70 static inline T Median3(const T a, const T b, const T c) {
71   return std::max(std::min(a, b), std::min(c, std::max(a, b)));
72 }
73 
74 template <typename T>
Median5(const T a,const T b,const T c,const T d,const T e)75 static inline T Median5(const T a, const T b, const T c, const T d, const T e) {
76   return Median3(e, std::max(std::min(a, b), std::min(c, d)),
77                  std::min(std::max(a, b), std::max(c, d)));
78 }
79 
80 // Returns a robust measure of variability.
81 template <typename T>
MedianAbsoluteDeviation(const T * samples,const size_t num_samples,const T median)82 T MedianAbsoluteDeviation(const T* samples, const size_t num_samples,
83                           const T median) {
84   HWY_ASSERT(num_samples != 0);
85   std::vector<T> abs_deviations;
86   abs_deviations.reserve(num_samples);
87   for (size_t i = 0; i < num_samples; ++i) {
88     abs_deviations.push_back(std::abs(samples[i] - median));
89   }
90   return Median(&abs_deviations);
91 }
92 
93 template <typename T>
MedianAbsoluteDeviation(const std::vector<T> & samples,const T median)94 T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) {
95   return MedianAbsoluteDeviation(samples.data(), samples.size(), median);
96 }
97 
98 // Half{Range/Sample}Mode are implementations of "Robust estimators of the mode
99 // and skewness of continuous data". The mode is less affected by outliers in
100 // highly-skewed distributions than the median.
101 
102 // Robust estimator of the mode for data given as sorted values.
103 // O(N*logN), N=num_values.
104 class HalfSampleMode {
105  public:
106   // Returns mode. "sorted" must be in ascending order.
107   template <typename T>
operator()108   T operator()(const T* const HWY_RESTRICT sorted,
109                const size_t num_values) const {
110     int64_t center = num_values / 2;
111     int64_t width = num_values;
112 
113     // Zoom in on modal intervals of decreasing width. Stop before we reach
114     // width=1, i.e. single values, for which there is no "slope".
115     while (width > 2) {
116       // Round up so we can still reach the outer edges of odd widths.
117       width = Half(width);
118 
119       center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
120     }
121 
122     return sorted[center];  // mode := middle value in modal interval.
123   }
124 
125  private:
126   // Returns center of the densest region [c-radius, c+radius].
127   template <typename T>
CenterOfIntervalWithMinSlope(const T * HWY_RESTRICT sorted,const int64_t total_values,const int64_t center,const int64_t width)128   static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
129       const T* HWY_RESTRICT sorted, const int64_t total_values,
130       const int64_t center, const int64_t width) {
131     const int64_t radius = Half(width);
132 
133     auto compute_slope = [radius, total_values, sorted](
134                              int64_t c, int64_t* actual_center = nullptr) {
135       // For symmetry, check 2*radius+1 values, i.e. [min, max].
136       const int64_t min = std::max(c - radius, int64_t(0));
137       const int64_t max = std::min(c + radius, total_values - 1);
138       HWY_ASSERT(min < max);
139       HWY_ASSERT(sorted[min] <=
140                  sorted[max] + std::numeric_limits<float>::epsilon());
141       const float dx = max - min + 1;
142       const float slope = (sorted[max] - sorted[min]) / dx;
143 
144       if (actual_center != nullptr) {
145         // c may be out of bounds, so return center of the clamped bounds.
146         *actual_center = Half(min + max);
147       }
148       return slope;
149     };
150 
151     // First find min_slope for all centers.
152     float min_slope = std::numeric_limits<float>::max();
153     for (int64_t c = center - radius; c <= center + radius; ++c) {
154       min_slope = std::min(min_slope, compute_slope(c));
155     }
156 
157     // Candidates := centers with slope ~= min_slope.
158     std::vector<int64_t> candidates;
159     for (int64_t c = center - radius; c <= center + radius; ++c) {
160       int64_t actual_center;
161       const float slope = compute_slope(c, &actual_center);
162       if (slope <= min_slope * 1.001f) {
163         candidates.push_back(actual_center);
164       }
165     }
166 
167     // Keep the median.
168     HWY_ASSERT(!candidates.empty());
169     if (candidates.size() == 1) return candidates[0];
170     return Median(&candidates);
171   }
172 };
173 
174 // Robust estimator of the mode for data given as a CDF.
175 // O(N*logN), N=num_bins.
176 class HalfRangeMode {
177  public:
178   // Returns mode expressed as a histogram bin index. "cdf" must be weakly
179   // monotonically increasing, e.g. from std::partial_sum.
operator()180   int operator()(const uint32_t* HWY_RESTRICT cdf,
181                  const size_t num_bins) const {
182     int center = num_bins / 2;
183     int width = num_bins;
184 
185     // Zoom in on modal intervals of decreasing width. Stop before we reach
186     // width=1, i.e. original bins, because those are noisy.
187     while (width > 2) {
188       // Round up so we can still reach the outer edges of odd widths.
189       width = Half(width);
190 
191       center = CenterOfIntervalWithMaxDensity(cdf, num_bins, center, width);
192     }
193 
194     return center;  // mode := midpoint of modal interval.
195   }
196 
197  private:
198   // Returns center of the densest interval [c-radius, c+radius].
CenterOfIntervalWithMaxDensity(const uint32_t * HWY_RESTRICT cdf,const int total_bins,const int center,const int width)199   static HWY_INLINE int CenterOfIntervalWithMaxDensity(
200       const uint32_t* HWY_RESTRICT cdf, const int total_bins, const int center,
201       const int width) {
202     const int radius = Half(width);
203 
204     auto compute_density = [radius, total_bins, cdf](
205                                int c, int* actual_center = nullptr) {
206       // For symmetry, check 2*radius+1 bins, i.e. [min, max].
207       const int min = std::max(c - radius, 1);  // for -1 below
208       const int max = std::min(c + radius, total_bins - 1);
209       HWY_ASSERT(min < max);
210       HWY_ASSERT(cdf[min] <= cdf[max - 1]);
211       const int num_bins = max - min + 1;
212       // Sum over [min, max] == CDF(max) - CDF(min-1).
213       const float density = float(cdf[max] - cdf[min - 1]) / num_bins;
214 
215       if (actual_center != nullptr) {
216         // c may be out of bounds, so take center of the clamped bounds.
217         *actual_center = Half(min + max);
218       }
219       return density;
220     };
221 
222     // First find max_density for all centers.
223     float max_density = 0.0f;
224     for (int c = center - radius; c <= center + radius; ++c) {
225       max_density = std::max(max_density, compute_density(c));
226     }
227 
228     // Candidates := centers with density ~= max_density.
229     std::vector<int> candidates;
230     for (int c = center - radius; c <= center + radius; ++c) {
231       int actual_center;
232       const float density = compute_density(c, &actual_center);
233       if (density >= max_density * 0.999f) {
234         candidates.push_back(actual_center);
235       }
236     }
237 
238     // Keep the median.
239     HWY_ASSERT(!candidates.empty());
240     if (candidates.size() == 1) return candidates[0];
241     return Median(&candidates);
242   }
243 };
244 
245 // Sorts integral values in ascending order. About 3x faster than std::sort for
246 // input distributions with very few unique values.
247 template <class T>
CountingSort(T * begin,T * end)248 void CountingSort(T* begin, T* end) {
249   // Unique values and their frequency (similar to flat_map).
250   using Unique = std::pair<T, int>;
251   std::vector<Unique> unique;
252   for (const T* p = begin; p != end; ++p) {
253     const T value = *p;
254     const auto pos =
255         std::find_if(unique.begin(), unique.end(),
256                      [value](const Unique& u) { return u.first == value; });
257     if (pos == unique.end()) {
258       unique.push_back(std::make_pair(*p, 1));
259     } else {
260       ++pos->second;
261     }
262   }
263 
264   // Sort in ascending order of value (pair.first).
265   std::sort(unique.begin(), unique.end());
266 
267   // Write that many copies of each unique value to the array.
268   T* HWY_RESTRICT p = begin;
269   for (const auto& value_count : unique) {
270     std::fill(p, p + value_count.second, value_count.first);
271     p += value_count.second;
272   }
273   HWY_ASSERT(p == end);
274 }
275 
276 struct Bivariate {
BivariateBivariate277   Bivariate(float x, float y) : x(x), y(y) {}
278   float x;
279   float y;
280 };
281 
282 class Line {
283  public:
Line(const float slope,const float intercept)284   constexpr Line(const float slope, const float intercept)
285       : slope_(slope), intercept_(intercept) {}
286 
slope()287   constexpr float slope() const { return slope_; }
intercept()288   constexpr float intercept() const { return intercept_; }
289 
290   // Robust line fit using Siegel's repeated-median algorithm.
Line(const std::vector<Bivariate> & points)291   explicit Line(const std::vector<Bivariate>& points) {
292     const size_t N = points.size();
293     // This straightforward N^2 implementation is OK for small N.
294     HWY_ASSERT(N < 10 * 1000);
295 
296     // One for every point i.
297     std::vector<float> medians;
298     medians.reserve(N);
299 
300     // One for every j != i. Never cleared to avoid reallocation.
301     std::vector<float> slopes(N - 1);
302 
303     for (size_t i = 0; i < N; ++i) {
304       // Index within slopes[] (avoids the hole where j == i).
305       size_t idx_slope = 0;
306 
307       for (size_t j = 0; j < N; ++j) {
308         if (j == i) continue;
309 
310         const float dy = points[j].y - points[i].y;
311         const float dx = points[j].x - points[i].x;
312         HWY_ASSERT(std::abs(dx) > 1E-7f);  // x must be distinct
313         slopes[idx_slope++] = dy / dx;
314       }
315       HWY_ASSERT(idx_slope == N - 1);
316 
317       const float median = Median(&slopes);
318       medians.push_back(median);
319     }
320 
321     slope_ = Median(&medians);
322 
323     // Solve for intercept, overwriting medians[].
324     for (size_t i = 0; i < N; ++i) {
325       medians[i] = points[i].y - slope_ * points[i].x;
326     }
327     intercept_ = Median(&medians);
328   }
329 
operator()330   constexpr float operator()(float x) const { return x * slope_ + intercept_; }
331 
332  private:
333   float slope_;
334   float intercept_;
335 };
336 
EvaluateQuality(const Line & line,const std::vector<Bivariate> & points,float * HWY_RESTRICT max_l1,float * HWY_RESTRICT median_abs_deviation)337 static inline void EvaluateQuality(const Line& line,
338                                    const std::vector<Bivariate>& points,
339                                    float* HWY_RESTRICT max_l1,
340                                    float* HWY_RESTRICT median_abs_deviation) {
341   // For computing median_abs_deviation.
342   std::vector<float> abs_deviations;
343   abs_deviations.reserve(points.size());
344 
345   *max_l1 = 0.0f;
346   for (const Bivariate& point : points) {
347     const float l1 = std::abs(line(point.x) - point.y);
348     *max_l1 = std::max(*max_l1, l1);
349     abs_deviations.push_back(l1);
350   }
351 
352   *median_abs_deviation = Median(&abs_deviations);
353 }
354 
355 }  // namespace jxl
356 
357 #endif  // LIB_JXL_BASE_ROBUST_STATISTICS_H_
358