1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 #ifndef LIB_JXL_BASE_ROBUST_STATISTICS_H_
7 #define LIB_JXL_BASE_ROBUST_STATISTICS_H_
8
9 // Robust statistics: Mode, Median, MedianAbsoluteDeviation.
10
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <stdlib.h>
14
15 #include <algorithm>
16 #include <cmath>
17 #include <hwy/base.h>
18 #include <limits>
19 #include <type_traits>
20 #include <utility>
21 #include <vector>
22 namespace jxl {
23
24 template <typename T>
Geomean(const T * items,size_t count)25 T Geomean(const T* items, size_t count) {
26 double product = 1.0;
27 for (size_t i = 0; i < count; ++i) {
28 product *= items[i];
29 }
30 return static_cast<T>(std::pow(product, 1.0 / count));
31 }
32
33 // Round up for integers
34 template <class T, typename std::enable_if<
35 std::numeric_limits<T>::is_integer>::type* = nullptr>
Half(T x)36 inline T Half(T x) {
37 return (x + 1) / 2;
38 }
39
40 // Mul is faster than div.
41 template <class T, typename std::enable_if<
42 !std::numeric_limits<T>::is_integer>::type* = nullptr>
Half(T x)43 inline T Half(T x) {
44 return x * T(0.5);
45 }
46
47 // Returns the median value. Side effect: values <= median will appear before,
48 // values >= median after the middle index.
49 // Guarantees average speed O(num_values).
50 template <typename T>
Median(T * samples,const size_t num_samples)51 T Median(T* samples, const size_t num_samples) {
52 HWY_ASSERT(num_samples != 0);
53 std::nth_element(samples, samples + num_samples / 2, samples + num_samples);
54 T result = samples[num_samples / 2];
55 // If even size, find largest element in the partially sorted vector to
56 // use as second element to average with
57 if ((num_samples & 1) == 0) {
58 T biggest = *std::max_element(samples, samples + num_samples / 2);
59 result = Half(result + biggest);
60 }
61 return result;
62 }
63
64 template <typename T>
Median(std::vector<T> * samples)65 T Median(std::vector<T>* samples) {
66 return Median(samples->data(), samples->size());
67 }
68
69 template <typename T>
Median3(const T a,const T b,const T c)70 static inline T Median3(const T a, const T b, const T c) {
71 return std::max(std::min(a, b), std::min(c, std::max(a, b)));
72 }
73
74 template <typename T>
Median5(const T a,const T b,const T c,const T d,const T e)75 static inline T Median5(const T a, const T b, const T c, const T d, const T e) {
76 return Median3(e, std::max(std::min(a, b), std::min(c, d)),
77 std::min(std::max(a, b), std::max(c, d)));
78 }
79
80 // Returns a robust measure of variability.
81 template <typename T>
MedianAbsoluteDeviation(const T * samples,const size_t num_samples,const T median)82 T MedianAbsoluteDeviation(const T* samples, const size_t num_samples,
83 const T median) {
84 HWY_ASSERT(num_samples != 0);
85 std::vector<T> abs_deviations;
86 abs_deviations.reserve(num_samples);
87 for (size_t i = 0; i < num_samples; ++i) {
88 abs_deviations.push_back(std::abs(samples[i] - median));
89 }
90 return Median(&abs_deviations);
91 }
92
93 template <typename T>
MedianAbsoluteDeviation(const std::vector<T> & samples,const T median)94 T MedianAbsoluteDeviation(const std::vector<T>& samples, const T median) {
95 return MedianAbsoluteDeviation(samples.data(), samples.size(), median);
96 }
97
98 // Half{Range/Sample}Mode are implementations of "Robust estimators of the mode
99 // and skewness of continuous data". The mode is less affected by outliers in
100 // highly-skewed distributions than the median.
101
102 // Robust estimator of the mode for data given as sorted values.
103 // O(N*logN), N=num_values.
104 class HalfSampleMode {
105 public:
106 // Returns mode. "sorted" must be in ascending order.
107 template <typename T>
operator()108 T operator()(const T* const HWY_RESTRICT sorted,
109 const size_t num_values) const {
110 int64_t center = num_values / 2;
111 int64_t width = num_values;
112
113 // Zoom in on modal intervals of decreasing width. Stop before we reach
114 // width=1, i.e. single values, for which there is no "slope".
115 while (width > 2) {
116 // Round up so we can still reach the outer edges of odd widths.
117 width = Half(width);
118
119 center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
120 }
121
122 return sorted[center]; // mode := middle value in modal interval.
123 }
124
125 private:
126 // Returns center of the densest region [c-radius, c+radius].
127 template <typename T>
CenterOfIntervalWithMinSlope(const T * HWY_RESTRICT sorted,const int64_t total_values,const int64_t center,const int64_t width)128 static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
129 const T* HWY_RESTRICT sorted, const int64_t total_values,
130 const int64_t center, const int64_t width) {
131 const int64_t radius = Half(width);
132
133 auto compute_slope = [radius, total_values, sorted](
134 int64_t c, int64_t* actual_center = nullptr) {
135 // For symmetry, check 2*radius+1 values, i.e. [min, max].
136 const int64_t min = std::max(c - radius, int64_t(0));
137 const int64_t max = std::min(c + radius, total_values - 1);
138 HWY_ASSERT(min < max);
139 HWY_ASSERT(sorted[min] <=
140 sorted[max] + std::numeric_limits<float>::epsilon());
141 const float dx = max - min + 1;
142 const float slope = (sorted[max] - sorted[min]) / dx;
143
144 if (actual_center != nullptr) {
145 // c may be out of bounds, so return center of the clamped bounds.
146 *actual_center = Half(min + max);
147 }
148 return slope;
149 };
150
151 // First find min_slope for all centers.
152 float min_slope = std::numeric_limits<float>::max();
153 for (int64_t c = center - radius; c <= center + radius; ++c) {
154 min_slope = std::min(min_slope, compute_slope(c));
155 }
156
157 // Candidates := centers with slope ~= min_slope.
158 std::vector<int64_t> candidates;
159 for (int64_t c = center - radius; c <= center + radius; ++c) {
160 int64_t actual_center;
161 const float slope = compute_slope(c, &actual_center);
162 if (slope <= min_slope * 1.001f) {
163 candidates.push_back(actual_center);
164 }
165 }
166
167 // Keep the median.
168 HWY_ASSERT(!candidates.empty());
169 if (candidates.size() == 1) return candidates[0];
170 return Median(&candidates);
171 }
172 };
173
174 // Robust estimator of the mode for data given as a CDF.
175 // O(N*logN), N=num_bins.
176 class HalfRangeMode {
177 public:
178 // Returns mode expressed as a histogram bin index. "cdf" must be weakly
179 // monotonically increasing, e.g. from std::partial_sum.
operator()180 int operator()(const uint32_t* HWY_RESTRICT cdf,
181 const size_t num_bins) const {
182 int center = num_bins / 2;
183 int width = num_bins;
184
185 // Zoom in on modal intervals of decreasing width. Stop before we reach
186 // width=1, i.e. original bins, because those are noisy.
187 while (width > 2) {
188 // Round up so we can still reach the outer edges of odd widths.
189 width = Half(width);
190
191 center = CenterOfIntervalWithMaxDensity(cdf, num_bins, center, width);
192 }
193
194 return center; // mode := midpoint of modal interval.
195 }
196
197 private:
198 // Returns center of the densest interval [c-radius, c+radius].
CenterOfIntervalWithMaxDensity(const uint32_t * HWY_RESTRICT cdf,const int total_bins,const int center,const int width)199 static HWY_INLINE int CenterOfIntervalWithMaxDensity(
200 const uint32_t* HWY_RESTRICT cdf, const int total_bins, const int center,
201 const int width) {
202 const int radius = Half(width);
203
204 auto compute_density = [radius, total_bins, cdf](
205 int c, int* actual_center = nullptr) {
206 // For symmetry, check 2*radius+1 bins, i.e. [min, max].
207 const int min = std::max(c - radius, 1); // for -1 below
208 const int max = std::min(c + radius, total_bins - 1);
209 HWY_ASSERT(min < max);
210 HWY_ASSERT(cdf[min] <= cdf[max - 1]);
211 const int num_bins = max - min + 1;
212 // Sum over [min, max] == CDF(max) - CDF(min-1).
213 const float density = float(cdf[max] - cdf[min - 1]) / num_bins;
214
215 if (actual_center != nullptr) {
216 // c may be out of bounds, so take center of the clamped bounds.
217 *actual_center = Half(min + max);
218 }
219 return density;
220 };
221
222 // First find max_density for all centers.
223 float max_density = 0.0f;
224 for (int c = center - radius; c <= center + radius; ++c) {
225 max_density = std::max(max_density, compute_density(c));
226 }
227
228 // Candidates := centers with density ~= max_density.
229 std::vector<int> candidates;
230 for (int c = center - radius; c <= center + radius; ++c) {
231 int actual_center;
232 const float density = compute_density(c, &actual_center);
233 if (density >= max_density * 0.999f) {
234 candidates.push_back(actual_center);
235 }
236 }
237
238 // Keep the median.
239 HWY_ASSERT(!candidates.empty());
240 if (candidates.size() == 1) return candidates[0];
241 return Median(&candidates);
242 }
243 };
244
245 // Sorts integral values in ascending order. About 3x faster than std::sort for
246 // input distributions with very few unique values.
247 template <class T>
CountingSort(T * begin,T * end)248 void CountingSort(T* begin, T* end) {
249 // Unique values and their frequency (similar to flat_map).
250 using Unique = std::pair<T, int>;
251 std::vector<Unique> unique;
252 for (const T* p = begin; p != end; ++p) {
253 const T value = *p;
254 const auto pos =
255 std::find_if(unique.begin(), unique.end(),
256 [value](const Unique& u) { return u.first == value; });
257 if (pos == unique.end()) {
258 unique.push_back(std::make_pair(*p, 1));
259 } else {
260 ++pos->second;
261 }
262 }
263
264 // Sort in ascending order of value (pair.first).
265 std::sort(unique.begin(), unique.end());
266
267 // Write that many copies of each unique value to the array.
268 T* HWY_RESTRICT p = begin;
269 for (const auto& value_count : unique) {
270 std::fill(p, p + value_count.second, value_count.first);
271 p += value_count.second;
272 }
273 HWY_ASSERT(p == end);
274 }
275
276 struct Bivariate {
BivariateBivariate277 Bivariate(float x, float y) : x(x), y(y) {}
278 float x;
279 float y;
280 };
281
282 class Line {
283 public:
Line(const float slope,const float intercept)284 constexpr Line(const float slope, const float intercept)
285 : slope_(slope), intercept_(intercept) {}
286
slope()287 constexpr float slope() const { return slope_; }
intercept()288 constexpr float intercept() const { return intercept_; }
289
290 // Robust line fit using Siegel's repeated-median algorithm.
Line(const std::vector<Bivariate> & points)291 explicit Line(const std::vector<Bivariate>& points) {
292 const size_t N = points.size();
293 // This straightforward N^2 implementation is OK for small N.
294 HWY_ASSERT(N < 10 * 1000);
295
296 // One for every point i.
297 std::vector<float> medians;
298 medians.reserve(N);
299
300 // One for every j != i. Never cleared to avoid reallocation.
301 std::vector<float> slopes(N - 1);
302
303 for (size_t i = 0; i < N; ++i) {
304 // Index within slopes[] (avoids the hole where j == i).
305 size_t idx_slope = 0;
306
307 for (size_t j = 0; j < N; ++j) {
308 if (j == i) continue;
309
310 const float dy = points[j].y - points[i].y;
311 const float dx = points[j].x - points[i].x;
312 HWY_ASSERT(std::abs(dx) > 1E-7f); // x must be distinct
313 slopes[idx_slope++] = dy / dx;
314 }
315 HWY_ASSERT(idx_slope == N - 1);
316
317 const float median = Median(&slopes);
318 medians.push_back(median);
319 }
320
321 slope_ = Median(&medians);
322
323 // Solve for intercept, overwriting medians[].
324 for (size_t i = 0; i < N; ++i) {
325 medians[i] = points[i].y - slope_ * points[i].x;
326 }
327 intercept_ = Median(&medians);
328 }
329
operator()330 constexpr float operator()(float x) const { return x * slope_ + intercept_; }
331
332 private:
333 float slope_;
334 float intercept_;
335 };
336
EvaluateQuality(const Line & line,const std::vector<Bivariate> & points,float * HWY_RESTRICT max_l1,float * HWY_RESTRICT median_abs_deviation)337 static inline void EvaluateQuality(const Line& line,
338 const std::vector<Bivariate>& points,
339 float* HWY_RESTRICT max_l1,
340 float* HWY_RESTRICT median_abs_deviation) {
341 // For computing median_abs_deviation.
342 std::vector<float> abs_deviations;
343 abs_deviations.reserve(points.size());
344
345 *max_l1 = 0.0f;
346 for (const Bivariate& point : points) {
347 const float l1 = std::abs(line(point.x) - point.y);
348 *max_l1 = std::max(*max_l1, l1);
349 abs_deviations.push_back(l1);
350 }
351
352 *median_abs_deviation = Median(&abs_deviations);
353 }
354
355 } // namespace jxl
356
357 #endif // LIB_JXL_BASE_ROBUST_STATISTICS_H_
358