1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 //
6 // Author: Jyrki Alakuijala (jyrki.alakuijala@gmail.com)
7 //
8 // The physical architecture of butteraugli is based on the following naming
9 // convention:
10 //   * Opsin - dynamics of the photosensitive chemicals in the retina
11 //             with their immediate electrical processing
12 //   * Xyb - hybrid opponent/trichromatic color space
13 //     x is roughly red-subtract-green.
14 //     y is yellow.
15 //     b is blue.
16 //     Xyb values are computed from Opsin mixing, not directly from rgb.
17 //   * Mask - for visual masking
18 //   * Hf - color modeling for spatially high-frequency features
19 //   * Lf - color modeling for spatially low-frequency features
20 //   * Diffmap - to cluster and build an image of error between the images
21 //   * Blur - to hold the smoothing code
22 
23 #include "lib/jxl/butteraugli/butteraugli.h"
24 
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include <algorithm>
31 #include <array>
32 #include <cmath>
33 #include <new>
34 #include <vector>
35 
36 #undef HWY_TARGET_INCLUDE
37 #define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
38 #include <hwy/foreach_target.h>
39 
40 #include "lib/jxl/base/profiler.h"
41 #include "lib/jxl/base/status.h"
42 #if PROFILER_ENABLED
43 #include "lib/jxl/base/time.h"
44 #endif  // PROFILER_ENABLED
45 #include "lib/jxl/convolve.h"
46 #include "lib/jxl/fast_math-inl.h"
47 #include "lib/jxl/gauss_blur.h"
48 #include "lib/jxl/image_ops.h"
49 
50 #ifndef JXL_BUTTERAUGLI_ONCE
51 #define JXL_BUTTERAUGLI_ONCE
52 
53 namespace jxl {
54 
ComputeKernel(float sigma)55 std::vector<float> ComputeKernel(float sigma) {
56   const float m = 2.25;  // Accuracy increases when m is increased.
57   const double scaler = -1.0 / (2.0 * sigma * sigma);
58   const int diff = std::max<int>(1, m * std::fabs(sigma));
59   std::vector<float> kernel(2 * diff + 1);
60   for (int i = -diff; i <= diff; ++i) {
61     kernel[i + diff] = std::exp(scaler * i * i);
62   }
63   return kernel;
64 }
65 
ConvolveBorderColumn(const ImageF & in,const std::vector<float> & kernel,const size_t x,float * BUTTERAUGLI_RESTRICT row_out)66 void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
67                           const size_t x, float* BUTTERAUGLI_RESTRICT row_out) {
68   const size_t offset = kernel.size() / 2;
69   int minx = x < offset ? 0 : x - offset;
70   int maxx = std::min<int>(in.xsize() - 1, x + offset);
71   float weight = 0.0f;
72   for (int j = minx; j <= maxx; ++j) {
73     weight += kernel[j - x + offset];
74   }
75   float scale = 1.0f / weight;
76   for (size_t y = 0; y < in.ysize(); ++y) {
77     const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
78     float sum = 0.0f;
79     for (int j = minx; j <= maxx; ++j) {
80       sum += row_in[j] * kernel[j - x + offset];
81     }
82     row_out[y] = sum * scale;
83   }
84 }
85 
86 // Computes a horizontal convolution and transposes the result.
ConvolutionWithTranspose(const ImageF & in,const std::vector<float> & kernel,ImageF * BUTTERAUGLI_RESTRICT out)87 void ConvolutionWithTranspose(const ImageF& in,
88                               const std::vector<float>& kernel,
89                               ImageF* BUTTERAUGLI_RESTRICT out) {
90   PROFILER_FUNC;
91   JXL_CHECK(out->xsize() == in.ysize());
92   JXL_CHECK(out->ysize() == in.xsize());
93   const size_t len = kernel.size();
94   const size_t offset = len / 2;
95   float weight_no_border = 0.0f;
96   for (size_t j = 0; j < len; ++j) {
97     weight_no_border += kernel[j];
98   }
99   const float scale_no_border = 1.0f / weight_no_border;
100   const size_t border1 = std::min(in.xsize(), offset);
101   const size_t border2 = in.xsize() > offset ? in.xsize() - offset : 0;
102   std::vector<float> scaled_kernel(len / 2 + 1);
103   for (size_t i = 0; i <= len / 2; ++i) {
104     scaled_kernel[i] = kernel[i] * scale_no_border;
105   }
106 
107   // middle
108   switch (len) {
109 #if 1  // speed-optimized version
110     case 7: {
111       PROFILER_ZONE("conv7");
112       const float sk0 = scaled_kernel[0];
113       const float sk1 = scaled_kernel[1];
114       const float sk2 = scaled_kernel[2];
115       const float sk3 = scaled_kernel[3];
116       for (size_t y = 0; y < in.ysize(); ++y) {
117         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
118         for (size_t x = border1; x < border2; ++x, ++row_in) {
119           const float sum0 = (row_in[0] + row_in[6]) * sk0;
120           const float sum1 = (row_in[1] + row_in[5]) * sk1;
121           const float sum2 = (row_in[2] + row_in[4]) * sk2;
122           const float sum = (row_in[3]) * sk3 + sum0 + sum1 + sum2;
123           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
124           row_out[y] = sum;
125         }
126       }
127     } break;
128     case 13: {
129       PROFILER_ZONE("conv15");
130       for (size_t y = 0; y < in.ysize(); ++y) {
131         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
132         for (size_t x = border1; x < border2; ++x, ++row_in) {
133           float sum0 = (row_in[0] + row_in[12]) * scaled_kernel[0];
134           float sum1 = (row_in[1] + row_in[11]) * scaled_kernel[1];
135           float sum2 = (row_in[2] + row_in[10]) * scaled_kernel[2];
136           float sum3 = (row_in[3] + row_in[9]) * scaled_kernel[3];
137           sum0 += (row_in[4] + row_in[8]) * scaled_kernel[4];
138           sum1 += (row_in[5] + row_in[7]) * scaled_kernel[5];
139           const float sum = (row_in[6]) * scaled_kernel[6];
140           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
141           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
142         }
143       }
144       break;
145     }
146     case 15: {
147       PROFILER_ZONE("conv15");
148       for (size_t y = 0; y < in.ysize(); ++y) {
149         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
150         for (size_t x = border1; x < border2; ++x, ++row_in) {
151           float sum0 = (row_in[0] + row_in[14]) * scaled_kernel[0];
152           float sum1 = (row_in[1] + row_in[13]) * scaled_kernel[1];
153           float sum2 = (row_in[2] + row_in[12]) * scaled_kernel[2];
154           float sum3 = (row_in[3] + row_in[11]) * scaled_kernel[3];
155           sum0 += (row_in[4] + row_in[10]) * scaled_kernel[4];
156           sum1 += (row_in[5] + row_in[9]) * scaled_kernel[5];
157           sum2 += (row_in[6] + row_in[8]) * scaled_kernel[6];
158           const float sum = (row_in[7]) * scaled_kernel[7];
159           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
160           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
161         }
162       }
163       break;
164     }
165     case 25: {
166       PROFILER_ZONE("conv25");
167       for (size_t y = 0; y < in.ysize(); ++y) {
168         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
169         for (size_t x = border1; x < border2; ++x, ++row_in) {
170           float sum0 = (row_in[0] + row_in[24]) * scaled_kernel[0];
171           float sum1 = (row_in[1] + row_in[23]) * scaled_kernel[1];
172           float sum2 = (row_in[2] + row_in[22]) * scaled_kernel[2];
173           float sum3 = (row_in[3] + row_in[21]) * scaled_kernel[3];
174           sum0 += (row_in[4] + row_in[20]) * scaled_kernel[4];
175           sum1 += (row_in[5] + row_in[19]) * scaled_kernel[5];
176           sum2 += (row_in[6] + row_in[18]) * scaled_kernel[6];
177           sum3 += (row_in[7] + row_in[17]) * scaled_kernel[7];
178           sum0 += (row_in[8] + row_in[16]) * scaled_kernel[8];
179           sum1 += (row_in[9] + row_in[15]) * scaled_kernel[9];
180           sum2 += (row_in[10] + row_in[14]) * scaled_kernel[10];
181           sum3 += (row_in[11] + row_in[13]) * scaled_kernel[11];
182           const float sum = (row_in[12]) * scaled_kernel[12];
183           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
184           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
185         }
186       }
187       break;
188     }
189     case 33: {
190       PROFILER_ZONE("conv33");
191       for (size_t y = 0; y < in.ysize(); ++y) {
192         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
193         for (size_t x = border1; x < border2; ++x, ++row_in) {
194           float sum0 = (row_in[0] + row_in[32]) * scaled_kernel[0];
195           float sum1 = (row_in[1] + row_in[31]) * scaled_kernel[1];
196           float sum2 = (row_in[2] + row_in[30]) * scaled_kernel[2];
197           float sum3 = (row_in[3] + row_in[29]) * scaled_kernel[3];
198           sum0 += (row_in[4] + row_in[28]) * scaled_kernel[4];
199           sum1 += (row_in[5] + row_in[27]) * scaled_kernel[5];
200           sum2 += (row_in[6] + row_in[26]) * scaled_kernel[6];
201           sum3 += (row_in[7] + row_in[25]) * scaled_kernel[7];
202           sum0 += (row_in[8] + row_in[24]) * scaled_kernel[8];
203           sum1 += (row_in[9] + row_in[23]) * scaled_kernel[9];
204           sum2 += (row_in[10] + row_in[22]) * scaled_kernel[10];
205           sum3 += (row_in[11] + row_in[21]) * scaled_kernel[11];
206           sum0 += (row_in[12] + row_in[20]) * scaled_kernel[12];
207           sum1 += (row_in[13] + row_in[19]) * scaled_kernel[13];
208           sum2 += (row_in[14] + row_in[18]) * scaled_kernel[14];
209           sum3 += (row_in[15] + row_in[17]) * scaled_kernel[15];
210           const float sum = (row_in[16]) * scaled_kernel[16];
211           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
212           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
213         }
214       }
215       break;
216     }
217     case 37: {
218       PROFILER_ZONE("conv37");
219       for (size_t y = 0; y < in.ysize(); ++y) {
220         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
221         for (size_t x = border1; x < border2; ++x, ++row_in) {
222           float sum0 = (row_in[0] + row_in[36]) * scaled_kernel[0];
223           float sum1 = (row_in[1] + row_in[35]) * scaled_kernel[1];
224           float sum2 = (row_in[2] + row_in[34]) * scaled_kernel[2];
225           float sum3 = (row_in[3] + row_in[33]) * scaled_kernel[3];
226           sum0 += (row_in[4] + row_in[32]) * scaled_kernel[4];
227           sum0 += (row_in[5] + row_in[31]) * scaled_kernel[5];
228           sum0 += (row_in[6] + row_in[30]) * scaled_kernel[6];
229           sum0 += (row_in[7] + row_in[29]) * scaled_kernel[7];
230           sum0 += (row_in[8] + row_in[28]) * scaled_kernel[8];
231           sum1 += (row_in[9] + row_in[27]) * scaled_kernel[9];
232           sum2 += (row_in[10] + row_in[26]) * scaled_kernel[10];
233           sum3 += (row_in[11] + row_in[25]) * scaled_kernel[11];
234           sum0 += (row_in[12] + row_in[24]) * scaled_kernel[12];
235           sum1 += (row_in[13] + row_in[23]) * scaled_kernel[13];
236           sum2 += (row_in[14] + row_in[22]) * scaled_kernel[14];
237           sum3 += (row_in[15] + row_in[21]) * scaled_kernel[15];
238           sum0 += (row_in[16] + row_in[20]) * scaled_kernel[16];
239           sum1 += (row_in[17] + row_in[19]) * scaled_kernel[17];
240           const float sum = (row_in[18]) * scaled_kernel[18];
241           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
242           row_out[y] = sum + sum0 + sum1 + sum2 + sum3;
243         }
244       }
245       break;
246     }
247     default:
248       printf("Warning: Unexpected kernel size! %zu\n", len);
249 #else
250     default:
251 #endif
252       for (size_t y = 0; y < in.ysize(); ++y) {
253         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
254         for (size_t x = border1; x < border2; ++x) {
255           const int d = x - offset;
256           float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
257           float sum = 0.0f;
258           size_t j;
259           for (j = 0; j <= len / 2; ++j) {
260             sum += row_in[d + j] * scaled_kernel[j];
261           }
262           for (; j < len; ++j) {
263             sum += row_in[d + j] * scaled_kernel[len - 1 - j];
264           }
265           row_out[y] = sum;
266         }
267       }
268   }
269   // left border
270   for (size_t x = 0; x < border1; ++x) {
271     ConvolveBorderColumn(in, kernel, x, out->Row(x));
272   }
273 
274   // right border
275   for (size_t x = border2; x < in.xsize(); ++x) {
276     ConvolveBorderColumn(in, kernel, x, out->Row(x));
277   }
278 }
279 
280 // Separate horizontal and vertical (next function) convolution passes.
BlurHorizontalConv(const ImageF & in,const intptr_t xbegin,const intptr_t xend,const intptr_t ybegin,const intptr_t yend,const std::vector<float> & kernel,ImageF * out)281 void BlurHorizontalConv(const ImageF& in, const intptr_t xbegin,
282                         const intptr_t xend, const intptr_t ybegin,
283                         const intptr_t yend, const std::vector<float>& kernel,
284                         ImageF* out) {
285   if (xbegin >= xend || ybegin >= yend) return;
286   const intptr_t xsize = in.xsize();
287   const intptr_t ysize = in.ysize();
288   JXL_ASSERT(0 <= xbegin && xend <= xsize);
289   JXL_ASSERT(0 <= ybegin && yend <= ysize);
290   (void)xsize;
291   (void)ysize;
292   const intptr_t radius = kernel.size() / 2;
293 
294   for (intptr_t y = ybegin; y < yend; ++y) {
295     float* JXL_RESTRICT row_out = out->Row(y);
296     for (intptr_t x = xbegin; x < xend; ++x) {
297       float sum = 0.0f;
298       float sum_weights = 0.0f;
299       const float* JXL_RESTRICT row_in = in.Row(y);
300       for (intptr_t ix = -radius; ix <= radius; ++ix) {
301         const intptr_t in_x = x + ix;
302         if (in_x < 0 || in_x >= xsize) continue;
303         const float weight_x = kernel[ix + radius];
304         sum += row_in[in_x] * weight_x;
305         sum_weights += weight_x;
306       }
307       row_out[x] = sum / sum_weights;
308     }
309   }
310 }
311 
BlurVerticalConv(const ImageF & in,const intptr_t xbegin,const intptr_t xend,const intptr_t ybegin,const intptr_t yend,const std::vector<float> & kernel,ImageF * out)312 void BlurVerticalConv(const ImageF& in, const intptr_t xbegin,
313                       const intptr_t xend, const intptr_t ybegin,
314                       const intptr_t yend, const std::vector<float>& kernel,
315                       ImageF* out) {
316   if (xbegin >= xend || ybegin >= yend) return;
317   const intptr_t xsize = in.xsize();
318   const intptr_t ysize = in.ysize();
319   JXL_ASSERT(0 <= xbegin && xend <= xsize);
320   JXL_ASSERT(0 <= ybegin && yend <= ysize);
321   (void)xsize;
322   const intptr_t radius = kernel.size() / 2;
323   for (intptr_t y = ybegin; y < yend; ++y) {
324     float* JXL_RESTRICT row_out = out->Row(y);
325     for (intptr_t x = xbegin; x < xend; ++x) {
326       float sum = 0.0f;
327       float sum_weights = 0.0f;
328       for (intptr_t iy = -radius; iy <= radius; ++iy) {
329         const intptr_t in_y = y + iy;
330         if (in_y < 0 || in_y >= ysize) continue;
331         const float weight_y = kernel[iy + radius];
332         sum += in.ConstRow(in_y)[x] * weight_y;
333         sum_weights += weight_y;
334       }
335       row_out[x] = sum / sum_weights;
336     }
337   }
338 }
339 
340 // A blur somewhat similar to a 2D Gaussian blur.
341 // See: https://en.wikipedia.org/wiki/Gaussian_blur
342 //
343 // This is a bottleneck because the sigma can be quite large (>7). We can use
344 // gauss_blur.cc (runtime independent of sigma, closer to a 4*sigma truncated
345 // Gaussian and our 2.25 in ComputeKernel), but its boundary conditions are
346 // zero-valued. This leads to noticeable differences at the edges of diffmaps.
347 // We retain a special case for 5x5 kernels (even faster than gauss_blur),
348 // optionally use gauss_blur followed by fixup of the borders for large images,
349 // or fall back to the previous truncated FIR followed by a transpose.
Blur(const ImageF & in,float sigma,const ButteraugliParams & params,BlurTemp * temp,ImageF * out)350 void Blur(const ImageF& in, float sigma, const ButteraugliParams& params,
351           BlurTemp* temp, ImageF* out) {
352   std::vector<float> kernel = ComputeKernel(sigma);
353   // Separable5 does an in-place convolution, so this fast path is not safe if
354   // in aliases out.
355   if (kernel.size() == 5 && &in != out) {
356     float sum_weights = 0.0f;
357     for (const float w : kernel) {
358       sum_weights += w;
359     }
360     const float scale = 1.0f / sum_weights;
361     const float w0 = kernel[2] * scale;
362     const float w1 = kernel[1] * scale;
363     const float w2 = kernel[0] * scale;
364     const WeightsSeparable5 weights = {
365         {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
366         {HWY_REP4(w0), HWY_REP4(w1), HWY_REP4(w2)},
367     };
368     Separable5(in, Rect(in), weights, /*pool=*/nullptr, out);
369     return;
370   }
371 
372   const bool fast_gauss = params.approximate_border;
373   const bool kBorderFixup = fast_gauss && false;
374   // Fast+fixup is actually slower for small images that are all border.
375   const bool too_small_for_fast_gauss =
376       kBorderFixup &&
377       in.xsize() * in.ysize() < 9 * kernel.size() * kernel.size();
378   // If fast gaussian is disabled, use previous transposed convolution.
379   if (!fast_gauss || too_small_for_fast_gauss) {
380     ImageF* JXL_RESTRICT temp_t = temp->GetTransposed(in);
381     ConvolutionWithTranspose(in, kernel, temp_t);
382     ConvolutionWithTranspose(*temp_t, kernel, out);
383     return;
384   }
385   auto rg = CreateRecursiveGaussian(sigma);
386   ImageF* JXL_RESTRICT temp_ = temp->Get(in);
387   ThreadPool* null_pool = nullptr;
388   FastGaussian(rg, in, null_pool, temp_, out);
389 
390   if (kBorderFixup) {
391     // Produce rg_radius extra pixels around each border
392     const intptr_t rg_radius = rg->radius;
393     const intptr_t radius = kernel.size() / 2;
394     const intptr_t xsize = in.xsize();
395     const intptr_t ysize = in.ysize();
396     const intptr_t yend_top = std::min(rg_radius + radius, ysize);
397     const intptr_t ybegin_bottom =
398         std::max(intptr_t(0), ysize - rg_radius - radius);
399     // Top (requires radius extra for the vertical pass)
400     BlurHorizontalConv(in, 0, xsize, 0, yend_top, kernel, temp_);
401     // Bottom
402     BlurHorizontalConv(in, 0, xsize, ybegin_bottom, ysize, kernel, temp_);
403     // Left/right columns between top and bottom
404     const intptr_t xbegin_right = std::max(intptr_t(0), xsize - rg_radius);
405     const intptr_t xend_left = std::min(rg_radius, xsize);
406     BlurHorizontalConv(in, 0, xend_left, yend_top, ybegin_bottom, kernel,
407                        temp_);
408     BlurHorizontalConv(in, xbegin_right, xsize, yend_top, ybegin_bottom, kernel,
409                        temp_);
410 
411     // Entire left/right columns
412     BlurVerticalConv(*temp_, 0, xend_left, 0, ysize, kernel, out);
413     BlurVerticalConv(*temp_, xbegin_right, xsize, 0, ysize, kernel, out);
414     // Top/bottom between left/right
415     const intptr_t ybegin_bottom2 = std::max(intptr_t(0), ysize - rg_radius);
416     const intptr_t yend_top2 = std::min(rg_radius, ysize);
417     BlurVerticalConv(*temp_, xend_left, xbegin_right, 0, yend_top2, kernel,
418                      out);
419     BlurVerticalConv(*temp_, xend_left, xbegin_right, ybegin_bottom2, ysize,
420                      kernel, out);
421   }
422 }
423 
424 // Allows PaddedMaltaUnit to call either function via overloading.
425 struct MaltaTagLF {};
426 struct MaltaTag {};
427 
428 }  // namespace jxl
429 
430 #endif  // JXL_BUTTERAUGLI_ONCE
431 
432 #include <hwy/highway.h>
433 HWY_BEFORE_NAMESPACE();
434 namespace jxl {
435 namespace HWY_NAMESPACE {
436 
437 // These templates are not found via ADL.
438 using hwy::HWY_NAMESPACE::Vec;
439 
440 template <class D, class V>
MaximumClamp(D d,V v,double kMaxVal)441 HWY_INLINE V MaximumClamp(D d, V v, double kMaxVal) {
442   static const double kMul = 0.724216145665;
443   const V mul = Set(d, kMul);
444   const V maxval = Set(d, kMaxVal);
445   // If greater than maxval or less than -maxval, replace with if_*.
446   const V if_pos = MulAdd(v - maxval, mul, maxval);
447   const V if_neg = MulSub(v + maxval, mul, maxval);
448   const V pos_or_v = IfThenElse(v >= maxval, if_pos, v);
449   return IfThenElse(v < Neg(maxval), if_neg, pos_or_v);
450 }
451 
452 // Make area around zero less important (remove it).
453 template <class D, class V>
RemoveRangeAroundZero(const D d,const double kw,const V x)454 HWY_INLINE V RemoveRangeAroundZero(const D d, const double kw, const V x) {
455   const auto w = Set(d, kw);
456   return IfThenElse(x > w, x - w, IfThenElseZero(x < Neg(w), x + w));
457 }
458 
459 // Make area around zero more important (2x it until the limit).
460 template <class D, class V>
AmplifyRangeAroundZero(const D d,const double kw,const V x)461 HWY_INLINE V AmplifyRangeAroundZero(const D d, const double kw, const V x) {
462   const auto w = Set(d, kw);
463   return IfThenElse(x > w, x + w, IfThenElse(x < Neg(w), x - w, x + x));
464 }
465 
466 // XybLowFreqToVals converts from low-frequency XYB space to the 'vals' space.
467 // Vals space can be converted to L2-norm space (Euclidean and normalized)
468 // through visual masking.
469 template <class D, class V>
XybLowFreqToVals(const D d,const V & x,const V & y,const V & b_arg,V * HWY_RESTRICT valx,V * HWY_RESTRICT valy,V * HWY_RESTRICT valb)470 HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
471                                  const V& b_arg, V* HWY_RESTRICT valx,
472                                  V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
473   static const double xmuli = 32.2217497012;
474   static const double ymuli = 13.7697791434;
475   static const double bmuli = 47.504615728;
476   static const double y_to_b_muli = -0.362267051518;
477   const V xmul = Set(d, xmuli);
478   const V ymul = Set(d, ymuli);
479   const V bmul = Set(d, bmuli);
480   const V y_to_b_mul = Set(d, y_to_b_muli);
481   const V b = MulAdd(y_to_b_mul, y, b_arg);
482   *valb = b * bmul;
483   *valx = x * xmul;
484   *valy = y * ymul;
485 }
486 
SuppressXByY(const ImageF & in_x,const ImageF & in_y,const double yw,ImageF * HWY_RESTRICT out)487 void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw,
488                   ImageF* HWY_RESTRICT out) {
489   JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out));
490   const size_t xsize = in_x.xsize();
491   const size_t ysize = in_x.ysize();
492 
493   const HWY_FULL(float) d;
494   static const double s = 0.653020556257;
495   const auto sv = Set(d, s);
496   const auto one_minus_s = Set(d, 1.0 - s);
497   const auto ywv = Set(d, yw);
498 
499   for (size_t y = 0; y < ysize; ++y) {
500     const float* HWY_RESTRICT row_x = in_x.ConstRow(y);
501     const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
502     float* HWY_RESTRICT row_out = out->Row(y);
503 
504     for (size_t x = 0; x < xsize; x += Lanes(d)) {
505       const auto vx = Load(d, row_x + x);
506       const auto vy = Load(d, row_y + x);
507       const auto scaler = MulAdd(ywv / MulAdd(vy, vy, ywv), one_minus_s, sv);
508       Store(scaler * vx, d, row_out + x);
509     }
510   }
511 }
512 
SeparateFrequencies(size_t xsize,size_t ysize,const ButteraugliParams & params,BlurTemp * blur_temp,const Image3F & xyb,PsychoImage & ps)513 static void SeparateFrequencies(size_t xsize, size_t ysize,
514                                 const ButteraugliParams& params,
515                                 BlurTemp* blur_temp, const Image3F& xyb,
516                                 PsychoImage& ps) {
517   PROFILER_FUNC;
518   const HWY_FULL(float) d;
519 
520   // Extract lf ...
521   static const double kSigmaLf = 7.15593339443;
522   static const double kSigmaHf = 3.22489901262;
523   static const double kSigmaUhf = 1.56416327805;
524   ps.mf = Image3F(xsize, ysize);
525   ps.hf[0] = ImageF(xsize, ysize);
526   ps.hf[1] = ImageF(xsize, ysize);
527   ps.lf = Image3F(xyb.xsize(), xyb.ysize());
528   ps.mf = Image3F(xyb.xsize(), xyb.ysize());
529   for (int i = 0; i < 3; ++i) {
530     Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i));
531 
532     // ... and keep everything else in mf.
533     for (size_t y = 0; y < ysize; ++y) {
534       const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y);
535       const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y);
536       float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
537       for (size_t x = 0; x < xsize; x += Lanes(d)) {
538         const auto mf = Load(d, row_xyb + x) - Load(d, row_lf + x);
539         Store(mf, d, row_mf + x);
540       }
541     }
542     if (i == 2) {
543       Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
544       break;
545     }
546     // Divide mf into mf and hf.
547     for (size_t y = 0; y < ysize; ++y) {
548       float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
549       float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
550       for (size_t x = 0; x < xsize; x += Lanes(d)) {
551         Store(Load(d, row_mf + x), d, row_hf + x);
552       }
553     }
554     Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
555     static const double kRemoveMfRange = 0.29;
556     static const double kAddMfRange = 0.1;
557     if (i == 0) {
558       for (size_t y = 0; y < ysize; ++y) {
559         float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y);
560         float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
561         for (size_t x = 0; x < xsize; x += Lanes(d)) {
562           auto mf = Load(d, row_mf + x);
563           auto hf = Load(d, row_hf + x) - mf;
564           mf = RemoveRangeAroundZero(d, kRemoveMfRange, mf);
565           Store(mf, d, row_mf + x);
566           Store(hf, d, row_hf + x);
567         }
568       }
569     } else {
570       for (size_t y = 0; y < ysize; ++y) {
571         float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y);
572         float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
573         for (size_t x = 0; x < xsize; x += Lanes(d)) {
574           auto mf = Load(d, row_mf + x);
575           auto hf = Load(d, row_hf + x) - mf;
576 
577           mf = AmplifyRangeAroundZero(d, kAddMfRange, mf);
578           Store(mf, d, row_mf + x);
579           Store(hf, d, row_hf + x);
580         }
581       }
582     }
583   }
584 
585   // Temporarily used as output of SuppressXByY
586   ps.uhf[0] = ImageF(xsize, ysize);
587   ps.uhf[1] = ImageF(xsize, ysize);
588 
589   // Suppress red-green by intensity change in the high freq channels.
590   static const double suppress = 46.0;
591   SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]);
592   // hf is the SuppressXByY output, uhf will be written below.
593   ps.hf[0].Swap(ps.uhf[0]);
594 
595   for (int i = 0; i < 2; ++i) {
596     // Divide hf into hf and uhf.
597     for (size_t y = 0; y < ysize; ++y) {
598       float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y);
599       float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
600       for (size_t x = 0; x < xsize; ++x) {
601         row_uhf[x] = row_hf[x];
602       }
603     }
604     Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]);
605     static const double kRemoveHfRange = 1.5;
606     static const double kAddHfRange = 0.132;
607     static const double kRemoveUhfRange = 0.04;
608     static const double kMaxclampHf = 28.4691806922;
609     static const double kMaxclampUhf = 5.19175294647;
610     static double kMulYHf = 2.155;
611     static double kMulYUhf = 2.69313763794;
612     if (i == 0) {
613       for (size_t y = 0; y < ysize; ++y) {
614         float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y);
615         float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
616         for (size_t x = 0; x < xsize; x += Lanes(d)) {
617           auto hf = Load(d, row_hf + x);
618           auto uhf = Load(d, row_uhf + x) - hf;
619           hf = RemoveRangeAroundZero(d, kRemoveHfRange, hf);
620           uhf = RemoveRangeAroundZero(d, kRemoveUhfRange, uhf);
621           Store(hf, d, row_hf + x);
622           Store(uhf, d, row_uhf + x);
623         }
624       }
625     } else {
626       for (size_t y = 0; y < ysize; ++y) {
627         float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y);
628         float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
629         for (size_t x = 0; x < xsize; x += Lanes(d)) {
630           auto hf = Load(d, row_hf + x);
631           hf = MaximumClamp(d, hf, kMaxclampHf);
632 
633           auto uhf = Load(d, row_uhf + x) - hf;
634           uhf = MaximumClamp(d, uhf, kMaxclampUhf);
635           uhf *= Set(d, kMulYUhf);
636           Store(uhf, d, row_uhf + x);
637 
638           hf *= Set(d, kMulYHf);
639           hf = AmplifyRangeAroundZero(d, kAddHfRange, hf);
640           Store(hf, d, row_hf + x);
641         }
642       }
643     }
644   }
645   // Modify range around zero code only concerns the high frequency
646   // planes and only the X and Y channels.
647   // Convert low freq xyb to vals space so that we can do a simple squared sum
648   // diff on the low frequencies later.
649   for (size_t y = 0; y < ysize; ++y) {
650     float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y);
651     float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y);
652     float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y);
653     for (size_t x = 0; x < xsize; x += Lanes(d)) {
654       auto valx = Undefined(d);
655       auto valy = Undefined(d);
656       auto valb = Undefined(d);
657       XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
658                        Load(d, row_b + x), &valx, &valy, &valb);
659       Store(valx, d, row_x + x);
660       Store(valy, d, row_y + x);
661       Store(valb, d, row_b + x);
662     }
663   }
664 }
665 
666 template <class D>
MaltaUnit(MaltaTagLF,const D df,const float * BUTTERAUGLI_RESTRICT d,const intptr_t xs)667 Vec<D> MaltaUnit(MaltaTagLF /*tag*/, const D df,
668                  const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
669   const intptr_t xs3 = 3 * xs;
670 
671   const auto center = LoadU(df, d);
672 
673   // x grows, y constant
674   const auto sum_yconst = LoadU(df, d - 4) + LoadU(df, d - 2) + center +
675                           LoadU(df, d + 2) + LoadU(df, d + 4);
676   // Will return this, sum of all line kernels
677   auto retval = sum_yconst * sum_yconst;
678   {
679     // y grows, x constant
680     auto sum = LoadU(df, d - xs3 - xs) + LoadU(df, d - xs - xs) + center +
681                LoadU(df, d + xs + xs) + LoadU(df, d + xs3 + xs);
682     retval = MulAdd(sum, sum, retval);
683   }
684   {
685     // both grow
686     auto sum = LoadU(df, d - xs3 - 3) + LoadU(df, d - xs - xs - 2) + center +
687                LoadU(df, d + xs + xs + 2) + LoadU(df, d + xs3 + 3);
688     retval = MulAdd(sum, sum, retval);
689   }
690   {
691     // y grows, x shrinks
692     auto sum = LoadU(df, d - xs3 + 3) + LoadU(df, d - xs - xs + 2) + center +
693                LoadU(df, d + xs + xs - 2) + LoadU(df, d + xs3 - 3);
694     retval = MulAdd(sum, sum, retval);
695   }
696   {
697     // y grows -4 to 4, x shrinks 1 -> -1
698     auto sum = LoadU(df, d - xs3 - xs + 1) + LoadU(df, d - xs - xs + 1) +
699                center + LoadU(df, d + xs + xs - 1) +
700                LoadU(df, d + xs3 + xs - 1);
701     retval = MulAdd(sum, sum, retval);
702   }
703   {
704     //  y grows -4 to 4, x grows -1 -> 1
705     auto sum = LoadU(df, d - xs3 - xs - 1) + LoadU(df, d - xs - xs - 1) +
706                center + LoadU(df, d + xs + xs + 1) +
707                LoadU(df, d + xs3 + xs + 1);
708     retval = MulAdd(sum, sum, retval);
709   }
710   {
711     // x grows -4 to 4, y grows -1 to 1
712     auto sum = LoadU(df, d - 4 - xs) + LoadU(df, d - 2 - xs) + center +
713                LoadU(df, d + 2 + xs) + LoadU(df, d + 4 + xs);
714     retval = MulAdd(sum, sum, retval);
715   }
716   {
717     // x grows -4 to 4, y shrinks 1 to -1
718     auto sum = LoadU(df, d - 4 + xs) + LoadU(df, d - 2 + xs) + center +
719                LoadU(df, d + 2 - xs) + LoadU(df, d + 4 - xs);
720     retval = MulAdd(sum, sum, retval);
721   }
722   {
723     /* 0_________
724        1__*______
725        2___*_____
726        3_________
727        4____0____
728        5_________
729        6_____*___
730        7______*__
731        8_________ */
732     auto sum = LoadU(df, d - xs3 - 2) + LoadU(df, d - xs - xs - 1) + center +
733                LoadU(df, d + xs + xs + 1) + LoadU(df, d + xs3 + 2);
734     retval = MulAdd(sum, sum, retval);
735   }
736   {
737     /* 0_________
738        1______*__
739        2_____*___
740        3_________
741        4____0____
742        5_________
743        6___*_____
744        7__*______
745        8_________ */
746     auto sum = LoadU(df, d - xs3 + 2) + LoadU(df, d - xs - xs + 1) + center +
747                LoadU(df, d + xs + xs - 1) + LoadU(df, d + xs3 - 2);
748     retval = MulAdd(sum, sum, retval);
749   }
750   {
751     /* 0_________
752        1_________
753        2_*_______
754        3__*______
755        4____0____
756        5______*__
757        6_______*_
758        7_________
759        8_________ */
760     auto sum = LoadU(df, d - xs - xs - 3) + LoadU(df, d - xs - 2) + center +
761                LoadU(df, d + xs + 2) + LoadU(df, d + xs + xs + 3);
762     retval = MulAdd(sum, sum, retval);
763   }
764   {
765     /* 0_________
766        1_________
767        2_______*_
768        3______*__
769        4____0____
770        5__*______
771        6_*_______
772        7_________
773        8_________ */
774     auto sum = LoadU(df, d - xs - xs + 3) + LoadU(df, d - xs + 2) + center +
775                LoadU(df, d + xs - 2) + LoadU(df, d + xs + xs - 3);
776     retval = MulAdd(sum, sum, retval);
777   }
778   {
779     /* 0_________
780        1_________
781        2________*
782        3______*__
783        4____0____
784        5__*______
785        6*________
786        7_________
787        8_________ */
788 
789     auto sum = LoadU(df, d + xs + xs - 4) + LoadU(df, d + xs - 2) + center +
790                LoadU(df, d - xs + 2) + LoadU(df, d - xs - xs + 4);
791     retval = MulAdd(sum, sum, retval);
792   }
793   {
794     /* 0_________
795        1_________
796        2*________
797        3__*______
798        4____0____
799        5______*__
800        6________*
801        7_________
802        8_________ */
803     auto sum = LoadU(df, d - xs - xs - 4) + LoadU(df, d - xs - 2) + center +
804                LoadU(df, d + xs + 2) + LoadU(df, d + xs + xs + 4);
805     retval = MulAdd(sum, sum, retval);
806   }
807   {
808     /* 0__*______
809        1_________
810        2___*_____
811        3_________
812        4____0____
813        5_________
814        6_____*___
815        7_________
816        8______*__ */
817     auto sum = LoadU(df, d - xs3 - xs - 2) + LoadU(df, d - xs - xs - 1) +
818                center + LoadU(df, d + xs + xs + 1) +
819                LoadU(df, d + xs3 + xs + 2);
820     retval = MulAdd(sum, sum, retval);
821   }
822   {
823     /* 0______*__
824        1_________
825        2_____*___
826        3_________
827        4____0____
828        5_________
829        6___*_____
830        7_________
831        8__*______ */
832     auto sum = LoadU(df, d - xs3 - xs + 2) + LoadU(df, d - xs - xs + 1) +
833                center + LoadU(df, d + xs + xs - 1) +
834                LoadU(df, d + xs3 + xs - 2);
835     retval = MulAdd(sum, sum, retval);
836   }
837   return retval;
838 }
839 
840 template <class D>
MaltaUnit(MaltaTag,const D df,const float * BUTTERAUGLI_RESTRICT d,const intptr_t xs)841 Vec<D> MaltaUnit(MaltaTag /*tag*/, const D df,
842                  const float* BUTTERAUGLI_RESTRICT d, const intptr_t xs) {
843   const intptr_t xs3 = 3 * xs;
844 
845   const auto center = LoadU(df, d);
846 
847   // x grows, y constant
848   const auto sum_yconst = LoadU(df, d - 4) + LoadU(df, d - 3) +
849                           LoadU(df, d - 2) + LoadU(df, d - 1) + center +
850                           LoadU(df, d + 1) + LoadU(df, d + 2) +
851                           LoadU(df, d + 3) + LoadU(df, d + 4);
852   // Will return this, sum of all line kernels
853   auto retval = sum_yconst * sum_yconst;
854 
855   {
856     // y grows, x constant
857     auto sum = LoadU(df, d - xs3 - xs) + LoadU(df, d - xs3) +
858                LoadU(df, d - xs - xs) + LoadU(df, d - xs) + center +
859                LoadU(df, d + xs) + LoadU(df, d + xs + xs) + LoadU(df, d + xs3) +
860                LoadU(df, d + xs3 + xs);
861     retval = MulAdd(sum, sum, retval);
862   }
863   {
864     // both grow
865     auto sum = LoadU(df, d - xs3 - 3) + LoadU(df, d - xs - xs - 2) +
866                LoadU(df, d - xs - 1) + center + LoadU(df, d + xs + 1) +
867                LoadU(df, d + xs + xs + 2) + LoadU(df, d + xs3 + 3);
868     retval = MulAdd(sum, sum, retval);
869   }
870   {
871     // y grows, x shrinks
872     auto sum = LoadU(df, d - xs3 + 3) + LoadU(df, d - xs - xs + 2) +
873                LoadU(df, d - xs + 1) + center + LoadU(df, d + xs - 1) +
874                LoadU(df, d + xs + xs - 2) + LoadU(df, d + xs3 - 3);
875     retval = MulAdd(sum, sum, retval);
876   }
877   {
878     // y grows -4 to 4, x shrinks 1 -> -1
879     auto sum = LoadU(df, d - xs3 - xs + 1) + LoadU(df, d - xs3 + 1) +
880                LoadU(df, d - xs - xs + 1) + LoadU(df, d - xs) + center +
881                LoadU(df, d + xs) + LoadU(df, d + xs + xs - 1) +
882                LoadU(df, d + xs3 - 1) + LoadU(df, d + xs3 + xs - 1);
883     retval = MulAdd(sum, sum, retval);
884   }
885   {
886     //  y grows -4 to 4, x grows -1 -> 1
887     auto sum = LoadU(df, d - xs3 - xs - 1) + LoadU(df, d - xs3 - 1) +
888                LoadU(df, d - xs - xs - 1) + LoadU(df, d - xs) + center +
889                LoadU(df, d + xs) + LoadU(df, d + xs + xs + 1) +
890                LoadU(df, d + xs3 + 1) + LoadU(df, d + xs3 + xs + 1);
891     retval = MulAdd(sum, sum, retval);
892   }
893   {
894     // x grows -4 to 4, y grows -1 to 1
895     auto sum = LoadU(df, d - 4 - xs) + LoadU(df, d - 3 - xs) +
896                LoadU(df, d - 2 - xs) + LoadU(df, d - 1) + center +
897                LoadU(df, d + 1) + LoadU(df, d + 2 + xs) +
898                LoadU(df, d + 3 + xs) + LoadU(df, d + 4 + xs);
899     retval = MulAdd(sum, sum, retval);
900   }
901   {
902     // x grows -4 to 4, y shrinks 1 to -1
903     auto sum = LoadU(df, d - 4 + xs) + LoadU(df, d - 3 + xs) +
904                LoadU(df, d - 2 + xs) + LoadU(df, d - 1) + center +
905                LoadU(df, d + 1) + LoadU(df, d + 2 - xs) +
906                LoadU(df, d + 3 - xs) + LoadU(df, d + 4 - xs);
907     retval = MulAdd(sum, sum, retval);
908   }
909   {
910     /* 0_________
911        1__*______
912        2___*_____
913        3___*_____
914        4____0____
915        5_____*___
916        6_____*___
917        7______*__
918        8_________ */
919     auto sum = LoadU(df, d - xs3 - 2) + LoadU(df, d - xs - xs - 1) +
920                LoadU(df, d - xs - 1) + center + LoadU(df, d + xs + 1) +
921                LoadU(df, d + xs + xs + 1) + LoadU(df, d + xs3 + 2);
922     retval = MulAdd(sum, sum, retval);
923   }
924   {
925     /* 0_________
926        1______*__
927        2_____*___
928        3_____*___
929        4____0____
930        5___*_____
931        6___*_____
932        7__*______
933        8_________ */
934     auto sum = LoadU(df, d - xs3 + 2) + LoadU(df, d - xs - xs + 1) +
935                LoadU(df, d - xs + 1) + center + LoadU(df, d + xs - 1) +
936                LoadU(df, d + xs + xs - 1) + LoadU(df, d + xs3 - 2);
937     retval = MulAdd(sum, sum, retval);
938   }
939   {
940     /* 0_________
941        1_________
942        2_*_______
943        3__**_____
944        4____0____
945        5_____**__
946        6_______*_
947        7_________
948        8_________ */
949     auto sum = LoadU(df, d - xs - xs - 3) + LoadU(df, d - xs - 2) +
950                LoadU(df, d - xs - 1) + center + LoadU(df, d + xs + 1) +
951                LoadU(df, d + xs + 2) + LoadU(df, d + xs + xs + 3);
952     retval = MulAdd(sum, sum, retval);
953   }
954   {
955     /* 0_________
956        1_________
957        2_______*_
958        3_____**__
959        4____0____
960        5__**_____
961        6_*_______
962        7_________
963        8_________ */
964     auto sum = LoadU(df, d - xs - xs + 3) + LoadU(df, d - xs + 2) +
965                LoadU(df, d - xs + 1) + center + LoadU(df, d + xs - 1) +
966                LoadU(df, d + xs - 2) + LoadU(df, d + xs + xs - 3);
967     retval = MulAdd(sum, sum, retval);
968   }
969   {
970     /* 0_________
971        1_________
972        2_________
973        3______***
974        4___*0*___
975        5***______
976        6_________
977        7_________
978        8_________ */
979 
980     auto sum = LoadU(df, d + xs - 4) + LoadU(df, d + xs - 3) +
981                LoadU(df, d + xs - 2) + LoadU(df, d - 1) + center +
982                LoadU(df, d + 1) + LoadU(df, d - xs + 2) +
983                LoadU(df, d - xs + 3) + LoadU(df, d - xs + 4);
984     retval = MulAdd(sum, sum, retval);
985   }
986   {
987     /* 0_________
988        1_________
989        2_________
990        3***______
991        4___*0*___
992        5______***
993        6_________
994        7_________
995        8_________ */
996     auto sum = LoadU(df, d - xs - 4) + LoadU(df, d - xs - 3) +
997                LoadU(df, d - xs - 2) + LoadU(df, d - 1) + center +
998                LoadU(df, d + 1) + LoadU(df, d + xs + 2) +
999                LoadU(df, d + xs + 3) + LoadU(df, d + xs + 4);
1000     retval = MulAdd(sum, sum, retval);
1001   }
1002   {
1003     /* 0___*_____
1004        1___*_____
1005        2___*_____
1006        3____*____
1007        4____0____
1008        5____*____
1009        6_____*___
1010        7_____*___
1011        8_____*___ */
1012     auto sum = LoadU(df, d - xs3 - xs - 1) + LoadU(df, d - xs3 - 1) +
1013                LoadU(df, d - xs - xs - 1) + LoadU(df, d - xs) + center +
1014                LoadU(df, d + xs) + LoadU(df, d + xs + xs + 1) +
1015                LoadU(df, d + xs3 + 1) + LoadU(df, d + xs3 + xs + 1);
1016     retval = MulAdd(sum, sum, retval);
1017   }
1018   {
1019     /* 0_____*___
1020        1_____*___
1021        2____ *___
1022        3____*____
1023        4____0____
1024        5____*____
1025        6___*_____
1026        7___*_____
1027        8___*_____ */
1028     auto sum = LoadU(df, d - xs3 - xs + 1) + LoadU(df, d - xs3 + 1) +
1029                LoadU(df, d - xs - xs + 1) + LoadU(df, d - xs) + center +
1030                LoadU(df, d + xs) + LoadU(df, d + xs + xs - 1) +
1031                LoadU(df, d + xs3 - 1) + LoadU(df, d + xs3 + xs - 1);
1032     retval = MulAdd(sum, sum, retval);
1033   }
1034   return retval;
1035 }
1036 
1037 // Returns MaltaUnit. Avoids bounds-checks when x0 and y0 are known
1038 // to be far enough from the image borders. "diffs" is a packed image.
1039 template <class Tag>
PaddedMaltaUnit(const ImageF & diffs,const size_t x0,const size_t y0)1040 static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
1041                                                 const size_t x0,
1042                                                 const size_t y0) {
1043   const float* BUTTERAUGLI_RESTRICT d = diffs.ConstRow(y0) + x0;
1044   const HWY_CAPPED(float, 1) df;
1045   if ((x0 >= 4 && y0 >= 4 && x0 < (diffs.xsize() - 4) &&
1046        y0 < (diffs.ysize() - 4))) {
1047     return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
1048   }
1049 
1050   PROFILER_ZONE("Padded Malta");
1051   float borderimage[12 * 9];  // round up to 4
1052   for (int dy = 0; dy < 9; ++dy) {
1053     int y = y0 + dy - 4;
1054     if (y < 0 || static_cast<size_t>(y) >= diffs.ysize()) {
1055       for (int dx = 0; dx < 12; ++dx) {
1056         borderimage[dy * 12 + dx] = 0.0f;
1057       }
1058       continue;
1059     }
1060 
1061     const float* row_diffs = diffs.ConstRow(y);
1062     for (int dx = 0; dx < 9; ++dx) {
1063       int x = x0 + dx - 4;
1064       if (x < 0 || static_cast<size_t>(x) >= diffs.xsize()) {
1065         borderimage[dy * 12 + dx] = 0.0f;
1066       } else {
1067         borderimage[dy * 12 + dx] = row_diffs[x];
1068       }
1069     }
1070     std::fill(borderimage + dy * 12 + 9, borderimage + dy * 12 + 12, 0.0f);
1071   }
1072   return GetLane(MaltaUnit(Tag(), df, &borderimage[4 * 12 + 4], 12));
1073 }
1074 
1075 template <class Tag>
MaltaDiffMapT(const Tag tag,const ImageF & lum0,const ImageF & lum1,const double w_0gt1,const double w_0lt1,const double norm1,const double len,const double mulli,ImageF * HWY_RESTRICT diffs,Image3F * HWY_RESTRICT block_diff_ac,size_t c)1076 static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
1077                           const double w_0gt1, const double w_0lt1,
1078                           const double norm1, const double len,
1079                           const double mulli, ImageF* HWY_RESTRICT diffs,
1080                           Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1081   JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
1082   const size_t xsize_ = lum0.xsize();
1083   const size_t ysize_ = lum0.ysize();
1084 
1085   const float kWeight0 = 0.5;
1086   const float kWeight1 = 0.33;
1087 
1088   const double w_pre0gt1 = mulli * std::sqrt(kWeight0 * w_0gt1) / (len * 2 + 1);
1089   const double w_pre0lt1 = mulli * std::sqrt(kWeight1 * w_0lt1) / (len * 2 + 1);
1090   const float norm2_0gt1 = w_pre0gt1 * norm1;
1091   const float norm2_0lt1 = w_pre0lt1 * norm1;
1092 
1093   for (size_t y = 0; y < ysize_; ++y) {
1094     const float* HWY_RESTRICT row0 = lum0.ConstRow(y);
1095     const float* HWY_RESTRICT row1 = lum1.ConstRow(y);
1096     float* HWY_RESTRICT row_diffs = diffs->Row(y);
1097     for (size_t x = 0; x < xsize_; ++x) {
1098       const float absval = 0.5f * (std::abs(row0[x]) + std::abs(row1[x]));
1099       const float diff = row0[x] - row1[x];
1100       const float scaler = norm2_0gt1 / (static_cast<float>(norm1) + absval);
1101 
1102       // Primary symmetric quadratic objective.
1103       row_diffs[x] = scaler * diff;
1104 
1105       const float scaler2 = norm2_0lt1 / (static_cast<float>(norm1) + absval);
1106       const double fabs0 = std::fabs(row0[x]);
1107 
1108       // Secondary half-open quadratic objectives.
1109       const double too_small = 0.55 * fabs0;
1110       const double too_big = 1.05 * fabs0;
1111 
1112       if (row0[x] < 0) {
1113         if (row1[x] > -too_small) {
1114           double impact = scaler2 * (row1[x] + too_small);
1115           if (diff < 0) {
1116             row_diffs[x] -= impact;
1117           } else {
1118             row_diffs[x] += impact;
1119           }
1120         } else if (row1[x] < -too_big) {
1121           double impact = scaler2 * (-row1[x] - too_big);
1122           if (diff < 0) {
1123             row_diffs[x] -= impact;
1124           } else {
1125             row_diffs[x] += impact;
1126           }
1127         }
1128       } else {
1129         if (row1[x] < too_small) {
1130           double impact = scaler2 * (too_small - row1[x]);
1131           if (diff < 0) {
1132             row_diffs[x] -= impact;
1133           } else {
1134             row_diffs[x] += impact;
1135           }
1136         } else if (row1[x] > too_big) {
1137           double impact = scaler2 * (row1[x] - too_big);
1138           if (diff < 0) {
1139             row_diffs[x] -= impact;
1140           } else {
1141             row_diffs[x] += impact;
1142           }
1143         }
1144       }
1145     }
1146   }
1147 
1148   size_t y0 = 0;
1149   // Top
1150   for (; y0 < 4; ++y0) {
1151     float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
1152     for (size_t x0 = 0; x0 < xsize_; ++x0) {
1153       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1154     }
1155   }
1156 
1157   const HWY_FULL(float) df;
1158   const size_t aligned_x = std::max(size_t(4), Lanes(df));
1159   const intptr_t stride = diffs->PixelsPerRow();
1160 
1161   // Middle
1162   for (; y0 < ysize_ - 4; ++y0) {
1163     const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
1164     float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
1165     size_t x0 = 0;
1166     for (; x0 < aligned_x; ++x0) {
1167       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1168     }
1169     for (; x0 + Lanes(df) + 4 <= xsize_; x0 += Lanes(df)) {
1170       auto diff = Load(df, row_diff + x0);
1171       diff += MaltaUnit(Tag(), df, row_in + x0, stride);
1172       Store(diff, df, row_diff + x0);
1173     }
1174 
1175     for (; x0 < xsize_; ++x0) {
1176       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1177     }
1178   }
1179 
1180   // Bottom
1181   for (; y0 < ysize_; ++y0) {
1182     float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
1183     for (size_t x0 = 0; x0 < xsize_; ++x0) {
1184       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
1185     }
1186   }
1187 }
1188 
1189 // Need non-template wrapper functions for HWY_EXPORT.
MaltaDiffMap(const ImageF & lum0,const ImageF & lum1,const double w_0gt1,const double w_0lt1,const double norm1,const double len,const double mulli,ImageF * HWY_RESTRICT diffs,Image3F * HWY_RESTRICT block_diff_ac,size_t c)1190 void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1191                   const double w_0lt1, const double norm1, const double len,
1192                   const double mulli, ImageF* HWY_RESTRICT diffs,
1193                   Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1194   MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
1195                 diffs, block_diff_ac, c);
1196 }
1197 
MaltaDiffMapLF(const ImageF & lum0,const ImageF & lum1,const double w_0gt1,const double w_0lt1,const double norm1,const double len,const double mulli,ImageF * HWY_RESTRICT diffs,Image3F * HWY_RESTRICT block_diff_ac,size_t c)1198 void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1199                     const double w_0lt1, const double norm1, const double len,
1200                     const double mulli, ImageF* HWY_RESTRICT diffs,
1201                     Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1202   MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
1203                 diffs, block_diff_ac, c);
1204 }
1205 
DiffPrecompute(const ImageF & xyb,float mul,float bias_arg,ImageF * out)1206 void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
1207   PROFILER_FUNC;
1208   const size_t xsize = xyb.xsize();
1209   const size_t ysize = xyb.ysize();
1210   const float bias = mul * bias_arg;
1211   const float sqrt_bias = sqrt(bias);
1212   for (size_t y = 0; y < ysize; ++y) {
1213     const float* BUTTERAUGLI_RESTRICT row_in = xyb.Row(y);
1214     float* BUTTERAUGLI_RESTRICT row_out = out->Row(y);
1215     for (size_t x = 0; x < xsize; ++x) {
1216       // kBias makes sqrt behave more linearly.
1217       row_out[x] = sqrt(mul * std::abs(row_in[x]) + bias) - sqrt_bias;
1218     }
1219   }
1220 }
1221 
1222 // std::log(80.0) / std::log(255.0);
1223 constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
1224 static const float kInternalGoodQualityThreshold =
1225     17.1984479671f * kIntensityTargetNormalizationHack;
1226 static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
1227 
StoreMin3(const float v,float & min0,float & min1,float & min2)1228 void StoreMin3(const float v, float& min0, float& min1, float& min2) {
1229   if (v < min2) {
1230     if (v < min0) {
1231       min2 = min1;
1232       min1 = min0;
1233       min0 = v;
1234     } else if (v < min1) {
1235       min2 = min1;
1236       min1 = v;
1237     } else {
1238       min2 = v;
1239     }
1240   }
1241 }
1242 
1243 // Look for smooth areas near the area of degradation.
1244 // If the areas area generally smooth, don't do masking.
FuzzyErosion(const ImageF & from,ImageF * to)1245 void FuzzyErosion(const ImageF& from, ImageF* to) {
1246   const size_t xsize = from.xsize();
1247   const size_t ysize = from.ysize();
1248   for (size_t y = 0; y < ysize; ++y) {
1249     for (size_t x = 0; x < xsize; ++x) {
1250       float min0 = from.Row(y)[x];
1251       float min1 = 2 * min0;
1252       float min2 = min1;
1253       if (x >= 3) {
1254         float v = from.Row(y)[x - 3];
1255         StoreMin3(v, min0, min1, min2);
1256         if (y >= 3) {
1257           float v = from.Row(y - 3)[x - 3];
1258           StoreMin3(v, min0, min1, min2);
1259         }
1260         if (y < ysize - 3) {
1261           float v = from.Row(y + 3)[x - 3];
1262           StoreMin3(v, min0, min1, min2);
1263         }
1264       }
1265       if (x < xsize - 3) {
1266         float v = from.Row(y)[x + 3];
1267         StoreMin3(v, min0, min1, min2);
1268         if (y >= 3) {
1269           float v = from.Row(y - 3)[x + 3];
1270           StoreMin3(v, min0, min1, min2);
1271         }
1272         if (y < ysize - 3) {
1273           float v = from.Row(y + 3)[x + 3];
1274           StoreMin3(v, min0, min1, min2);
1275         }
1276       }
1277       if (y >= 3) {
1278         float v = from.Row(y - 3)[x];
1279         StoreMin3(v, min0, min1, min2);
1280       }
1281       if (y < ysize - 3) {
1282         float v = from.Row(y + 3)[x];
1283         StoreMin3(v, min0, min1, min2);
1284       }
1285       to->Row(y)[x] = (0.45f * min0 + 0.3f * min1 + 0.25f * min2);
1286     }
1287   }
1288 }
1289 
1290 // Compute values of local frequency and dc masking based on the activity
1291 // in the two images. img_diff_ac may be null.
Mask(const ImageF & mask0,const ImageF & mask1,const ButteraugliParams & params,BlurTemp * blur_temp,ImageF * BUTTERAUGLI_RESTRICT mask,ImageF * BUTTERAUGLI_RESTRICT diff_ac)1292 void Mask(const ImageF& mask0, const ImageF& mask1,
1293           const ButteraugliParams& params, BlurTemp* blur_temp,
1294           ImageF* BUTTERAUGLI_RESTRICT mask,
1295           ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
1296   // Only X and Y components are involved in masking. B's influence
1297   // is considered less important in the high frequency area, and we
1298   // don't model masking from lower frequency signals.
1299   PROFILER_FUNC;
1300   const size_t xsize = mask0.xsize();
1301   const size_t ysize = mask0.ysize();
1302   *mask = ImageF(xsize, ysize);
1303   static const float kMul = 6.19424080439;
1304   static const float kBias = 12.61050594197;
1305   static const float kRadius = 2.7;
1306   ImageF diff0(xsize, ysize);
1307   ImageF diff1(xsize, ysize);
1308   ImageF blurred0(xsize, ysize);
1309   ImageF blurred1(xsize, ysize);
1310   DiffPrecompute(mask0, kMul, kBias, &diff0);
1311   DiffPrecompute(mask1, kMul, kBias, &diff1);
1312   Blur(diff0, kRadius, params, blur_temp, &blurred0);
1313   FuzzyErosion(blurred0, &diff0);
1314   Blur(diff1, kRadius, params, blur_temp, &blurred1);
1315   FuzzyErosion(blurred1, &diff1);
1316   for (size_t y = 0; y < ysize; ++y) {
1317     for (size_t x = 0; x < xsize; ++x) {
1318       mask->Row(y)[x] = diff1.Row(y)[x];
1319       if (diff_ac != nullptr) {
1320         static const float kMaskToErrorMul = 10.0;
1321         float diff = blurred0.Row(y)[x] - blurred1.Row(y)[x];
1322         diff_ac->Row(y)[x] += kMaskToErrorMul * diff * diff;
1323       }
1324     }
1325   }
1326 }
1327 
1328 // `diff_ac` may be null.
MaskPsychoImage(const PsychoImage & pi0,const PsychoImage & pi1,const size_t xsize,const size_t ysize,const ButteraugliParams & params,Image3F * temp,BlurTemp * blur_temp,ImageF * BUTTERAUGLI_RESTRICT mask,ImageF * BUTTERAUGLI_RESTRICT diff_ac)1329 void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
1330                      const size_t xsize, const size_t ysize,
1331                      const ButteraugliParams& params, Image3F* temp,
1332                      BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask,
1333                      ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
1334   ImageF mask0(xsize, ysize);
1335   ImageF mask1(xsize, ysize);
1336   static const float muls[3] = {
1337       8.75000241361f,
1338       0.620978104816f,
1339       0.307585098253f,
1340   };
1341   // Silly and unoptimized approach here. TODO(jyrki): rework this.
1342   for (size_t y = 0; y < ysize; ++y) {
1343     const float* BUTTERAUGLI_RESTRICT row_y_hf0 = pi0.hf[1].Row(y);
1344     const float* BUTTERAUGLI_RESTRICT row_y_hf1 = pi1.hf[1].Row(y);
1345     const float* BUTTERAUGLI_RESTRICT row_y_uhf0 = pi0.uhf[1].Row(y);
1346     const float* BUTTERAUGLI_RESTRICT row_y_uhf1 = pi1.uhf[1].Row(y);
1347     const float* BUTTERAUGLI_RESTRICT row_x_hf0 = pi0.hf[0].Row(y);
1348     const float* BUTTERAUGLI_RESTRICT row_x_hf1 = pi1.hf[0].Row(y);
1349     const float* BUTTERAUGLI_RESTRICT row_x_uhf0 = pi0.uhf[0].Row(y);
1350     const float* BUTTERAUGLI_RESTRICT row_x_uhf1 = pi1.uhf[0].Row(y);
1351     float* BUTTERAUGLI_RESTRICT row0 = mask0.Row(y);
1352     float* BUTTERAUGLI_RESTRICT row1 = mask1.Row(y);
1353     for (size_t x = 0; x < xsize; ++x) {
1354       float xdiff0 = (row_x_uhf0[x] + row_x_hf0[x]) * muls[0];
1355       float xdiff1 = (row_x_uhf1[x] + row_x_hf1[x]) * muls[0];
1356       float ydiff0 = row_y_uhf0[x] * muls[1] + row_y_hf0[x] * muls[2];
1357       float ydiff1 = row_y_uhf1[x] * muls[1] + row_y_hf1[x] * muls[2];
1358       row0[x] = xdiff0 * xdiff0 + ydiff0 * ydiff0;
1359       row0[x] = sqrt(row0[x]);
1360       row1[x] = xdiff1 * xdiff1 + ydiff1 * ydiff1;
1361       row1[x] = sqrt(row1[x]);
1362     }
1363   }
1364   Mask(mask0, mask1, params, blur_temp, mask, diff_ac);
1365 }
1366 
MaskY(double delta)1367 double MaskY(double delta) {
1368   static const double offset = 0.829591754942;
1369   static const double scaler = 0.451936922203;
1370   static const double mul = 2.5485944793;
1371   const double c = mul / ((scaler * delta) + offset);
1372   const double retval = kGlobalScale * (1.0 + c);
1373   return retval * retval;
1374 }
1375 
MaskDcY(double delta)1376 double MaskDcY(double delta) {
1377   static const double offset = 0.20025578522;
1378   static const double scaler = 3.87449418804;
1379   static const double mul = 0.505054525019;
1380   const double c = mul / ((scaler * delta) + offset);
1381   const double retval = kGlobalScale * (1.0 + c);
1382   return retval * retval;
1383 }
1384 
MaskColor(const float color[3],const float mask)1385 inline float MaskColor(const float color[3], const float mask) {
1386   return color[0] * mask + color[1] * mask + color[2] * mask;
1387 }
1388 
1389 // Diffmap := sqrt of sum{diff images by multplied by X and Y/B masks}
CombineChannelsToDiffmap(const ImageF & mask,const Image3F & block_diff_dc,const Image3F & block_diff_ac,float xmul,ImageF * result)1390 void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
1391                               const Image3F& block_diff_ac, float xmul,
1392                               ImageF* result) {
1393   PROFILER_FUNC;
1394   JXL_CHECK(SameSize(mask, *result));
1395   size_t xsize = mask.xsize();
1396   size_t ysize = mask.ysize();
1397   for (size_t y = 0; y < ysize; ++y) {
1398     float* BUTTERAUGLI_RESTRICT row_out = result->Row(y);
1399     for (size_t x = 0; x < xsize; ++x) {
1400       float val = mask.Row(y)[x];
1401       float maskval = MaskY(val);
1402       float dc_maskval = MaskDcY(val);
1403       float diff_dc[3];
1404       float diff_ac[3];
1405       for (int i = 0; i < 3; ++i) {
1406         diff_dc[i] = block_diff_dc.PlaneRow(i, y)[x];
1407         diff_ac[i] = block_diff_ac.PlaneRow(i, y)[x];
1408       }
1409       diff_ac[0] *= xmul;
1410       diff_dc[0] *= xmul;
1411       row_out[x] =
1412           sqrt(MaskColor(diff_dc, dc_maskval) + MaskColor(diff_ac, maskval));
1413     }
1414   }
1415 }
1416 
1417 // Adds weighted L2 difference between i0 and i1 to diffmap.
L2Diff(const ImageF & i0,const ImageF & i1,const float w,Image3F * BUTTERAUGLI_RESTRICT diffmap,size_t c)1418 static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
1419                    Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
1420   if (w == 0) return;
1421 
1422   const HWY_FULL(float) d;
1423   const auto weight = Set(d, w);
1424 
1425   for (size_t y = 0; y < i0.ysize(); ++y) {
1426     const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
1427     const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
1428     float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
1429 
1430     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
1431       const auto diff = Load(d, row0 + x) - Load(d, row1 + x);
1432       const auto diff2 = diff * diff;
1433       const auto prev = Load(d, row_diff + x);
1434       Store(MulAdd(diff2, weight, prev), d, row_diff + x);
1435     }
1436   }
1437 }
1438 
1439 // Initializes diffmap to the weighted L2 difference between i0 and i1.
SetL2Diff(const ImageF & i0,const ImageF & i1,const float w,Image3F * BUTTERAUGLI_RESTRICT diffmap,size_t c)1440 static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
1441                       Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
1442   if (w == 0) return;
1443 
1444   const HWY_FULL(float) d;
1445   const auto weight = Set(d, w);
1446 
1447   for (size_t y = 0; y < i0.ysize(); ++y) {
1448     const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
1449     const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
1450     float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
1451 
1452     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
1453       const auto diff = Load(d, row0 + x) - Load(d, row1 + x);
1454       const auto diff2 = diff * diff;
1455       Store(diff2 * weight, d, row_diff + x);
1456     }
1457   }
1458 }
1459 
1460 // i0 is the original image.
1461 // i1 is the deformed copy.
L2DiffAsymmetric(const ImageF & i0,const ImageF & i1,float w_0gt1,float w_0lt1,Image3F * BUTTERAUGLI_RESTRICT diffmap,size_t c)1462 static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
1463                              float w_0lt1,
1464                              Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
1465   if (w_0gt1 == 0 && w_0lt1 == 0) {
1466     return;
1467   }
1468 
1469   const HWY_FULL(float) d;
1470   const auto vw_0gt1 = Set(d, w_0gt1 * 0.8);
1471   const auto vw_0lt1 = Set(d, w_0lt1 * 0.8);
1472 
1473   for (size_t y = 0; y < i0.ysize(); ++y) {
1474     const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y);
1475     const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y);
1476     float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
1477 
1478     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
1479       const auto val0 = Load(d, row0 + x);
1480       const auto val1 = Load(d, row1 + x);
1481 
1482       // Primary symmetric quadratic objective.
1483       const auto diff = val0 - val1;
1484       auto total = MulAdd(diff * diff, vw_0gt1, Load(d, row_diff + x));
1485 
1486       // Secondary half-open quadratic objectives.
1487       const auto fabs0 = Abs(val0);
1488       const auto too_small = Set(d, 0.4) * fabs0;
1489       const auto too_big = fabs0;
1490 
1491       const auto if_neg =
1492           IfThenElse(val1 > Neg(too_small), val1 + too_small,
1493                      IfThenElseZero(val1 < Neg(too_big), Neg(val1) - too_big));
1494       const auto if_pos =
1495           IfThenElse(val1 < too_small, too_small - val1,
1496                      IfThenElseZero(val1 > too_big, val1 - too_big));
1497       const auto v = IfThenElse(val0 < Zero(d), if_neg, if_pos);
1498       total += vw_0lt1 * v * v;
1499       Store(total, d, row_diff + x);
1500     }
1501   }
1502 }
1503 
1504 // A simple HDR compatible gamma function.
1505 template <class DF, class V>
Gamma(const DF df,V v)1506 V Gamma(const DF df, V v) {
1507   // ln(2) constant folded in because we want std::log but have FastLog2f.
1508   const auto kRetMul = Set(df, 19.245013259874995f * 0.693147180559945f);
1509   const auto kRetAdd = Set(df, -23.16046239805755);
1510   // This should happen rarely, but may lead to a NaN in log, which is
1511   // undesirable. Since negative photons don't exist we solve the NaNs by
1512   // clamping here.
1513   v = ZeroIfNegative(v);
1514 
1515   const auto biased = v + Set(df, 9.9710635769299145);
1516   const auto log = FastLog2f(df, biased);
1517   // We could fold this into a custom Log2 polynomial, but there would be
1518   // relatively little gain.
1519   return MulAdd(kRetMul, log, kRetAdd);
1520 }
1521 
1522 template <bool Clamp, class DF, class V>
OpsinAbsorbance(const DF df,const V & in0,const V & in1,const V & in2,V * JXL_RESTRICT out0,V * JXL_RESTRICT out1,V * JXL_RESTRICT out2)1523 BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
1524                                         const V& in2, V* JXL_RESTRICT out0,
1525                                         V* JXL_RESTRICT out1,
1526                                         V* JXL_RESTRICT out2) {
1527   // https://en.wikipedia.org/wiki/Photopsin absorbance modeling.
1528   static const double mixi0 = 0.29956550340058319;
1529   static const double mixi1 = 0.63373087833825936;
1530   static const double mixi2 = 0.077705617820981968;
1531   static const double mixi3 = 1.7557483643287353;
1532   static const double mixi4 = 0.22158691104574774;
1533   static const double mixi5 = 0.69391388044116142;
1534   static const double mixi6 = 0.0987313588422;
1535   static const double mixi7 = 1.7557483643287353;
1536   static const double mixi8 = 0.02;
1537   static const double mixi9 = 0.02;
1538   static const double mixi10 = 0.20480129041026129;
1539   static const double mixi11 = 12.226454707163354;
1540 
1541   const V mix0 = Set(df, mixi0);
1542   const V mix1 = Set(df, mixi1);
1543   const V mix2 = Set(df, mixi2);
1544   const V mix3 = Set(df, mixi3);
1545   const V mix4 = Set(df, mixi4);
1546   const V mix5 = Set(df, mixi5);
1547   const V mix6 = Set(df, mixi6);
1548   const V mix7 = Set(df, mixi7);
1549   const V mix8 = Set(df, mixi8);
1550   const V mix9 = Set(df, mixi9);
1551   const V mix10 = Set(df, mixi10);
1552   const V mix11 = Set(df, mixi11);
1553 
1554   *out0 = mix0 * in0 + mix1 * in1 + mix2 * in2 + mix3;
1555   *out1 = mix4 * in0 + mix5 * in1 + mix6 * in2 + mix7;
1556   *out2 = mix8 * in0 + mix9 * in1 + mix10 * in2 + mix11;
1557 
1558   if (Clamp) {
1559     *out0 = Max(*out0, mix3);
1560     *out1 = Max(*out1, mix7);
1561     *out2 = Max(*out2, mix11);
1562   }
1563 }
1564 
1565 // `blurred` is a temporary image used inside this function and not returned.
OpsinDynamicsImage(const Image3F & rgb,const ButteraugliParams & params,Image3F * blurred,BlurTemp * blur_temp)1566 Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
1567                            Image3F* blurred, BlurTemp* blur_temp) {
1568   PROFILER_FUNC;
1569   Image3F xyb(rgb.xsize(), rgb.ysize());
1570   const double kSigma = 1.2;
1571   Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0));
1572   Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1));
1573   Blur(rgb.Plane(2), kSigma, params, blur_temp, &blurred->Plane(2));
1574   const HWY_FULL(float) df;
1575   const auto intensity_target_multiplier = Set(df, params.intensity_target);
1576   for (size_t y = 0; y < rgb.ysize(); ++y) {
1577     const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y);
1578     const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y);
1579     const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y);
1580     const float* BUTTERAUGLI_RESTRICT row_blurred_r =
1581         blurred->ConstPlaneRow(0, y);
1582     const float* BUTTERAUGLI_RESTRICT row_blurred_g =
1583         blurred->ConstPlaneRow(1, y);
1584     const float* BUTTERAUGLI_RESTRICT row_blurred_b =
1585         blurred->ConstPlaneRow(2, y);
1586     float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y);
1587     float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y);
1588     float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y);
1589     const auto min = Set(df, 1e-4f);
1590     for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
1591       auto sensitivity0 = Undefined(df);
1592       auto sensitivity1 = Undefined(df);
1593       auto sensitivity2 = Undefined(df);
1594       {
1595         // Calculate sensitivity based on the smoothed image gamma derivative.
1596         auto pre_mixed0 = Undefined(df);
1597         auto pre_mixed1 = Undefined(df);
1598         auto pre_mixed2 = Undefined(df);
1599         OpsinAbsorbance<true>(
1600             df, Load(df, row_blurred_r + x) * intensity_target_multiplier,
1601             Load(df, row_blurred_g + x) * intensity_target_multiplier,
1602             Load(df, row_blurred_b + x) * intensity_target_multiplier,
1603             &pre_mixed0, &pre_mixed1, &pre_mixed2);
1604         pre_mixed0 = Max(pre_mixed0, min);
1605         pre_mixed1 = Max(pre_mixed1, min);
1606         pre_mixed2 = Max(pre_mixed2, min);
1607         sensitivity0 = Gamma(df, pre_mixed0) / pre_mixed0;
1608         sensitivity1 = Gamma(df, pre_mixed1) / pre_mixed1;
1609         sensitivity2 = Gamma(df, pre_mixed2) / pre_mixed2;
1610         sensitivity0 = Max(sensitivity0, min);
1611         sensitivity1 = Max(sensitivity1, min);
1612         sensitivity2 = Max(sensitivity2, min);
1613       }
1614       auto cur_mixed0 = Undefined(df);
1615       auto cur_mixed1 = Undefined(df);
1616       auto cur_mixed2 = Undefined(df);
1617       OpsinAbsorbance<false>(df,
1618                              Load(df, row_r + x) * intensity_target_multiplier,
1619                              Load(df, row_g + x) * intensity_target_multiplier,
1620                              Load(df, row_b + x) * intensity_target_multiplier,
1621                              &cur_mixed0, &cur_mixed1, &cur_mixed2);
1622       cur_mixed0 *= sensitivity0;
1623       cur_mixed1 *= sensitivity1;
1624       cur_mixed2 *= sensitivity2;
1625       // This is a kludge. The negative values should be zeroed away before
1626       // blurring. Ideally there would be no negative values in the first place.
1627       const auto min01 = Set(df, 1.7557483643287353f);
1628       const auto min2 = Set(df, 12.226454707163354f);
1629       cur_mixed0 = Max(cur_mixed0, min01);
1630       cur_mixed1 = Max(cur_mixed1, min01);
1631       cur_mixed2 = Max(cur_mixed2, min2);
1632 
1633       Store(cur_mixed0 - cur_mixed1, df, row_out_x + x);
1634       Store(cur_mixed0 + cur_mixed1, df, row_out_y + x);
1635       Store(cur_mixed2, df, row_out_b + x);
1636     }
1637   }
1638   return xyb;
1639 }
1640 
1641 // NOLINTNEXTLINE(google-readability-namespace-comments)
1642 }  // namespace HWY_NAMESPACE
1643 }  // namespace jxl
1644 HWY_AFTER_NAMESPACE();
1645 
1646 #if HWY_ONCE
1647 namespace jxl {
1648 
1649 HWY_EXPORT(SeparateFrequencies);       // Local function.
1650 HWY_EXPORT(MaskPsychoImage);           // Local function.
1651 HWY_EXPORT(L2DiffAsymmetric);          // Local function.
1652 HWY_EXPORT(L2Diff);                    // Local function.
1653 HWY_EXPORT(SetL2Diff);                 // Local function.
1654 HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
1655 HWY_EXPORT(MaltaDiffMap);              // Local function.
1656 HWY_EXPORT(MaltaDiffMapLF);            // Local function.
1657 HWY_EXPORT(OpsinDynamicsImage);        // Local function.
1658 
1659 #if BUTTERAUGLI_ENABLE_CHECKS
1660 
IsNan(const float x)1661 static inline bool IsNan(const float x) {
1662   uint32_t bits;
1663   memcpy(&bits, &x, sizeof(bits));
1664   const uint32_t bitmask_exp = 0x7F800000;
1665   return (bits & bitmask_exp) == bitmask_exp && (bits & 0x7FFFFF);
1666 }
1667 
IsNan(const double x)1668 static inline bool IsNan(const double x) {
1669   uint64_t bits;
1670   memcpy(&bits, &x, sizeof(bits));
1671   return (0x7ff0000000000001ULL <= bits && bits <= 0x7fffffffffffffffULL) ||
1672          (0xfff0000000000001ULL <= bits && bits <= 0xffffffffffffffffULL);
1673 }
1674 
CheckImage(const ImageF & image,const char * name)1675 static inline void CheckImage(const ImageF& image, const char* name) {
1676   PROFILER_FUNC;
1677   for (size_t y = 0; y < image.ysize(); ++y) {
1678     const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
1679     for (size_t x = 0; x < image.xsize(); ++x) {
1680       if (IsNan(row[x])) {
1681         printf("NAN: Image %s @ %zu,%zu (of %zu,%zu)\n", name, x, y,
1682                image.xsize(), image.ysize());
1683         exit(1);
1684       }
1685     }
1686   }
1687 }
1688 
1689 #define CHECK_NAN(x, str)                \
1690   do {                                   \
1691     if (IsNan(x)) {                      \
1692       printf("%d: %s\n", __LINE__, str); \
1693       abort();                           \
1694     }                                    \
1695   } while (0)
1696 
1697 #define CHECK_IMAGE(image, name) CheckImage(image, name)
1698 
1699 #else  // BUTTERAUGLI_ENABLE_CHECKS
1700 
1701 #define CHECK_NAN(x, str)
1702 #define CHECK_IMAGE(image, name)
1703 
1704 #endif  // BUTTERAUGLI_ENABLE_CHECKS
1705 
1706 // Calculate a 2x2 subsampled image for purposes of recursive butteraugli at
1707 // multiresolution.
SubSample2x(const Image3F & in)1708 static Image3F SubSample2x(const Image3F& in) {
1709   size_t xs = (in.xsize() + 1) / 2;
1710   size_t ys = (in.ysize() + 1) / 2;
1711   Image3F retval(xs, ys);
1712   for (size_t c = 0; c < 3; ++c) {
1713     for (size_t y = 0; y < ys; ++y) {
1714       for (size_t x = 0; x < xs; ++x) {
1715         retval.PlaneRow(c, y)[x] = 0;
1716       }
1717     }
1718   }
1719   for (size_t c = 0; c < 3; ++c) {
1720     for (size_t y = 0; y < in.ysize(); ++y) {
1721       for (size_t x = 0; x < in.xsize(); ++x) {
1722         retval.PlaneRow(c, y / 2)[x / 2] += 0.25f * in.PlaneRow(c, y)[x];
1723       }
1724     }
1725     if ((in.xsize() & 1) != 0) {
1726       for (size_t y = 0; y < retval.ysize(); ++y) {
1727         size_t last_column = retval.xsize() - 1;
1728         retval.PlaneRow(c, y)[last_column] *= 2.0f;
1729       }
1730     }
1731     if ((in.ysize() & 1) != 0) {
1732       for (size_t x = 0; x < retval.xsize(); ++x) {
1733         size_t last_row = retval.ysize() - 1;
1734         retval.PlaneRow(c, last_row)[x] *= 2.0f;
1735       }
1736     }
1737   }
1738   return retval;
1739 }
1740 
1741 // Supersample src by 2x and add it to dest.
AddSupersampled2x(const ImageF & src,float w,ImageF & dest)1742 static void AddSupersampled2x(const ImageF& src, float w, ImageF& dest) {
1743   for (size_t y = 0; y < dest.ysize(); ++y) {
1744     for (size_t x = 0; x < dest.xsize(); ++x) {
1745       // There will be less errors from the more averaged images.
1746       // We take it into account to some extent using a scaler.
1747       static const double kHeuristicMixingValue = 0.3;
1748       dest.Row(y)[x] *= 1.0 - kHeuristicMixingValue * w;
1749       dest.Row(y)[x] += w * src.Row(y / 2)[x / 2];
1750     }
1751   }
1752 }
1753 
Temp() const1754 Image3F* ButteraugliComparator::Temp() const {
1755   bool was_in_use = temp_in_use_.test_and_set(std::memory_order_acq_rel);
1756   JXL_ASSERT(!was_in_use);
1757   (void)was_in_use;
1758   return &temp_;
1759 }
1760 
ReleaseTemp() const1761 void ButteraugliComparator::ReleaseTemp() const { temp_in_use_.clear(); }
1762 
ButteraugliComparator(const Image3F & rgb0,const ButteraugliParams & params)1763 ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0,
1764                                              const ButteraugliParams& params)
1765     : xsize_(rgb0.xsize()),
1766       ysize_(rgb0.ysize()),
1767       params_(params),
1768       temp_(xsize_, ysize_) {
1769   if (xsize_ < 8 || ysize_ < 8) {
1770     return;
1771   }
1772 
1773   Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(),
1774                                                           &blur_temp_);
1775   ReleaseTemp();
1776   HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
1777   (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_);
1778 
1779   // Awful recursive construction of samples of different resolution.
1780   // This is an after-thought and possibly somewhat parallel in
1781   // functionality with the PsychoImage multi-resolution approach.
1782   sub_.reset(new ButteraugliComparator(SubSample2x(rgb0), params));
1783 }
1784 
Mask(ImageF * BUTTERAUGLI_RESTRICT mask) const1785 void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
1786   HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
1787   (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr);
1788   ReleaseTemp();
1789 }
1790 
Diffmap(const Image3F & rgb1,ImageF & result) const1791 void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const {
1792   PROFILER_FUNC;
1793   if (xsize_ < 8 || ysize_ < 8) {
1794     ZeroFillImage(&result);
1795     return;
1796   }
1797   const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
1798       rgb1, params_, Temp(), &blur_temp_);
1799   ReleaseTemp();
1800   DiffmapOpsinDynamicsImage(xyb1, result);
1801   if (sub_) {
1802     if (sub_->xsize_ < 8 || sub_->ysize_ < 8) {
1803       return;
1804     }
1805     const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
1806         SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_);
1807     sub_->ReleaseTemp();
1808     ImageF subresult;
1809     sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult);
1810     AddSupersampled2x(subresult, 0.5, result);
1811   }
1812 }
1813 
DiffmapOpsinDynamicsImage(const Image3F & xyb1,ImageF & result) const1814 void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
1815                                                       ImageF& result) const {
1816   PROFILER_FUNC;
1817   if (xsize_ < 8 || ysize_ < 8) {
1818     ZeroFillImage(&result);
1819     return;
1820   }
1821   PsychoImage pi1;
1822   HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
1823   (xsize_, ysize_, params_, &blur_temp_, xyb1, pi1);
1824   result = ImageF(xsize_, ysize_);
1825   DiffmapPsychoImage(pi1, result);
1826 }
1827 
1828 namespace {
1829 
MaltaDiffMap(const ImageF & lum0,const ImageF & lum1,const double w_0gt1,const double w_0lt1,const double norm1,ImageF * HWY_RESTRICT diffs,Image3F * HWY_RESTRICT block_diff_ac,size_t c)1830 void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1831                   const double w_0lt1, const double norm1,
1832                   ImageF* HWY_RESTRICT diffs,
1833                   Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1834   PROFILER_FUNC;
1835   const double len = 3.75;
1836   static const double mulli = 0.39905817637;
1837   HWY_DYNAMIC_DISPATCH(MaltaDiffMap)
1838   (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
1839 }
1840 
MaltaDiffMapLF(const ImageF & lum0,const ImageF & lum1,const double w_0gt1,const double w_0lt1,const double norm1,ImageF * HWY_RESTRICT diffs,Image3F * HWY_RESTRICT block_diff_ac,size_t c)1841 void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
1842                     const double w_0lt1, const double norm1,
1843                     ImageF* HWY_RESTRICT diffs,
1844                     Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
1845   PROFILER_FUNC;
1846   const double len = 3.75;
1847   static const double mulli = 0.611612573796;
1848   HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)
1849   (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
1850 }
1851 
1852 }  // namespace
1853 
DiffmapPsychoImage(const PsychoImage & pi1,ImageF & diffmap) const1854 void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
1855                                                ImageF& diffmap) const {
1856   PROFILER_FUNC;
1857   if (xsize_ < 8 || ysize_ < 8) {
1858     ZeroFillImage(&diffmap);
1859     return;
1860   }
1861 
1862   const float hf_asymmetry_ = params_.hf_asymmetry;
1863   const float xmul_ = params_.xmul;
1864 
1865   ImageF diffs(xsize_, ysize_);
1866   Image3F block_diff_ac(xsize_, ysize_);
1867   ZeroFillImage(&block_diff_ac);
1868   static const double wUhfMalta = 1.10039032555;
1869   static const double norm1Uhf = 71.7800275169;
1870   MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
1871                wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1);
1872 
1873   static const double wUhfMaltaX = 173.5;
1874   static const double norm1UhfX = 5.0;
1875   MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
1876                wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac,
1877                0);
1878 
1879   static const double wHfMalta = 18.7237414387;
1880   static const double norm1Hf = 4498534.45232;
1881   MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
1882                  wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs,
1883                  &block_diff_ac, 1);
1884 
1885   static const double wHfMaltaX = 6923.99476109;
1886   static const double norm1HfX = 8051.15833247;
1887   MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_),
1888                  wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs,
1889                  &block_diff_ac, 0);
1890 
1891   static const double wMfMalta = 37.0819870399;
1892   static const double norm1Mf = 130262059.556;
1893   MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf,
1894                  &diffs, &block_diff_ac, 1);
1895 
1896   static const double wMfMaltaX = 8246.75321353;
1897   static const double norm1MfX = 1009002.70582;
1898   MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX,
1899                  norm1MfX, &diffs, &block_diff_ac, 0);
1900 
1901   static const double wmul[9] = {
1902       400.0,         1.50815703118,  0,
1903       2150.0,        10.6195433239,  16.2176043152,
1904       29.2353797994, 0.844626970982, 0.703646627719,
1905   };
1906   Image3F block_diff_dc(xsize_, ysize_);
1907   for (size_t c = 0; c < 3; ++c) {
1908     if (c < 2) {  // No blue channel error accumulated at HF.
1909       HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
1910       (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
1911        &block_diff_ac, c);
1912     }
1913     HWY_DYNAMIC_DISPATCH(L2Diff)
1914     (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c);
1915     HWY_DYNAMIC_DISPATCH(SetL2Diff)
1916     (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c);
1917   }
1918 
1919   ImageF mask;
1920   HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
1921   (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask,
1922    &block_diff_ac.Plane(1));
1923   ReleaseTemp();
1924 
1925   HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)
1926   (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap);
1927 }
1928 
ButteraugliScoreFromDiffmap(const ImageF & diffmap,const ButteraugliParams * params)1929 double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
1930                                    const ButteraugliParams* params) {
1931   PROFILER_FUNC;
1932   // In approximate-border mode, skip pixels on the border likely to be affected
1933   // by FastGauss' zero-valued-boundary behavior. The border is about half of
1934   // the largest-diameter kernel (37x37 pixels), but only if the image is big.
1935   size_t border = (params != nullptr && params->approximate_border) ? 8 : 0;
1936   if (diffmap.xsize() <= 2 * border || diffmap.ysize() <= 2 * border) {
1937     border = 0;
1938   }
1939   float retval = 0.0f;
1940   for (size_t y = border; y < diffmap.ysize() - border; ++y) {
1941     const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y);
1942     for (size_t x = border; x < diffmap.xsize() - border; ++x) {
1943       retval = std::max(retval, row[x]);
1944     }
1945   }
1946   return retval;
1947 }
1948 
ButteraugliDiffmap(const Image3F & rgb0,const Image3F & rgb1,double hf_asymmetry,double xmul,ImageF & diffmap)1949 bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
1950                         double hf_asymmetry, double xmul, ImageF& diffmap) {
1951   ButteraugliParams params;
1952   params.hf_asymmetry = hf_asymmetry;
1953   params.xmul = xmul;
1954   return ButteraugliDiffmap(rgb0, rgb1, params, diffmap);
1955 }
1956 
ButteraugliDiffmap(const Image3F & rgb0,const Image3F & rgb1,const ButteraugliParams & params,ImageF & diffmap)1957 bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
1958                         const ButteraugliParams& params, ImageF& diffmap) {
1959   PROFILER_FUNC;
1960   const size_t xsize = rgb0.xsize();
1961   const size_t ysize = rgb0.ysize();
1962   if (xsize < 1 || ysize < 1) {
1963     return JXL_FAILURE("Zero-sized image");
1964   }
1965   if (!SameSize(rgb0, rgb1)) {
1966     return JXL_FAILURE("Size mismatch");
1967   }
1968   static const int kMax = 8;
1969   if (xsize < kMax || ysize < kMax) {
1970     // Butteraugli values for small (where xsize or ysize is smaller
1971     // than 8 pixels) images are non-sensical, but most likely it is
1972     // less disruptive to try to compute something than just give up.
1973     // Temporarily extend the borders of the image to fit 8 x 8 size.
1974     size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
1975     size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
1976     size_t xscaled = std::max<size_t>(kMax, xsize);
1977     size_t yscaled = std::max<size_t>(kMax, ysize);
1978     Image3F scaled0(xscaled, yscaled);
1979     Image3F scaled1(xscaled, yscaled);
1980     for (int i = 0; i < 3; ++i) {
1981       for (size_t y = 0; y < yscaled; ++y) {
1982         for (size_t x = 0; x < xscaled; ++x) {
1983           size_t x2 =
1984               std::min<size_t>(xsize - 1, std::max<size_t>(0, x - xborder));
1985           size_t y2 =
1986               std::min<size_t>(ysize - 1, std::max<size_t>(0, y - yborder));
1987           scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
1988           scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
1989         }
1990       }
1991     }
1992     ImageF diffmap_scaled;
1993     const bool ok =
1994         ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
1995     diffmap = ImageF(xsize, ysize);
1996     for (size_t y = 0; y < ysize; ++y) {
1997       for (size_t x = 0; x < xsize; ++x) {
1998         diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
1999       }
2000     }
2001     return ok;
2002   }
2003   ButteraugliComparator butteraugli(rgb0, params);
2004   butteraugli.Diffmap(rgb1, diffmap);
2005   return true;
2006 }
2007 
ButteraugliInterface(const Image3F & rgb0,const Image3F & rgb1,float hf_asymmetry,float xmul,ImageF & diffmap,double & diffvalue)2008 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
2009                           float hf_asymmetry, float xmul, ImageF& diffmap,
2010                           double& diffvalue) {
2011   ButteraugliParams params;
2012   params.hf_asymmetry = hf_asymmetry;
2013   params.xmul = xmul;
2014   return ButteraugliInterface(rgb0, rgb1, params, diffmap, diffvalue);
2015 }
2016 
ButteraugliInterface(const Image3F & rgb0,const Image3F & rgb1,const ButteraugliParams & params,ImageF & diffmap,double & diffvalue)2017 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
2018                           const ButteraugliParams& params, ImageF& diffmap,
2019                           double& diffvalue) {
2020 #if PROFILER_ENABLED
2021   double t0 = Now();
2022 #endif
2023   if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
2024     return false;
2025   }
2026 #if PROFILER_ENABLED
2027   double t1 = Now();
2028   const size_t mp = rgb0.xsize() * rgb0.ysize();
2029   printf("diff MP/s %f\n", mp / (t1 - t0) * 1E-6);
2030 #endif
2031   diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
2032   return true;
2033 }
2034 
ButteraugliFuzzyClass(double score)2035 double ButteraugliFuzzyClass(double score) {
2036   static const double fuzzy_width_up = 4.8;
2037   static const double fuzzy_width_down = 4.8;
2038   static const double m0 = 2.0;
2039   static const double scaler = 0.7777;
2040   double val;
2041   if (score < 1.0) {
2042     // val in [scaler .. 2.0]
2043     val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_down));
2044     val -= 1.0;           // from [1 .. 2] to [0 .. 1]
2045     val *= 2.0 - scaler;  // from [0 .. 1] to [0 .. 2.0 - scaler]
2046     val += scaler;        // from [0 .. 2.0 - scaler] to [scaler .. 2.0]
2047   } else {
2048     // val in [0 .. scaler]
2049     val = m0 / (1.0 + exp((score - 1.0) * fuzzy_width_up));
2050     val *= scaler;
2051   }
2052   return val;
2053 }
2054 
2055 // #define PRINT_OUT_NORMALIZATION
2056 
ButteraugliFuzzyInverse(double seek)2057 double ButteraugliFuzzyInverse(double seek) {
2058   double pos = 0;
2059   // NOLINTNEXTLINE(clang-analyzer-security.FloatLoopCounter)
2060   for (double range = 1.0; range >= 1e-10; range *= 0.5) {
2061     double cur = ButteraugliFuzzyClass(pos);
2062     if (cur < seek) {
2063       pos -= range;
2064     } else {
2065       pos += range;
2066     }
2067   }
2068 #ifdef PRINT_OUT_NORMALIZATION
2069   if (seek == 1.0) {
2070     fprintf(stderr, "Fuzzy inverse %g\n", pos);
2071   }
2072 #endif
2073   return pos;
2074 }
2075 
2076 #ifdef PRINT_OUT_NORMALIZATION
2077 static double print_out_normalization = ButteraugliFuzzyInverse(1.0);
2078 #endif
2079 
2080 namespace {
2081 
ScoreToRgb(double score,double good_threshold,double bad_threshold,float rgb[3])2082 void ScoreToRgb(double score, double good_threshold, double bad_threshold,
2083                 float rgb[3]) {
2084   double heatmap[12][3] = {
2085       {0, 0, 0},       {0, 0, 1},
2086       {0, 1, 1},       {0, 1, 0},  // Good level
2087       {1, 1, 0},       {1, 0, 0},  // Bad level
2088       {1, 0, 1},       {0.5, 0.5, 1.0},
2089       {1.0, 0.5, 0.5},  // Pastel colors for the very bad quality range.
2090       {1.0, 1.0, 0.5}, {1, 1, 1},
2091       {1, 1, 1},  // Last color repeated to have a solid range of white.
2092   };
2093   if (score < good_threshold) {
2094     score = (score / good_threshold) * 0.3;
2095   } else if (score < bad_threshold) {
2096     score = 0.3 +
2097             (score - good_threshold) / (bad_threshold - good_threshold) * 0.15;
2098   } else {
2099     score = 0.45 + (score - bad_threshold) / (bad_threshold * 12) * 0.5;
2100   }
2101   static const int kTableSize = sizeof(heatmap) / sizeof(heatmap[0]);
2102   score = std::min<double>(std::max<double>(score * (kTableSize - 1), 0.0),
2103                            kTableSize - 2);
2104   int ix = static_cast<int>(score);
2105   ix = std::min(std::max(0, ix), kTableSize - 2);  // Handle NaN
2106   double mix = score - ix;
2107   for (int i = 0; i < 3; ++i) {
2108     double v = mix * heatmap[ix + 1][i] + (1 - mix) * heatmap[ix][i];
2109     rgb[i] = pow(v, 0.5);
2110   }
2111 }
2112 
2113 }  // namespace
2114 
CreateHeatMapImage(const ImageF & distmap,double good_threshold,double bad_threshold)2115 Image3F CreateHeatMapImage(const ImageF& distmap, double good_threshold,
2116                            double bad_threshold) {
2117   Image3F heatmap(distmap.xsize(), distmap.ysize());
2118   for (size_t y = 0; y < distmap.ysize(); ++y) {
2119     const float* BUTTERAUGLI_RESTRICT row_distmap = distmap.ConstRow(y);
2120     float* BUTTERAUGLI_RESTRICT row_h0 = heatmap.PlaneRow(0, y);
2121     float* BUTTERAUGLI_RESTRICT row_h1 = heatmap.PlaneRow(1, y);
2122     float* BUTTERAUGLI_RESTRICT row_h2 = heatmap.PlaneRow(2, y);
2123     for (size_t x = 0; x < distmap.xsize(); ++x) {
2124       const float d = row_distmap[x];
2125       float rgb[3];
2126       ScoreToRgb(d, good_threshold, bad_threshold, rgb);
2127       row_h0[x] = rgb[0];
2128       row_h1[x] = rgb[1];
2129       row_h2[x] = rgb[2];
2130     }
2131   }
2132   return heatmap;
2133 }
2134 
2135 }  // namespace jxl
2136 #endif  // HWY_ONCE
2137