1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #include "lib/jxl/enc_xyb.h"
7 
8 #include <algorithm>
9 #include <cstdlib>
10 
11 #undef HWY_TARGET_INCLUDE
12 #define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc"
13 #include <hwy/foreach_target.h>
14 #include <hwy/highway.h>
15 
16 #include "lib/jxl/aux_out_fwd.h"
17 #include "lib/jxl/base/compiler_specific.h"
18 #include "lib/jxl/base/data_parallel.h"
19 #include "lib/jxl/base/profiler.h"
20 #include "lib/jxl/base/status.h"
21 #include "lib/jxl/color_encoding_internal.h"
22 #include "lib/jxl/color_management.h"
23 #include "lib/jxl/enc_bit_writer.h"
24 #include "lib/jxl/fields.h"
25 #include "lib/jxl/image_bundle.h"
26 #include "lib/jxl/image_ops.h"
27 #include "lib/jxl/opsin_params.h"
28 #include "lib/jxl/transfer_functions-inl.h"
29 HWY_BEFORE_NAMESPACE();
30 namespace jxl {
31 namespace HWY_NAMESPACE {
32 
33 // These templates are not found via ADL.
34 using hwy::HWY_NAMESPACE::ShiftRight;
35 
36 // Returns cbrt(x) + add with 6 ulp max error.
37 // Modified from vectormath_exp.h, Apache 2 license.
38 // https://www.agner.org/optimize/vectorclass.zip
39 template <class V>
CubeRootAndAdd(const V x,const V add)40 V CubeRootAndAdd(const V x, const V add) {
41   const HWY_FULL(float) df;
42   const HWY_FULL(int32_t) di;
43 
44   const auto kExpBias = Set(di, 0x54800000);  // cast(1.) + cast(1.) / 3
45   const auto kExpMul = Set(di, 0x002AAAAA);   // shifted 1/3
46   const auto k1_3 = Set(df, 1.0f / 3);
47   const auto k4_3 = Set(df, 4.0f / 3);
48 
49   const auto xa = x;  // assume inputs never negative
50   const auto xa_3 = k1_3 * xa;
51 
52   // Multiply exponent by -1/3
53   const auto m1 = BitCast(di, xa);
54   // Special case for 0. 0 is represented with an exponent of 0, so the
55   // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse()
56   // sets those values as 0, which prevents having NaNs in the computations
57   // below.
58   const auto m2 =
59       IfThenZeroElse(m1 == Zero(di), kExpBias - (ShiftRight<23>(m1)) * kExpMul);
60   auto r = BitCast(df, m2);
61 
62   // Newton-Raphson iterations
63   for (int i = 0; i < 3; i++) {
64     const auto r2 = r * r;
65     r = NegMulAdd(xa_3, r2 * r2, k4_3 * r);
66   }
67   // Final iteration
68   auto r2 = r * r;
69   r = MulAdd(k1_3, NegMulAdd(xa, r2 * r2, r), r);
70   r2 = r * r;
71   r = MulAdd(r2, x, add);
72 
73   return r;
74 }
75 
76 // Ensures infinity norm is bounded.
TestCubeRoot()77 void TestCubeRoot() {
78   const HWY_FULL(float) d;
79   float max_err = 0.0f;
80   for (uint64_t x5 = 0; x5 < 2000000; x5++) {
81     const float x = x5 * 1E-5f;
82     const float expected = cbrtf(x);
83     HWY_ALIGN float approx[MaxLanes(d)];
84     Store(CubeRootAndAdd(Set(d, x), Zero(d)), d, approx);
85 
86     // All lanes are same
87     for (size_t i = 1; i < Lanes(d); ++i) {
88       JXL_ASSERT(std::abs(approx[0] - approx[i]) <= 1.2E-7f);
89     }
90 
91     const float err = std::abs(approx[0] - expected);
92     max_err = std::max(max_err, err);
93   }
94   // printf("max err %e\n", max_err);
95   JXL_ASSERT(max_err < 8E-7f);
96 }
97 
98 // 4x3 matrix * 3x1 SIMD vectors
99 template <class V>
OpsinAbsorbance(const V r,const V g,const V b,const float * JXL_RESTRICT premul_absorb,V * JXL_RESTRICT mixed0,V * JXL_RESTRICT mixed1,V * JXL_RESTRICT mixed2)100 JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b,
101                                 const float* JXL_RESTRICT premul_absorb,
102                                 V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1,
103                                 V* JXL_RESTRICT mixed2) {
104   const float* bias = &kOpsinAbsorbanceBias[0];
105   const HWY_FULL(float) d;
106   const size_t N = Lanes(d);
107   const auto m0 = Load(d, premul_absorb + 0 * N);
108   const auto m1 = Load(d, premul_absorb + 1 * N);
109   const auto m2 = Load(d, premul_absorb + 2 * N);
110   const auto m3 = Load(d, premul_absorb + 3 * N);
111   const auto m4 = Load(d, premul_absorb + 4 * N);
112   const auto m5 = Load(d, premul_absorb + 5 * N);
113   const auto m6 = Load(d, premul_absorb + 6 * N);
114   const auto m7 = Load(d, premul_absorb + 7 * N);
115   const auto m8 = Load(d, premul_absorb + 8 * N);
116   *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0]))));
117   *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1]))));
118   *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2]))));
119 }
120 
121 template <class V>
StoreXYB(const V r,V g,const V b,float * JXL_RESTRICT valx,float * JXL_RESTRICT valy,float * JXL_RESTRICT valz)122 void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx,
123               float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) {
124   const HWY_FULL(float) d;
125   const V half = Set(d, 0.5f);
126   Store(half * (r - g), d, valx);
127   Store(half * (r + g), d, valy);
128   Store(b, d, valz);
129 }
130 
131 // Converts one RGB vector to XYB.
132 template <class V>
LinearRGBToXYB(const V r,const V g,const V b,const float * JXL_RESTRICT premul_absorb,float * JXL_RESTRICT valx,float * JXL_RESTRICT valy,float * JXL_RESTRICT valz)133 void LinearRGBToXYB(const V r, const V g, const V b,
134                     const float* JXL_RESTRICT premul_absorb,
135                     float* JXL_RESTRICT valx, float* JXL_RESTRICT valy,
136                     float* JXL_RESTRICT valz) {
137   V mixed0, mixed1, mixed2;
138   OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2);
139 
140   // mixed* should be non-negative even for wide-gamut, so clamp to zero.
141   mixed0 = ZeroIfNegative(mixed0);
142   mixed1 = ZeroIfNegative(mixed1);
143   mixed2 = ZeroIfNegative(mixed2);
144 
145   const HWY_FULL(float) d;
146   const size_t N = Lanes(d);
147   mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N));
148   mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N));
149   mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N));
150   StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz);
151 
152   // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative.
153 }
154 
155 // Input/output uses the codec.h scaling: nominally 0-1 if in-gamut.
156 template <class V>
LinearFromSRGB(V encoded)157 V LinearFromSRGB(V encoded) {
158   return TF_SRGB().DisplayFromEncoded(encoded);
159 }
160 
LinearSRGBToXYB(const Image3F & linear,const float * JXL_RESTRICT premul_absorb,ThreadPool * pool,Image3F * JXL_RESTRICT xyb)161 void LinearSRGBToXYB(const Image3F& linear,
162                      const float* JXL_RESTRICT premul_absorb, ThreadPool* pool,
163                      Image3F* JXL_RESTRICT xyb) {
164   const size_t xsize = linear.xsize();
165 
166   const HWY_FULL(float) d;
167   RunOnPool(
168       pool, 0, static_cast<uint32_t>(linear.ysize()), ThreadPool::SkipInit(),
169       [&](const int task, const int /*thread*/) {
170         const size_t y = static_cast<size_t>(task);
171         const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y);
172         const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y);
173         const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y);
174         float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
175         float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
176         float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
177 
178         for (size_t x = 0; x < xsize; x += Lanes(d)) {
179           const auto in_r = Load(d, row_in0 + x);
180           const auto in_g = Load(d, row_in1 + x);
181           const auto in_b = Load(d, row_in2 + x);
182           LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
183                          row_xyb1 + x, row_xyb2 + x);
184         }
185       },
186       "LinearToXYB");
187 }
188 
SRGBToXYB(const Image3F & srgb,const float * JXL_RESTRICT premul_absorb,ThreadPool * pool,Image3F * JXL_RESTRICT xyb)189 void SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb,
190                ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
191   const size_t xsize = srgb.xsize();
192 
193   const HWY_FULL(float) d;
194   RunOnPool(
195       pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::SkipInit(),
196       [&](const int task, const int /*thread*/) {
197         const size_t y = static_cast<size_t>(task);
198         const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
199         const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
200         const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
201         float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
202         float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
203         float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
204 
205         for (size_t x = 0; x < xsize; x += Lanes(d)) {
206           const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
207           const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
208           const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
209           LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
210                          row_xyb1 + x, row_xyb2 + x);
211         }
212       },
213       "SRGBToXYB");
214 }
215 
SRGBToXYBAndLinear(const Image3F & srgb,const float * JXL_RESTRICT premul_absorb,ThreadPool * pool,Image3F * JXL_RESTRICT xyb,Image3F * JXL_RESTRICT linear)216 void SRGBToXYBAndLinear(const Image3F& srgb,
217                         const float* JXL_RESTRICT premul_absorb,
218                         ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
219                         Image3F* JXL_RESTRICT linear) {
220   const size_t xsize = srgb.xsize();
221 
222   const HWY_FULL(float) d;
223   RunOnPool(
224       pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::SkipInit(),
225       [&](const int task, const int /*thread*/) {
226         const size_t y = static_cast<size_t>(task);
227         const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
228         const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
229         const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
230 
231         float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y);
232         float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y);
233         float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y);
234 
235         float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
236         float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
237         float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
238 
239         for (size_t x = 0; x < xsize; x += Lanes(d)) {
240           const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
241           const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
242           const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
243 
244           Store(in_r, d, row_linear0 + x);
245           Store(in_g, d, row_linear1 + x);
246           Store(in_b, d, row_linear2 + x);
247 
248           LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
249                          row_xyb1 + x, row_xyb2 + x);
250         }
251       },
252       "SRGBToXYBAndLinear");
253 }
254 
255 // This is different from Butteraugli's OpsinDynamicsImage() in the sense that
256 // it does not contain a sensitivity multiplier based on the blurred image.
ToXYB(const ImageBundle & in,ThreadPool * pool,Image3F * JXL_RESTRICT xyb,ImageBundle * const JXL_RESTRICT linear)257 const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
258                          Image3F* JXL_RESTRICT xyb,
259                          ImageBundle* const JXL_RESTRICT linear) {
260   PROFILER_FUNC;
261 
262   const size_t xsize = in.xsize();
263   const size_t ysize = in.ysize();
264   JXL_ASSERT(SameSize(in, *xyb));
265 
266   const HWY_FULL(float) d;
267   // Pre-broadcasted constants
268   HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
269   const size_t N = Lanes(d);
270   for (size_t i = 0; i < 9; ++i) {
271     const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] *
272                                    (in.metadata()->IntensityTarget() / 255.0f));
273     Store(absorb, d, premul_absorb + i * N);
274   }
275   for (size_t i = 0; i < 3; ++i) {
276     const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i]));
277     Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N);
278   }
279 
280   const bool want_linear = linear != nullptr;
281 
282   const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(in.IsGray());
283   // Linear sRGB inputs are rare but can be useful for the fastest encoders, for
284   // which undoing the sRGB transfer function would be a large part of the cost.
285   if (c_linear_srgb.SameColorEncoding(in.c_current())) {
286     LinearSRGBToXYB(in.color(), premul_absorb, pool, xyb);
287     // This only happens if kitten or slower, moving ImageBundle might be
288     // possible but the encoder is much slower than this copy.
289     if (want_linear) {
290       *linear = in.Copy();
291       return linear;
292     }
293     return &in;
294   }
295 
296   // Common case: already sRGB, can avoid the color transform
297   if (in.IsSRGB()) {
298     // Common case: can avoid allocating/copying
299     if (!want_linear) {
300       SRGBToXYB(in.color(), premul_absorb, pool, xyb);
301       return &in;
302     }
303 
304     // Slow encoder also wants linear sRGB.
305     linear->SetFromImage(Image3F(xsize, ysize), c_linear_srgb);
306     SRGBToXYBAndLinear(in.color(), premul_absorb, pool, xyb, linear->color());
307     return linear;
308   }
309 
310   // General case: not sRGB, need color transform.
311   ImageBundle linear_storage;  // Local storage only used if !want_linear.
312 
313   ImageBundle* linear_storage_ptr;
314   if (want_linear) {
315     // Caller asked for linear, use that storage directly.
316     linear_storage_ptr = linear;
317   } else {
318     // Caller didn't ask for linear, create our own local storage
319     // OK to reuse metadata, it will not be changed.
320     linear_storage = ImageBundle(const_cast<ImageMetadata*>(in.metadata()));
321     linear_storage_ptr = &linear_storage;
322   }
323 
324   const ImageBundle* ptr;
325   JXL_CHECK(
326       TransformIfNeeded(in, c_linear_srgb, pool, linear_storage_ptr, &ptr));
327   // If no transform was necessary, should have taken the above codepath.
328   JXL_ASSERT(ptr == linear_storage_ptr);
329 
330   LinearSRGBToXYB(*linear_storage_ptr->color(), premul_absorb, pool, xyb);
331   return want_linear ? linear : &in;
332 }
333 
334 // Transform RGB to YCbCr.
335 // Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B).
RgbToYcbcr(const ImageF & r_plane,const ImageF & g_plane,const ImageF & b_plane,ImageF * y_plane,ImageF * cb_plane,ImageF * cr_plane,ThreadPool * pool)336 void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
337                 const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
338                 ImageF* cr_plane, ThreadPool* pool) {
339   const HWY_FULL(float) df;
340   const size_t S = Lanes(df);  // Step.
341 
342   const size_t xsize = r_plane.xsize();
343   const size_t ysize = r_plane.ysize();
344   if ((xsize == 0) || (ysize == 0)) return;
345 
346   // Full-range BT.601 as defined by JFIF Clause 7:
347   // https://www.itu.int/rec/T-REC-T.871-201105-I/en
348   const auto k128 = Set(df, 128.0f / 255);
349   const auto kR = Set(df, 0.299f);  // NTSC luma
350   const auto kG = Set(df, 0.587f);
351   const auto kB = Set(df, 0.114f);
352   const auto kAmpR = Set(df, 0.701f);
353   const auto kAmpB = Set(df, 0.886f);
354   const auto kDiffR = kAmpR + kR;
355   const auto kDiffB = kAmpB + kB;
356   const auto kNormR = Set(df, 1.0f) / (kAmpR + kG + kB);
357   const auto kNormB = Set(df, 1.0f) / (kR + kG + kAmpB);
358 
359   constexpr size_t kGroupArea = kGroupDim * kGroupDim;
360   const size_t lines_per_group = DivCeil(kGroupArea, xsize);
361   const size_t num_stripes = DivCeil(ysize, lines_per_group);
362   const auto transform = [&](int idx, int /* thread*/) {
363     const size_t y0 = idx * lines_per_group;
364     const size_t y1 = std::min<size_t>(y0 + lines_per_group, ysize);
365     for (size_t y = y0; y < y1; ++y) {
366       const float* r_row = r_plane.ConstRow(y);
367       const float* g_row = g_plane.ConstRow(y);
368       const float* b_row = b_plane.ConstRow(y);
369       float* y_row = y_plane->Row(y);
370       float* cb_row = cb_plane->Row(y);
371       float* cr_row = cr_plane->Row(y);
372       for (size_t x = 0; x < xsize; x += S) {
373         const auto r = Load(df, r_row + x);
374         const auto g = Load(df, g_row + x);
375         const auto b = Load(df, b_row + x);
376         const auto r_base = r * kR;
377         const auto r_diff = r * kDiffR;
378         const auto g_base = g * kG;
379         const auto b_base = b * kB;
380         const auto b_diff = b * kDiffB;
381         const auto y_base = r_base + g_base + b_base;
382         const auto y_vec = y_base - k128;
383         const auto cb_vec = (b_diff - y_base) * kNormB;
384         const auto cr_vec = (r_diff - y_base) * kNormR;
385         Store(y_vec, df, y_row + x);
386         Store(cb_vec, df, cb_row + x);
387         Store(cr_vec, df, cr_row + x);
388       }
389     }
390   };
391   RunOnPool(pool, 0, static_cast<int>(num_stripes), ThreadPool::SkipInit(),
392             transform, "RgbToYcbCr");
393 }
394 
395 // NOLINTNEXTLINE(google-readability-namespace-comments)
396 }  // namespace HWY_NAMESPACE
397 }  // namespace jxl
398 HWY_AFTER_NAMESPACE();
399 
400 #if HWY_ONCE
401 namespace jxl {
402 HWY_EXPORT(ToXYB);
ToXYB(const ImageBundle & in,ThreadPool * pool,Image3F * JXL_RESTRICT xyb,ImageBundle * JXL_RESTRICT linear_storage)403 const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
404                          Image3F* JXL_RESTRICT xyb,
405                          ImageBundle* JXL_RESTRICT linear_storage) {
406   return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, linear_storage);
407 }
408 
409 HWY_EXPORT(RgbToYcbcr);
RgbToYcbcr(const ImageF & r_plane,const ImageF & g_plane,const ImageF & b_plane,ImageF * y_plane,ImageF * cb_plane,ImageF * cr_plane,ThreadPool * pool)410 void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
411                 const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
412                 ImageF* cr_plane, ThreadPool* pool) {
413   return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane,
414                                           cb_plane, cr_plane, pool);
415 }
416 
417 HWY_EXPORT(TestCubeRoot);
TestCubeRoot()418 void TestCubeRoot() { return HWY_DYNAMIC_DISPATCH(TestCubeRoot)(); }
419 
420 // DEPRECATED
OpsinDynamicsImage(const Image3B & srgb8)421 Image3F OpsinDynamicsImage(const Image3B& srgb8) {
422   ImageMetadata metadata;
423   metadata.SetUintSamples(8);
424   metadata.color_encoding = ColorEncoding::SRGB();
425   ImageBundle ib(&metadata);
426   ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding);
427   JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray())));
428   ThreadPool* null_pool = nullptr;
429   Image3F xyb(srgb8.xsize(), srgb8.ysize());
430 
431   ImageBundle linear_storage(&metadata);
432   (void)ToXYB(ib, null_pool, &xyb, &linear_storage);
433   return xyb;
434 }
435 
436 }  // namespace jxl
437 #endif  // HWY_ONCE
438