// Copyright (c) the JPEG XL Project Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include "lib/jxl/enc_xyb.h" #include #include #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "lib/jxl/enc_xyb.cc" #include #include #include "lib/jxl/aux_out_fwd.h" #include "lib/jxl/base/compiler_specific.h" #include "lib/jxl/base/data_parallel.h" #include "lib/jxl/base/profiler.h" #include "lib/jxl/base/status.h" #include "lib/jxl/color_encoding_internal.h" #include "lib/jxl/color_management.h" #include "lib/jxl/enc_bit_writer.h" #include "lib/jxl/fields.h" #include "lib/jxl/image_bundle.h" #include "lib/jxl/image_ops.h" #include "lib/jxl/opsin_params.h" #include "lib/jxl/transfer_functions-inl.h" HWY_BEFORE_NAMESPACE(); namespace jxl { namespace HWY_NAMESPACE { // These templates are not found via ADL. using hwy::HWY_NAMESPACE::ShiftRight; // Returns cbrt(x) + add with 6 ulp max error. // Modified from vectormath_exp.h, Apache 2 license. // https://www.agner.org/optimize/vectorclass.zip template V CubeRootAndAdd(const V x, const V add) { const HWY_FULL(float) df; const HWY_FULL(int32_t) di; const auto kExpBias = Set(di, 0x54800000); // cast(1.) + cast(1.) / 3 const auto kExpMul = Set(di, 0x002AAAAA); // shifted 1/3 const auto k1_3 = Set(df, 1.0f / 3); const auto k4_3 = Set(df, 4.0f / 3); const auto xa = x; // assume inputs never negative const auto xa_3 = k1_3 * xa; // Multiply exponent by -1/3 const auto m1 = BitCast(di, xa); // Special case for 0. 0 is represented with an exponent of 0, so the // "kExpBias - 1/3 * exp" below gives the wrong result. The IfThenZeroElse() // sets those values as 0, which prevents having NaNs in the computations // below. const auto m2 = IfThenZeroElse(m1 == Zero(di), kExpBias - (ShiftRight<23>(m1)) * kExpMul); auto r = BitCast(df, m2); // Newton-Raphson iterations for (int i = 0; i < 3; i++) { const auto r2 = r * r; r = NegMulAdd(xa_3, r2 * r2, k4_3 * r); } // Final iteration auto r2 = r * r; r = MulAdd(k1_3, NegMulAdd(xa, r2 * r2, r), r); r2 = r * r; r = MulAdd(r2, x, add); return r; } // Ensures infinity norm is bounded. void TestCubeRoot() { const HWY_FULL(float) d; float max_err = 0.0f; for (uint64_t x5 = 0; x5 < 2000000; x5++) { const float x = x5 * 1E-5f; const float expected = cbrtf(x); HWY_ALIGN float approx[MaxLanes(d)]; Store(CubeRootAndAdd(Set(d, x), Zero(d)), d, approx); // All lanes are same for (size_t i = 1; i < Lanes(d); ++i) { JXL_ASSERT(std::abs(approx[0] - approx[i]) <= 1.2E-7f); } const float err = std::abs(approx[0] - expected); max_err = std::max(max_err, err); } // printf("max err %e\n", max_err); JXL_ASSERT(max_err < 8E-7f); } // 4x3 matrix * 3x1 SIMD vectors template JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b, const float* JXL_RESTRICT premul_absorb, V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1, V* JXL_RESTRICT mixed2) { const float* bias = &kOpsinAbsorbanceBias[0]; const HWY_FULL(float) d; const size_t N = Lanes(d); const auto m0 = Load(d, premul_absorb + 0 * N); const auto m1 = Load(d, premul_absorb + 1 * N); const auto m2 = Load(d, premul_absorb + 2 * N); const auto m3 = Load(d, premul_absorb + 3 * N); const auto m4 = Load(d, premul_absorb + 4 * N); const auto m5 = Load(d, premul_absorb + 5 * N); const auto m6 = Load(d, premul_absorb + 6 * N); const auto m7 = Load(d, premul_absorb + 7 * N); const auto m8 = Load(d, premul_absorb + 8 * N); *mixed0 = MulAdd(m0, r, MulAdd(m1, g, MulAdd(m2, b, Set(d, bias[0])))); *mixed1 = MulAdd(m3, r, MulAdd(m4, g, MulAdd(m5, b, Set(d, bias[1])))); *mixed2 = MulAdd(m6, r, MulAdd(m7, g, MulAdd(m8, b, Set(d, bias[2])))); } template void StoreXYB(const V r, V g, const V b, float* JXL_RESTRICT valx, float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { const HWY_FULL(float) d; const V half = Set(d, 0.5f); Store(half * (r - g), d, valx); Store(half * (r + g), d, valy); Store(b, d, valz); } // Converts one RGB vector to XYB. template void LinearRGBToXYB(const V r, const V g, const V b, const float* JXL_RESTRICT premul_absorb, float* JXL_RESTRICT valx, float* JXL_RESTRICT valy, float* JXL_RESTRICT valz) { V mixed0, mixed1, mixed2; OpsinAbsorbance(r, g, b, premul_absorb, &mixed0, &mixed1, &mixed2); // mixed* should be non-negative even for wide-gamut, so clamp to zero. mixed0 = ZeroIfNegative(mixed0); mixed1 = ZeroIfNegative(mixed1); mixed2 = ZeroIfNegative(mixed2); const HWY_FULL(float) d; const size_t N = Lanes(d); mixed0 = CubeRootAndAdd(mixed0, Load(d, premul_absorb + 9 * N)); mixed1 = CubeRootAndAdd(mixed1, Load(d, premul_absorb + 10 * N)); mixed2 = CubeRootAndAdd(mixed2, Load(d, premul_absorb + 11 * N)); StoreXYB(mixed0, mixed1, mixed2, valx, valy, valz); // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative. } // Input/output uses the codec.h scaling: nominally 0-1 if in-gamut. template V LinearFromSRGB(V encoded) { return TF_SRGB().DisplayFromEncoded(encoded); } void LinearSRGBToXYB(const Image3F& linear, const float* JXL_RESTRICT premul_absorb, ThreadPool* pool, Image3F* JXL_RESTRICT xyb) { const size_t xsize = linear.xsize(); const HWY_FULL(float) d; RunOnPool( pool, 0, static_cast(linear.ysize()), ThreadPool::SkipInit(), [&](const int task, const int /*thread*/) { const size_t y = static_cast(task); const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y); const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y); const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y); float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); for (size_t x = 0; x < xsize; x += Lanes(d)) { const auto in_r = Load(d, row_in0 + x); const auto in_g = Load(d, row_in1 + x); const auto in_b = Load(d, row_in2 + x); LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, row_xyb1 + x, row_xyb2 + x); } }, "LinearToXYB"); } void SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb, ThreadPool* pool, Image3F* JXL_RESTRICT xyb) { const size_t xsize = srgb.xsize(); const HWY_FULL(float) d; RunOnPool( pool, 0, static_cast(srgb.ysize()), ThreadPool::SkipInit(), [&](const int task, const int /*thread*/) { const size_t y = static_cast(task); const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y); const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y); const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y); float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); for (size_t x = 0; x < xsize; x += Lanes(d)) { const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x)); const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x)); const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x)); LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, row_xyb1 + x, row_xyb2 + x); } }, "SRGBToXYB"); } void SRGBToXYBAndLinear(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb, ThreadPool* pool, Image3F* JXL_RESTRICT xyb, Image3F* JXL_RESTRICT linear) { const size_t xsize = srgb.xsize(); const HWY_FULL(float) d; RunOnPool( pool, 0, static_cast(srgb.ysize()), ThreadPool::SkipInit(), [&](const int task, const int /*thread*/) { const size_t y = static_cast(task); const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y); const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y); const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y); float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y); float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y); float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y); float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y); float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y); float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y); for (size_t x = 0; x < xsize; x += Lanes(d)) { const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x)); const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x)); const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x)); Store(in_r, d, row_linear0 + x); Store(in_g, d, row_linear1 + x); Store(in_b, d, row_linear2 + x); LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x, row_xyb1 + x, row_xyb2 + x); } }, "SRGBToXYBAndLinear"); } // This is different from Butteraugli's OpsinDynamicsImage() in the sense that // it does not contain a sensitivity multiplier based on the blurred image. const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, Image3F* JXL_RESTRICT xyb, ImageBundle* const JXL_RESTRICT linear) { PROFILER_FUNC; const size_t xsize = in.xsize(); const size_t ysize = in.ysize(); JXL_ASSERT(SameSize(in, *xyb)); const HWY_FULL(float) d; // Pre-broadcasted constants HWY_ALIGN float premul_absorb[MaxLanes(d) * 12]; const size_t N = Lanes(d); for (size_t i = 0; i < 9; ++i) { const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] * (in.metadata()->IntensityTarget() / 255.0f)); Store(absorb, d, premul_absorb + i * N); } for (size_t i = 0; i < 3; ++i) { const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i])); Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N); } const bool want_linear = linear != nullptr; const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(in.IsGray()); // Linear sRGB inputs are rare but can be useful for the fastest encoders, for // which undoing the sRGB transfer function would be a large part of the cost. if (c_linear_srgb.SameColorEncoding(in.c_current())) { LinearSRGBToXYB(in.color(), premul_absorb, pool, xyb); // This only happens if kitten or slower, moving ImageBundle might be // possible but the encoder is much slower than this copy. if (want_linear) { *linear = in.Copy(); return linear; } return ∈ } // Common case: already sRGB, can avoid the color transform if (in.IsSRGB()) { // Common case: can avoid allocating/copying if (!want_linear) { SRGBToXYB(in.color(), premul_absorb, pool, xyb); return ∈ } // Slow encoder also wants linear sRGB. linear->SetFromImage(Image3F(xsize, ysize), c_linear_srgb); SRGBToXYBAndLinear(in.color(), premul_absorb, pool, xyb, linear->color()); return linear; } // General case: not sRGB, need color transform. ImageBundle linear_storage; // Local storage only used if !want_linear. ImageBundle* linear_storage_ptr; if (want_linear) { // Caller asked for linear, use that storage directly. linear_storage_ptr = linear; } else { // Caller didn't ask for linear, create our own local storage // OK to reuse metadata, it will not be changed. linear_storage = ImageBundle(const_cast(in.metadata())); linear_storage_ptr = &linear_storage; } const ImageBundle* ptr; JXL_CHECK( TransformIfNeeded(in, c_linear_srgb, pool, linear_storage_ptr, &ptr)); // If no transform was necessary, should have taken the above codepath. JXL_ASSERT(ptr == linear_storage_ptr); LinearSRGBToXYB(*linear_storage_ptr->color(), premul_absorb, pool, xyb); return want_linear ? linear : ∈ } // Transform RGB to YCbCr. // Could be performed in-place (i.e. Y, Cb and Cr could alias R, B and B). void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, ImageF* cr_plane, ThreadPool* pool) { const HWY_FULL(float) df; const size_t S = Lanes(df); // Step. const size_t xsize = r_plane.xsize(); const size_t ysize = r_plane.ysize(); if ((xsize == 0) || (ysize == 0)) return; // Full-range BT.601 as defined by JFIF Clause 7: // https://www.itu.int/rec/T-REC-T.871-201105-I/en const auto k128 = Set(df, 128.0f / 255); const auto kR = Set(df, 0.299f); // NTSC luma const auto kG = Set(df, 0.587f); const auto kB = Set(df, 0.114f); const auto kAmpR = Set(df, 0.701f); const auto kAmpB = Set(df, 0.886f); const auto kDiffR = kAmpR + kR; const auto kDiffB = kAmpB + kB; const auto kNormR = Set(df, 1.0f) / (kAmpR + kG + kB); const auto kNormB = Set(df, 1.0f) / (kR + kG + kAmpB); constexpr size_t kGroupArea = kGroupDim * kGroupDim; const size_t lines_per_group = DivCeil(kGroupArea, xsize); const size_t num_stripes = DivCeil(ysize, lines_per_group); const auto transform = [&](int idx, int /* thread*/) { const size_t y0 = idx * lines_per_group; const size_t y1 = std::min(y0 + lines_per_group, ysize); for (size_t y = y0; y < y1; ++y) { const float* r_row = r_plane.ConstRow(y); const float* g_row = g_plane.ConstRow(y); const float* b_row = b_plane.ConstRow(y); float* y_row = y_plane->Row(y); float* cb_row = cb_plane->Row(y); float* cr_row = cr_plane->Row(y); for (size_t x = 0; x < xsize; x += S) { const auto r = Load(df, r_row + x); const auto g = Load(df, g_row + x); const auto b = Load(df, b_row + x); const auto r_base = r * kR; const auto r_diff = r * kDiffR; const auto g_base = g * kG; const auto b_base = b * kB; const auto b_diff = b * kDiffB; const auto y_base = r_base + g_base + b_base; const auto y_vec = y_base - k128; const auto cb_vec = (b_diff - y_base) * kNormB; const auto cr_vec = (r_diff - y_base) * kNormR; Store(y_vec, df, y_row + x); Store(cb_vec, df, cb_row + x); Store(cr_vec, df, cr_row + x); } } }; RunOnPool(pool, 0, static_cast(num_stripes), ThreadPool::SkipInit(), transform, "RgbToYcbCr"); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace jxl HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace jxl { HWY_EXPORT(ToXYB); const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool, Image3F* JXL_RESTRICT xyb, ImageBundle* JXL_RESTRICT linear_storage) { return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, linear_storage); } HWY_EXPORT(RgbToYcbcr); void RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane, const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane, ImageF* cr_plane, ThreadPool* pool) { return HWY_DYNAMIC_DISPATCH(RgbToYcbcr)(r_plane, g_plane, b_plane, y_plane, cb_plane, cr_plane, pool); } HWY_EXPORT(TestCubeRoot); void TestCubeRoot() { return HWY_DYNAMIC_DISPATCH(TestCubeRoot)(); } // DEPRECATED Image3F OpsinDynamicsImage(const Image3B& srgb8) { ImageMetadata metadata; metadata.SetUintSamples(8); metadata.color_encoding = ColorEncoding::SRGB(); ImageBundle ib(&metadata); ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding); JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray()))); ThreadPool* null_pool = nullptr; Image3F xyb(srgb8.xsize(), srgb8.ysize()); ImageBundle linear_storage(&metadata); (void)ToXYB(ib, null_pool, &xyb, &linear_storage); return xyb; } } // namespace jxl #endif // HWY_ONCE