1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #include "lib/jxl/render_pipeline/stage_noise.h"
7 
8 #undef HWY_TARGET_INCLUDE
9 #define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc"
10 #include <hwy/foreach_target.h>
11 #include <hwy/highway.h>
12 
13 #include "lib/jxl/fast_math-inl.h"
14 #include "lib/jxl/transfer_functions-inl.h"
15 
16 HWY_BEFORE_NAMESPACE();
17 namespace jxl {
18 namespace HWY_NAMESPACE {
19 
20 // These templates are not found via ADL.
21 using hwy::HWY_NAMESPACE::ShiftRight;
22 using hwy::HWY_NAMESPACE::Vec;
23 
24 using D = HWY_CAPPED(float, kBlockDim);
25 using DI = hwy::HWY_NAMESPACE::Rebind<int, D>;
26 using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
27 
28 // [0, max_value]
29 template <class D, class V>
Clamp0ToMax(D d,const V x,const V max_value)30 static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) {
31   const auto clamped = Min(x, max_value);
32   return ZeroIfNegative(clamped);
33 }
34 
35 // x is in [0+delta, 1+delta], delta ~= 0.06
36 template <class StrengthEval>
NoiseStrength(const StrengthEval & eval,const typename StrengthEval::V x)37 typename StrengthEval::V NoiseStrength(const StrengthEval& eval,
38                                        const typename StrengthEval::V x) {
39   return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f));
40 }
41 
42 // TODO(veluca): SIMD-fy.
43 class StrengthEvalLut {
44  public:
45   using V = Vec<D>;
46 
StrengthEvalLut(const NoiseParams & noise_params)47   explicit StrengthEvalLut(const NoiseParams& noise_params)
48 #if HWY_TARGET == HWY_SCALAR
49       : noise_params_(noise_params)
50 #endif
51   {
52 #if HWY_TARGET != HWY_SCALAR
53     uint32_t lut[8];
54     memcpy(lut, noise_params.lut, sizeof(lut));
55     for (size_t i = 0; i < 8; i++) {
56       low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
57       low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
58       high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
59       high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
60     }
61 #endif
62   }
63 
operator ()(const V vx) const64   V operator()(const V vx) const {
65     constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
66     auto scaled_vx = Max(Zero(D()), vx * Set(D(), kScale));
67     auto floor_x = Floor(scaled_vx);
68     auto frac_x = scaled_vx - floor_x;
69     floor_x = IfThenElse(scaled_vx >= Set(D(), kScale), Set(D(), kScale - 1),
70                          floor_x);
71     frac_x = IfThenElse(scaled_vx >= Set(D(), kScale), Set(D(), 1), frac_x);
72     auto floor_x_int = ConvertTo(DI(), floor_x);
73 #if HWY_TARGET == HWY_SCALAR
74     auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
75     auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
76 #else
77     // Set each lane's bytes to {0, 0, 2x+1, 2x}.
78     auto floorx_indices_low =
79         floor_x_int * Set(DI(), 0x0202) + Set(DI(), 0x0100);
80     // Set each lane's bytes to {2x+1, 2x, 0, 0}.
81     auto floorx_indices_hi =
82         floor_x_int * Set(DI(), 0x02020000) + Set(DI(), 0x01000000);
83     // load LUT
84     auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
85     auto lowm = Set(DI(), 0xFFFF);
86     auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
87     auto him = Set(DI(), 0xFFFF0000);
88     // low = noise_params.lut[floor_x]
89     auto low =
90         BitCast(D(), (TableLookupBytes(low16, floorx_indices_low) & lowm) |
91                          (TableLookupBytes(hi16, floorx_indices_hi) & him));
92     // hi = noise_params.lut[floor_x+1]
93     floorx_indices_low += Set(DI(), 0x0202);
94     floorx_indices_hi += Set(DI(), 0x02020000);
95     auto hi =
96         BitCast(D(), (TableLookupBytes(low16, floorx_indices_low) & lowm) |
97                          (TableLookupBytes(hi16, floorx_indices_hi) & him));
98 #endif
99     return MulAdd(hi - low, frac_x, low);
100   }
101 
102  private:
103 #if HWY_TARGET != HWY_SCALAR
104   // noise_params.lut transformed into two 16-bit lookup tables.
105   HWY_ALIGN uint8_t high16_lut[16];
106   HWY_ALIGN uint8_t low16_lut[16];
107 #else
108   const NoiseParams& noise_params_;
109 #endif
110 };
111 
112 template <class D>
AddNoiseToRGB(const D d,const Vec<D> rnd_noise_r,const Vec<D> rnd_noise_g,const Vec<D> rnd_noise_cor,const Vec<D> noise_strength_g,const Vec<D> noise_strength_r,float ytox,float ytob,float * JXL_RESTRICT out_x,float * JXL_RESTRICT out_y,float * JXL_RESTRICT out_b)113 void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r,
114                    const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor,
115                    const Vec<D> noise_strength_g, const Vec<D> noise_strength_r,
116                    float ytox, float ytob, float* JXL_RESTRICT out_x,
117                    float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
118   const auto kRGCorr = Set(d, 0.9921875f);   // 127/128
119   const auto kRGNCorr = Set(d, 0.0078125f);  // 1/128
120 
121   const auto red_noise = kRGNCorr * rnd_noise_r * noise_strength_r +
122                          kRGCorr * rnd_noise_cor * noise_strength_r;
123   const auto green_noise = kRGNCorr * rnd_noise_g * noise_strength_g +
124                            kRGCorr * rnd_noise_cor * noise_strength_g;
125 
126   auto vx = Load(d, out_x);
127   auto vy = Load(d, out_y);
128   auto vb = Load(d, out_b);
129 
130   vx += red_noise - green_noise + Set(d, ytox) * (red_noise + green_noise);
131   vy += red_noise + green_noise;
132   vb += Set(d, ytob) * (red_noise + green_noise);
133 
134   Store(vx, d, out_x);
135   Store(vy, d, out_y);
136   Store(vb, d, out_b);
137 }
138 
139 class AddNoiseStage : public RenderPipelineStage {
140  public:
AddNoiseStage(const NoiseParams & noise_params,const ColorCorrelationMap & cmap,size_t first_c)141   AddNoiseStage(const NoiseParams& noise_params,
142                 const ColorCorrelationMap& cmap, size_t first_c)
143       : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
144             /*shift=*/0, /*border=*/2)),
145         noise_params_(noise_params),
146         cmap_(cmap),
147         first_c_(first_c) {}
148 
ProcessRow(const RowInfo & input_rows,const RowInfo & output_rows,size_t xextra,size_t xsize,size_t xpos,size_t ypos,float * JXL_RESTRICT temp) const149   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
150                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
151                   float* JXL_RESTRICT temp) const final {
152     PROFILER_ZONE("Noise apply");
153 
154     if (!noise_params_.HasAny()) return;
155     const StrengthEvalLut noise_model(noise_params_);
156     D d;
157     const auto half = Set(d, 0.5f);
158 
159     // With the prior subtract-random Laplacian approximation, rnd_* ranges were
160     // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
161     // normalizer is half of what it was before (0.5).
162     const auto norm_const = Set(d, 0.22f);
163 
164     float ytox = cmap_.YtoXRatio(0);
165     float ytob = cmap_.YtoBRatio(0);
166 
167     const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
168 
169     float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0);
170     float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0);
171     float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0);
172     const float* JXL_RESTRICT row_rnd_r =
173         GetInputRow(input_rows, first_c_ + 0, 0);
174     const float* JXL_RESTRICT row_rnd_g =
175         GetInputRow(input_rows, first_c_ + 1, 0);
176     const float* JXL_RESTRICT row_rnd_c =
177         GetInputRow(input_rows, first_c_ + 2, 0);
178     // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
179     // shuffles are otherwise done on the data, so this is safe.
180     msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
181     msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
182     for (size_t x = 0; x < xsize; x += Lanes(d)) {
183       const auto vx = Load(d, row_x + x);
184       const auto vy = Load(d, row_y + x);
185       const auto in_g = vy - vx;
186       const auto in_r = vy + vx;
187       const auto noise_strength_g = NoiseStrength(noise_model, in_g * half);
188       const auto noise_strength_r = NoiseStrength(noise_model, in_r * half);
189       const auto addit_rnd_noise_red = Load(d, row_rnd_r + x) * norm_const;
190       const auto addit_rnd_noise_green = Load(d, row_rnd_g + x) * norm_const;
191       const auto addit_rnd_noise_correlated =
192           Load(d, row_rnd_c + x) * norm_const;
193       AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green,
194                     addit_rnd_noise_correlated, noise_strength_g,
195                     noise_strength_r, ytox, ytob, row_x + x, row_y + x,
196                     row_b + x);
197     }
198     msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
199     msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
200     msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float));
201   }
202 
GetChannelMode(size_t c) const203   RenderPipelineChannelMode GetChannelMode(size_t c) const final {
204     return c >= first_c_ ? RenderPipelineChannelMode::kInput
205            : c < 3       ? RenderPipelineChannelMode::kInPlace
206                          : RenderPipelineChannelMode::kIgnored;
207   }
208 
209  private:
210   const NoiseParams& noise_params_;
211   const ColorCorrelationMap& cmap_;
212   size_t first_c_;
213 };
214 
GetAddNoiseStage(const NoiseParams & noise_params,const ColorCorrelationMap & cmap,size_t noise_c_start)215 std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
216     const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
217     size_t noise_c_start) {
218   return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start);
219 }
220 
221 class ConvolveNoiseStage : public RenderPipelineStage {
222  public:
ConvolveNoiseStage(size_t first_c)223   explicit ConvolveNoiseStage(size_t first_c)
224       : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
225             /*shift=*/0, /*border=*/2)),
226         first_c_(first_c) {}
227 
ProcessRow(const RowInfo & input_rows,const RowInfo & output_rows,size_t xextra,size_t xsize,size_t xpos,size_t ypos,float * JXL_RESTRICT temp) const228   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
229                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
230                   float* JXL_RESTRICT temp) const final {
231     PROFILER_ZONE("Noise convolve");
232 
233     const HWY_FULL(float) d;
234     for (size_t c = first_c_; c < first_c_ + 3; c++) {
235       float* JXL_RESTRICT rows[5];
236       for (size_t i = 0; i < 5; i++) {
237         rows[i] = GetInputRow(input_rows, c, i - 2);
238       }
239       float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
240       for (int64_t x = -RoundUpTo(xextra, Lanes(d));
241            x < (int64_t)(xsize + xextra); x += Lanes(d)) {
242         const auto p00 = Load(d, rows[2] + x);
243         auto others = Zero(d);
244         for (ssize_t i = -2; i <= 2; i++) {
245           others += LoadU(d, rows[0] + x + i);
246           others += LoadU(d, rows[1] + x + i);
247           others += LoadU(d, rows[3] + x + i);
248           others += LoadU(d, rows[4] + x + i);
249         }
250         others += LoadU(d, rows[2] + x - 2);
251         others += LoadU(d, rows[2] + x - 1);
252         others += LoadU(d, rows[2] + x + 1);
253         others += LoadU(d, rows[2] + x + 2);
254         // 4 * (1 - box kernel)
255         auto pixels = MulAdd(others, Set(d, 0.16), p00 * Set(d, -3.84));
256         Store(pixels, d, row_out + x);
257       }
258     }
259   }
260 
GetChannelMode(size_t c) const261   RenderPipelineChannelMode GetChannelMode(size_t c) const final {
262     return c >= first_c_ ? RenderPipelineChannelMode::kInOut
263                          : RenderPipelineChannelMode::kIgnored;
264   }
265 
266  private:
267   size_t first_c_;
268 };
269 
GetConvolveNoiseStage(size_t noise_c_start)270 std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
271     size_t noise_c_start) {
272   return jxl::make_unique<ConvolveNoiseStage>(noise_c_start);
273 }
274 
275 // NOLINTNEXTLINE(google-readability-namespace-comments)
276 }  // namespace HWY_NAMESPACE
277 }  // namespace jxl
278 HWY_AFTER_NAMESPACE();
279 
280 #if HWY_ONCE
281 namespace jxl {
282 
283 HWY_EXPORT(GetAddNoiseStage);
284 HWY_EXPORT(GetConvolveNoiseStage);
285 
GetAddNoiseStage(const NoiseParams & noise_params,const ColorCorrelationMap & cmap,size_t noise_c_start)286 std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
287     const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
288     size_t noise_c_start) {
289   return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap,
290                                                 noise_c_start);
291 }
292 
GetConvolveNoiseStage(size_t noise_c_start)293 std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
294     size_t noise_c_start) {
295   return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start);
296 }
297 
298 }  // namespace jxl
299 #endif
300