1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 #include "lib/jxl/render_pipeline/stage_noise.h"
7
8 #undef HWY_TARGET_INCLUDE
9 #define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_noise.cc"
10 #include <hwy/foreach_target.h>
11 #include <hwy/highway.h>
12
13 #include "lib/jxl/fast_math-inl.h"
14 #include "lib/jxl/transfer_functions-inl.h"
15
16 HWY_BEFORE_NAMESPACE();
17 namespace jxl {
18 namespace HWY_NAMESPACE {
19
20 // These templates are not found via ADL.
21 using hwy::HWY_NAMESPACE::ShiftRight;
22 using hwy::HWY_NAMESPACE::Vec;
23
24 using D = HWY_CAPPED(float, kBlockDim);
25 using DI = hwy::HWY_NAMESPACE::Rebind<int, D>;
26 using DI8 = hwy::HWY_NAMESPACE::Repartition<uint8_t, D>;
27
28 // [0, max_value]
29 template <class D, class V>
Clamp0ToMax(D d,const V x,const V max_value)30 static HWY_INLINE V Clamp0ToMax(D d, const V x, const V max_value) {
31 const auto clamped = Min(x, max_value);
32 return ZeroIfNegative(clamped);
33 }
34
35 // x is in [0+delta, 1+delta], delta ~= 0.06
36 template <class StrengthEval>
NoiseStrength(const StrengthEval & eval,const typename StrengthEval::V x)37 typename StrengthEval::V NoiseStrength(const StrengthEval& eval,
38 const typename StrengthEval::V x) {
39 return Clamp0ToMax(D(), eval(x), Set(D(), 1.0f));
40 }
41
42 // TODO(veluca): SIMD-fy.
43 class StrengthEvalLut {
44 public:
45 using V = Vec<D>;
46
StrengthEvalLut(const NoiseParams & noise_params)47 explicit StrengthEvalLut(const NoiseParams& noise_params)
48 #if HWY_TARGET == HWY_SCALAR
49 : noise_params_(noise_params)
50 #endif
51 {
52 #if HWY_TARGET != HWY_SCALAR
53 uint32_t lut[8];
54 memcpy(lut, noise_params.lut, sizeof(lut));
55 for (size_t i = 0; i < 8; i++) {
56 low16_lut[2 * i] = (lut[i] >> 0) & 0xFF;
57 low16_lut[2 * i + 1] = (lut[i] >> 8) & 0xFF;
58 high16_lut[2 * i] = (lut[i] >> 16) & 0xFF;
59 high16_lut[2 * i + 1] = (lut[i] >> 24) & 0xFF;
60 }
61 #endif
62 }
63
operator ()(const V vx) const64 V operator()(const V vx) const {
65 constexpr size_t kScale = NoiseParams::kNumNoisePoints - 2;
66 auto scaled_vx = Max(Zero(D()), vx * Set(D(), kScale));
67 auto floor_x = Floor(scaled_vx);
68 auto frac_x = scaled_vx - floor_x;
69 floor_x = IfThenElse(scaled_vx >= Set(D(), kScale), Set(D(), kScale - 1),
70 floor_x);
71 frac_x = IfThenElse(scaled_vx >= Set(D(), kScale), Set(D(), 1), frac_x);
72 auto floor_x_int = ConvertTo(DI(), floor_x);
73 #if HWY_TARGET == HWY_SCALAR
74 auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
75 auto hi = Set(D(), noise_params_.lut[floor_x_int.raw + 1]);
76 #else
77 // Set each lane's bytes to {0, 0, 2x+1, 2x}.
78 auto floorx_indices_low =
79 floor_x_int * Set(DI(), 0x0202) + Set(DI(), 0x0100);
80 // Set each lane's bytes to {2x+1, 2x, 0, 0}.
81 auto floorx_indices_hi =
82 floor_x_int * Set(DI(), 0x02020000) + Set(DI(), 0x01000000);
83 // load LUT
84 auto low16 = BitCast(DI(), LoadDup128(DI8(), low16_lut));
85 auto lowm = Set(DI(), 0xFFFF);
86 auto hi16 = BitCast(DI(), LoadDup128(DI8(), high16_lut));
87 auto him = Set(DI(), 0xFFFF0000);
88 // low = noise_params.lut[floor_x]
89 auto low =
90 BitCast(D(), (TableLookupBytes(low16, floorx_indices_low) & lowm) |
91 (TableLookupBytes(hi16, floorx_indices_hi) & him));
92 // hi = noise_params.lut[floor_x+1]
93 floorx_indices_low += Set(DI(), 0x0202);
94 floorx_indices_hi += Set(DI(), 0x02020000);
95 auto hi =
96 BitCast(D(), (TableLookupBytes(low16, floorx_indices_low) & lowm) |
97 (TableLookupBytes(hi16, floorx_indices_hi) & him));
98 #endif
99 return MulAdd(hi - low, frac_x, low);
100 }
101
102 private:
103 #if HWY_TARGET != HWY_SCALAR
104 // noise_params.lut transformed into two 16-bit lookup tables.
105 HWY_ALIGN uint8_t high16_lut[16];
106 HWY_ALIGN uint8_t low16_lut[16];
107 #else
108 const NoiseParams& noise_params_;
109 #endif
110 };
111
112 template <class D>
AddNoiseToRGB(const D d,const Vec<D> rnd_noise_r,const Vec<D> rnd_noise_g,const Vec<D> rnd_noise_cor,const Vec<D> noise_strength_g,const Vec<D> noise_strength_r,float ytox,float ytob,float * JXL_RESTRICT out_x,float * JXL_RESTRICT out_y,float * JXL_RESTRICT out_b)113 void AddNoiseToRGB(const D d, const Vec<D> rnd_noise_r,
114 const Vec<D> rnd_noise_g, const Vec<D> rnd_noise_cor,
115 const Vec<D> noise_strength_g, const Vec<D> noise_strength_r,
116 float ytox, float ytob, float* JXL_RESTRICT out_x,
117 float* JXL_RESTRICT out_y, float* JXL_RESTRICT out_b) {
118 const auto kRGCorr = Set(d, 0.9921875f); // 127/128
119 const auto kRGNCorr = Set(d, 0.0078125f); // 1/128
120
121 const auto red_noise = kRGNCorr * rnd_noise_r * noise_strength_r +
122 kRGCorr * rnd_noise_cor * noise_strength_r;
123 const auto green_noise = kRGNCorr * rnd_noise_g * noise_strength_g +
124 kRGCorr * rnd_noise_cor * noise_strength_g;
125
126 auto vx = Load(d, out_x);
127 auto vy = Load(d, out_y);
128 auto vb = Load(d, out_b);
129
130 vx += red_noise - green_noise + Set(d, ytox) * (red_noise + green_noise);
131 vy += red_noise + green_noise;
132 vb += Set(d, ytob) * (red_noise + green_noise);
133
134 Store(vx, d, out_x);
135 Store(vy, d, out_y);
136 Store(vb, d, out_b);
137 }
138
139 class AddNoiseStage : public RenderPipelineStage {
140 public:
AddNoiseStage(const NoiseParams & noise_params,const ColorCorrelationMap & cmap,size_t first_c)141 AddNoiseStage(const NoiseParams& noise_params,
142 const ColorCorrelationMap& cmap, size_t first_c)
143 : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
144 /*shift=*/0, /*border=*/2)),
145 noise_params_(noise_params),
146 cmap_(cmap),
147 first_c_(first_c) {}
148
ProcessRow(const RowInfo & input_rows,const RowInfo & output_rows,size_t xextra,size_t xsize,size_t xpos,size_t ypos,float * JXL_RESTRICT temp) const149 void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
150 size_t xextra, size_t xsize, size_t xpos, size_t ypos,
151 float* JXL_RESTRICT temp) const final {
152 PROFILER_ZONE("Noise apply");
153
154 if (!noise_params_.HasAny()) return;
155 const StrengthEvalLut noise_model(noise_params_);
156 D d;
157 const auto half = Set(d, 0.5f);
158
159 // With the prior subtract-random Laplacian approximation, rnd_* ranges were
160 // about [-1.5, 1.6]; Laplacian3 about doubles this to [-3.6, 3.6], so the
161 // normalizer is half of what it was before (0.5).
162 const auto norm_const = Set(d, 0.22f);
163
164 float ytox = cmap_.YtoXRatio(0);
165 float ytob = cmap_.YtoBRatio(0);
166
167 const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
168
169 float* JXL_RESTRICT row_x = GetInputRow(input_rows, 0, 0);
170 float* JXL_RESTRICT row_y = GetInputRow(input_rows, 1, 0);
171 float* JXL_RESTRICT row_b = GetInputRow(input_rows, 2, 0);
172 const float* JXL_RESTRICT row_rnd_r =
173 GetInputRow(input_rows, first_c_ + 0, 0);
174 const float* JXL_RESTRICT row_rnd_g =
175 GetInputRow(input_rows, first_c_ + 1, 0);
176 const float* JXL_RESTRICT row_rnd_c =
177 GetInputRow(input_rows, first_c_ + 2, 0);
178 // Needed by the calls to Floor() in StrengthEvalLut. Only arithmetic and
179 // shuffles are otherwise done on the data, so this is safe.
180 msan::UnpoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
181 msan::UnpoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
182 for (size_t x = 0; x < xsize; x += Lanes(d)) {
183 const auto vx = Load(d, row_x + x);
184 const auto vy = Load(d, row_y + x);
185 const auto in_g = vy - vx;
186 const auto in_r = vy + vx;
187 const auto noise_strength_g = NoiseStrength(noise_model, in_g * half);
188 const auto noise_strength_r = NoiseStrength(noise_model, in_r * half);
189 const auto addit_rnd_noise_red = Load(d, row_rnd_r + x) * norm_const;
190 const auto addit_rnd_noise_green = Load(d, row_rnd_g + x) * norm_const;
191 const auto addit_rnd_noise_correlated =
192 Load(d, row_rnd_c + x) * norm_const;
193 AddNoiseToRGB(D(), addit_rnd_noise_red, addit_rnd_noise_green,
194 addit_rnd_noise_correlated, noise_strength_g,
195 noise_strength_r, ytox, ytob, row_x + x, row_y + x,
196 row_b + x);
197 }
198 msan::PoisonMemory(row_x + xsize, (xsize_v - xsize) * sizeof(float));
199 msan::PoisonMemory(row_y + xsize, (xsize_v - xsize) * sizeof(float));
200 msan::PoisonMemory(row_b + xsize, (xsize_v - xsize) * sizeof(float));
201 }
202
GetChannelMode(size_t c) const203 RenderPipelineChannelMode GetChannelMode(size_t c) const final {
204 return c >= first_c_ ? RenderPipelineChannelMode::kInput
205 : c < 3 ? RenderPipelineChannelMode::kInPlace
206 : RenderPipelineChannelMode::kIgnored;
207 }
208
209 private:
210 const NoiseParams& noise_params_;
211 const ColorCorrelationMap& cmap_;
212 size_t first_c_;
213 };
214
GetAddNoiseStage(const NoiseParams & noise_params,const ColorCorrelationMap & cmap,size_t noise_c_start)215 std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
216 const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
217 size_t noise_c_start) {
218 return jxl::make_unique<AddNoiseStage>(noise_params, cmap, noise_c_start);
219 }
220
221 class ConvolveNoiseStage : public RenderPipelineStage {
222 public:
ConvolveNoiseStage(size_t first_c)223 explicit ConvolveNoiseStage(size_t first_c)
224 : RenderPipelineStage(RenderPipelineStage::Settings::Symmetric(
225 /*shift=*/0, /*border=*/2)),
226 first_c_(first_c) {}
227
ProcessRow(const RowInfo & input_rows,const RowInfo & output_rows,size_t xextra,size_t xsize,size_t xpos,size_t ypos,float * JXL_RESTRICT temp) const228 void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
229 size_t xextra, size_t xsize, size_t xpos, size_t ypos,
230 float* JXL_RESTRICT temp) const final {
231 PROFILER_ZONE("Noise convolve");
232
233 const HWY_FULL(float) d;
234 for (size_t c = first_c_; c < first_c_ + 3; c++) {
235 float* JXL_RESTRICT rows[5];
236 for (size_t i = 0; i < 5; i++) {
237 rows[i] = GetInputRow(input_rows, c, i - 2);
238 }
239 float* JXL_RESTRICT row_out = GetOutputRow(output_rows, c, 0);
240 for (int64_t x = -RoundUpTo(xextra, Lanes(d));
241 x < (int64_t)(xsize + xextra); x += Lanes(d)) {
242 const auto p00 = Load(d, rows[2] + x);
243 auto others = Zero(d);
244 for (ssize_t i = -2; i <= 2; i++) {
245 others += LoadU(d, rows[0] + x + i);
246 others += LoadU(d, rows[1] + x + i);
247 others += LoadU(d, rows[3] + x + i);
248 others += LoadU(d, rows[4] + x + i);
249 }
250 others += LoadU(d, rows[2] + x - 2);
251 others += LoadU(d, rows[2] + x - 1);
252 others += LoadU(d, rows[2] + x + 1);
253 others += LoadU(d, rows[2] + x + 2);
254 // 4 * (1 - box kernel)
255 auto pixels = MulAdd(others, Set(d, 0.16), p00 * Set(d, -3.84));
256 Store(pixels, d, row_out + x);
257 }
258 }
259 }
260
GetChannelMode(size_t c) const261 RenderPipelineChannelMode GetChannelMode(size_t c) const final {
262 return c >= first_c_ ? RenderPipelineChannelMode::kInOut
263 : RenderPipelineChannelMode::kIgnored;
264 }
265
266 private:
267 size_t first_c_;
268 };
269
GetConvolveNoiseStage(size_t noise_c_start)270 std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
271 size_t noise_c_start) {
272 return jxl::make_unique<ConvolveNoiseStage>(noise_c_start);
273 }
274
275 // NOLINTNEXTLINE(google-readability-namespace-comments)
276 } // namespace HWY_NAMESPACE
277 } // namespace jxl
278 HWY_AFTER_NAMESPACE();
279
280 #if HWY_ONCE
281 namespace jxl {
282
283 HWY_EXPORT(GetAddNoiseStage);
284 HWY_EXPORT(GetConvolveNoiseStage);
285
GetAddNoiseStage(const NoiseParams & noise_params,const ColorCorrelationMap & cmap,size_t noise_c_start)286 std::unique_ptr<RenderPipelineStage> GetAddNoiseStage(
287 const NoiseParams& noise_params, const ColorCorrelationMap& cmap,
288 size_t noise_c_start) {
289 return HWY_DYNAMIC_DISPATCH(GetAddNoiseStage)(noise_params, cmap,
290 noise_c_start);
291 }
292
GetConvolveNoiseStage(size_t noise_c_start)293 std::unique_ptr<RenderPipelineStage> GetConvolveNoiseStage(
294 size_t noise_c_start) {
295 return HWY_DYNAMIC_DISPATCH(GetConvolveNoiseStage)(noise_c_start);
296 }
297
298 } // namespace jxl
299 #endif
300