// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/jxl/gauss_blur.h"

#include <string.h>

#include <algorithm>
#include <cmath>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/jxl/gauss_blur.cc"
#include <hwy/cache_control.h>
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

#include "lib/jxl/base/compiler_specific.h"
#include "lib/jxl/base/profiler.h"
#include "lib/jxl/common.h"
#include "lib/jxl/image_ops.h"
#include "lib/jxl/linalg.h"

HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Broadcast;
#if HWY_TARGET != HWY_SCALAR
using hwy::HWY_NAMESPACE::ShiftLeftLanes;
#endif
using hwy::HWY_NAMESPACE::Vec;

void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                    const float* JXL_RESTRICT in, intptr_t width,
                    float* JXL_RESTRICT out) {
  // Although the current output depends on the previous output, we can unroll
  // up to 4x by precomputing up to fourth powers of the constants. Beyond
  // that, numerical precision might become a problem. Macro because this is
  // tested in #if alongside HWY_TARGET.
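  // The recurrence being unrolled (see the "(35)" references below and in
  // VerticalBlock) is, per frequency component:
  //   y[n] = n2 * (in[n - N - 1] + in[n + N - 1]) - d1 * y[n - 1] - y[n - 2]
  // mul_in/mul_prev/mul_prev2 hold these coefficients expanded for 1..4 steps.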
#define JXL_GAUSS_MAX_LANES 4
  using D = HWY_CAPPED(float, JXL_GAUSS_MAX_LANES);
  using V = Vec<D>;
  const D d;
  const V mul_in_1 = Load(d, rg->mul_in + 0 * 4);
  const V mul_in_3 = Load(d, rg->mul_in + 1 * 4);
  const V mul_in_5 = Load(d, rg->mul_in + 2 * 4);
  const V mul_prev_1 = Load(d, rg->mul_prev + 0 * 4);
  const V mul_prev_3 = Load(d, rg->mul_prev + 1 * 4);
  const V mul_prev_5 = Load(d, rg->mul_prev + 2 * 4);
  const V mul_prev2_1 = Load(d, rg->mul_prev2 + 0 * 4);
  const V mul_prev2_3 = Load(d, rg->mul_prev2 + 1 * 4);
  const V mul_prev2_5 = Load(d, rg->mul_prev2 + 2 * 4);
  V prev_1 = Zero(d);
  V prev_3 = Zero(d);
  V prev_5 = Zero(d);
  V prev2_1 = Zero(d);
  V prev2_3 = Zero(d);
  V prev2_5 = Zero(d);

  const intptr_t N = rg->radius;

  intptr_t n = -N + 1;
  // Left side with bounds checks and only write output after n >= 0.
  const intptr_t first_aligned = RoundUpTo(N + 1, Lanes(d));
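  // (Rounding up to a multiple of Lanes(d) lets the unrolled loop below start
  // at a lane-aligned n, so its Store can use an aligned address.)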
  for (; n < std::min(first_aligned, width); ++n) {
    const intptr_t left = n - N - 1;
    const intptr_t right = n + N - 1;
    const float left_val = left >= 0 ? in[left] : 0.0f;
    const float right_val = right < width ? in[right] : 0.0f;
    const V sum = Set(d, left_val + right_val);

    // (Only processing a single lane here, no need to broadcast)
    V out_1 = sum * mul_in_1;
    V out_3 = sum * mul_in_3;
    V out_5 = sum * mul_in_5;

    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
    prev2_1 = prev_1;
    prev2_3 = prev_3;
    prev2_5 = prev_5;

    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
    prev_1 = out_1;
    prev_3 = out_3;
    prev_5 = out_5;

    if (n >= 0) {
      out[n] = GetLane(out_1 + out_3 + out_5);
    }
  }

  // The above loop is effectively scalar but it is convenient to use the same
  // prev/prev2 variables, so broadcast to each lane before the unrolled loop.
#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES > 1
  prev2_1 = Broadcast<0>(prev2_1);
  prev2_3 = Broadcast<0>(prev2_3);
  prev2_5 = Broadcast<0>(prev2_5);
  prev_1 = Broadcast<0>(prev_1);
  prev_3 = Broadcast<0>(prev_3);
  prev_5 = Broadcast<0>(prev_5);
#endif

  // Unrolled, no bounds checking needed.
  for (; n < width - N + 1 - (JXL_GAUSS_MAX_LANES - 1); n += Lanes(d)) {
    const V sum = LoadU(d, in + n - N - 1) + LoadU(d, in + n + N - 1);

    // To get a vector of output(s), we multiply broadcasted vectors (of each
    // input plus the two previous outputs) and add them all together.
    // Incremental broadcasting and shifting is expected to be cheaper than
    // horizontal adds or transposing 4x4 values because they run on a
    // different port, concurrently with the FMA.
    const V in0 = Broadcast<0>(sum);
    V out_1 = in0 * mul_in_1;
    V out_3 = in0 * mul_in_3;
    V out_5 = in0 * mul_in_5;

#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES >= 2
    const V in1 = Broadcast<1>(sum);
    out_1 = MulAdd(ShiftLeftLanes<1>(mul_in_1), in1, out_1);
    out_3 = MulAdd(ShiftLeftLanes<1>(mul_in_3), in1, out_3);
    out_5 = MulAdd(ShiftLeftLanes<1>(mul_in_5), in1, out_5);

#if JXL_GAUSS_MAX_LANES >= 4
    const V in2 = Broadcast<2>(sum);
    out_1 = MulAdd(ShiftLeftLanes<2>(mul_in_1), in2, out_1);
    out_3 = MulAdd(ShiftLeftLanes<2>(mul_in_3), in2, out_3);
    out_5 = MulAdd(ShiftLeftLanes<2>(mul_in_5), in2, out_5);

    const V in3 = Broadcast<3>(sum);
    out_1 = MulAdd(ShiftLeftLanes<3>(mul_in_1), in3, out_1);
    out_3 = MulAdd(ShiftLeftLanes<3>(mul_in_3), in3, out_3);
    out_5 = MulAdd(ShiftLeftLanes<3>(mul_in_5), in3, out_5);
#endif
#endif
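
    // After the broadcasts above, lane j of out_* holds
    //   sum over k <= j of mul_in[j - k] * sum[k]
    // (ShiftLeftLanes zero-fills, so each lane only sees inputs at or before
    // its own position); the prev/prev2 terms are folded in next.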
    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);

    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
#if HWY_TARGET == HWY_SCALAR || JXL_GAUSS_MAX_LANES == 1
    prev2_1 = prev_1;
    prev2_3 = prev_3;
    prev2_5 = prev_5;
    prev_1 = out_1;
    prev_3 = out_3;
    prev_5 = out_5;
#else
    prev2_1 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_1);
    prev2_3 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_3);
    prev2_5 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_5);
    prev_1 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_1);
    prev_3 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_3);
    prev_5 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_5);
#endif

    Store(out_1 + out_3 + out_5, d, out + n);
  }

  // Remainder handling with bounds checks
  for (; n < width; ++n) {
    const intptr_t left = n - N - 1;
    const intptr_t right = n + N - 1;
    const float left_val = left >= 0 ? in[left] : 0.0f;
    const float right_val = right < width ? in[right] : 0.0f;
    const V sum = Set(d, left_val + right_val);

    // (Only processing a single lane here, no need to broadcast)
    V out_1 = sum * mul_in_1;
    V out_3 = sum * mul_in_3;
    V out_5 = sum * mul_in_5;

    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
    prev2_1 = prev_1;
    prev2_3 = prev_3;
    prev2_5 = prev_5;

    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
    prev_1 = out_1;
    prev_3 = out_3;
    prev_5 = out_5;

    out[n] = GetLane(out_1 + out_3 + out_5);
  }
}

// Ring buffer is for n, n-1, n-2; round up to 4 for faster modulo.
constexpr size_t kMod = 4;
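// (For unsigned ctr, "% kMod" with a power-of-two kMod compiles to a bitwise
// AND, so there is no integer division in the ring-buffer indexing.)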

// Avoids an unnecessary store during warmup.
struct OutputNone {
  template <class V>
  void operator()(const V& /*unused*/, float* JXL_RESTRICT /*pos*/,
                  ptrdiff_t /*offset*/) const {}
};

// Common case: write output vectors in all VerticalBlock except warmup.
struct OutputStore {
  template <class V>
  void operator()(const V& out, float* JXL_RESTRICT pos,
                  ptrdiff_t offset) const {
    // Stream helps for large images but is slower for images that fit in
    // cache.
    Store(out, HWY_FULL(float)(), pos + offset);
  }
};

// At top/bottom borders, we don't have two inputs to load, so avoid addition.
// pos may even point to all zeros if the row is outside the input image.
class SingleInput {
 public:
  explicit SingleInput(const float* pos) : pos_(pos) {}
  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
    return Load(HWY_FULL(float)(), pos_ + offset);
  }
  const float* pos_;
};

// In the middle of the image, we need to load from a row above and below, and
// return the sum.
class TwoInputs {
 public:
  TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {}
  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
    const auto in1 = Load(HWY_FULL(float)(), pos1_ + offset);
    const auto in2 = Load(HWY_FULL(float)(), pos2_ + offset);
    return in1 + in2;
  }

 private:
  const float* pos1_;
  const float* pos2_;
};

// Block := kVectors consecutive full vectors (one cache line except on the
// right boundary, where we can only rely on having one vector). Unrolling to
// the cache line size improves cache utilization.
template <size_t kVectors, class V, class Input, class Output>
void VerticalBlock(const V& d1_1, const V& d1_3, const V& d1_5, const V& n2_1,
                   const V& n2_3, const V& n2_5, const Input& input,
                   size_t& ctr, float* ring_buffer, const Output output,
                   float* JXL_RESTRICT out_pos) {
  const HWY_FULL(float) d;
  constexpr size_t kVN = MaxLanes(d);
  // More cache-friendly to process an entire cache line at a time.
  constexpr size_t kLanes = kVectors * kVN;

  float* JXL_RESTRICT y_1 = ring_buffer + 0 * kLanes * kMod;
  float* JXL_RESTRICT y_3 = ring_buffer + 1 * kLanes * kMod;
  float* JXL_RESTRICT y_5 = ring_buffer + 2 * kLanes * kMod;

  const size_t n_0 = (++ctr) % kMod;
  const size_t n_1 = (ctr - 1) % kMod;
  const size_t n_2 = (ctr - 2) % kMod;

  for (size_t idx_vec = 0; idx_vec < kVectors; ++idx_vec) {
    const V sum = input(idx_vec * kVN);

    const V y_n1_1 = Load(d, y_1 + kLanes * n_1 + idx_vec * kVN);
    const V y_n1_3 = Load(d, y_3 + kLanes * n_1 + idx_vec * kVN);
    const V y_n1_5 = Load(d, y_5 + kLanes * n_1 + idx_vec * kVN);
    const V y_n2_1 = Load(d, y_1 + kLanes * n_2 + idx_vec * kVN);
    const V y_n2_3 = Load(d, y_3 + kLanes * n_2 + idx_vec * kVN);
    const V y_n2_5 = Load(d, y_5 + kLanes * n_2 + idx_vec * kVN);
    // (35): y[n] = n2 * sum - d1 * y[n - 1] - y[n - 2]
    const V y1 = MulAdd(n2_1, sum, NegMulSub(d1_1, y_n1_1, y_n2_1));
    const V y3 = MulAdd(n2_3, sum, NegMulSub(d1_3, y_n1_3, y_n2_3));
    const V y5 = MulAdd(n2_5, sum, NegMulSub(d1_5, y_n1_5, y_n2_5));
    Store(y1, d, y_1 + kLanes * n_0 + idx_vec * kVN);
    Store(y3, d, y_3 + kLanes * n_0 + idx_vec * kVN);
    Store(y5, d, y_5 + kLanes * n_0 + idx_vec * kVN);
    output(y1 + y3 + y5, out_pos, idx_vec * kVN);
  }
  // NOTE: flushing cache line out_pos hurts performance - less so with
  // clflushopt than clflush but still a significant slowdown.
}

// Reads/writes one block (kVectors full vectors) in each row.
template <size_t kVectors>
void VerticalStrip(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                   const ImageF& in, const size_t x, ImageF* JXL_RESTRICT out) {
  // We're iterating vertically, so use multiple full-length vectors (each lane
  // is one column of row n).
  using D = HWY_FULL(float);
  using V = Vec<D>;
  const D d;
  constexpr size_t kVN = MaxLanes(d);
  // More cache-friendly to process an entire cache line at a time.
  constexpr size_t kLanes = kVectors * kVN;
#if HWY_TARGET == HWY_SCALAR
  const V d1_1 = Set(d, rg->d1[0 * 4]);
  const V d1_3 = Set(d, rg->d1[1 * 4]);
  const V d1_5 = Set(d, rg->d1[2 * 4]);
  const V n2_1 = Set(d, rg->n2[0 * 4]);
  const V n2_3 = Set(d, rg->n2[1 * 4]);
  const V n2_5 = Set(d, rg->n2[2 * 4]);
#else
  const V d1_1 = LoadDup128(d, rg->d1 + 0 * 4);
  const V d1_3 = LoadDup128(d, rg->d1 + 1 * 4);
  const V d1_5 = LoadDup128(d, rg->d1 + 2 * 4);
  const V n2_1 = LoadDup128(d, rg->n2 + 0 * 4);
  const V n2_3 = LoadDup128(d, rg->n2 + 1 * 4);
  const V n2_5 = LoadDup128(d, rg->n2 + 2 * 4);
#endif

  const size_t N = rg->radius;
  const size_t ysize = in.ysize();

  size_t ctr = 0;
  HWY_ALIGN float ring_buffer[3 * kLanes * kMod] = {0};
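  // Layout: 3 frequency components x kMod history slots x kLanes columns;
  // VerticalBlock keeps y[n], y[n-1], y[n-2] for each column in here.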
  HWY_ALIGN static constexpr float zero[kLanes] = {0};

  // Warmup: top is out of bounds (zero padded), bottom is usually in-bounds.
  ssize_t n = -static_cast<ssize_t>(N) + 1;
  for (; n < 0; ++n) {
    // bottom is always non-negative since n is initialized to -N + 1.
    const size_t bottom = n + N - 1;
    VerticalBlock<kVectors>(
        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
        ring_buffer, OutputNone(), nullptr);
  }
  JXL_DASSERT(n >= 0);

  // Start producing output; top is still out of bounds.
  for (; static_cast<size_t>(n) < std::min(N + 1, ysize); ++n) {
    const size_t bottom = n + N - 1;
    VerticalBlock<kVectors>(
        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
        ring_buffer, OutputStore(), out->Row(n) + x);
  }

  // Interior outputs with prefetching and without bounds checks.
  constexpr size_t kPrefetchRows = 8;
  for (; n < static_cast<ssize_t>(ysize - N + 1 - kPrefetchRows); ++n) {
    const size_t top = n - N - 1;
    const size_t bottom = n + N - 1;
    VerticalBlock<kVectors>(
        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
        TwoInputs(in.ConstRow(top) + x, in.ConstRow(bottom) + x), ctr,
        ring_buffer, OutputStore(), out->Row(n) + x);
    hwy::Prefetch(in.ConstRow(top + kPrefetchRows) + x);
    hwy::Prefetch(in.ConstRow(bottom + kPrefetchRows) + x);
  }

  // Bottom border without prefetching and with bounds checks.
  for (; static_cast<size_t>(n) < ysize; ++n) {
    const size_t top = n - N - 1;
    const size_t bottom = n + N - 1;
    VerticalBlock<kVectors>(
        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
        TwoInputs(in.ConstRow(top) + x,
                  bottom < ysize ? in.ConstRow(bottom) + x : zero),
        ctr, ring_buffer, OutputStore(), out->Row(n) + x);
  }
}

// Apply 1D vertical scan to multiple columns (one per vector lane).
// Not yet parallelized.
void FastGaussianVertical(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                          const ImageF& in, ThreadPool* /*pool*/,
                          ImageF* JXL_RESTRICT out) {
  PROFILER_FUNC;
  JXL_CHECK(SameSize(in, *out));

  constexpr size_t kCacheLineLanes = 64 / sizeof(float);
  constexpr size_t kVN = MaxLanes(HWY_FULL(float)());
  constexpr size_t kCacheLineVectors = kCacheLineLanes / kVN;
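  // With 64-byte cache lines this is 16 columns per strip; the second loop
  // below handles any remaining columns one vector at a time.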

  size_t x = 0;
  for (; x + kCacheLineLanes <= in.xsize(); x += kCacheLineLanes) {
    VerticalStrip<kCacheLineVectors>(rg, in, x, out);
  }
  for (; x < in.xsize(); x += kVN) {
    VerticalStrip<1>(rg, in, x, out);
  }
}

// TODO(veluca): consider replacing with FastGaussian.
ImageF ConvolveXSampleAndTranspose(const ImageF& in,
                                   const std::vector<float>& kernel,
                                   const size_t res) {
  JXL_ASSERT(kernel.size() % 2 == 1);
  JXL_ASSERT(in.xsize() % res == 0);
  const size_t offset = res / 2;
  const size_t out_xsize = in.xsize() / res;
  ImageF out(in.ysize(), out_xsize);
  const int r = kernel.size() / 2;
  HWY_FULL(float) df;
  std::vector<float> row_tmp(in.xsize() + 2 * r + Lanes(df));
  float* const JXL_RESTRICT rowp = &row_tmp[r];
  std::vector<float> padded_k = kernel;
  padded_k.resize(padded_k.size() + Lanes(df));
  const float* const kernelp = &padded_k[r];
  for (size_t y = 0; y < in.ysize(); ++y) {
    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
    size_t x = offset, ox = 0;
    for (; x < static_cast<uint32_t>(r) && x < in.xsize(); x += res, ++ox) {
      float sum = 0.0f;
      for (int i = -r; i <= r; ++i) {
        sum += rowp[std::max<int>(
                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
               kernelp[i];
      }
      out.Row(ox)[y] = sum;
    }
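    // Main vectorized loop: row_tmp and padded_k both have Lanes(df) trailing
    // zeros, so the last LoadU of each row stays in bounds and the extra
    // kernel lanes contribute nothing to the sum.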
    for (; x + r < in.xsize(); x += res, ++ox) {
      auto sum = Zero(df);
      for (int i = -r; i <= r; i += Lanes(df)) {
        sum = MulAdd(LoadU(df, rowp + x + i), LoadU(df, kernelp + i), sum);
      }
      out.Row(ox)[y] = GetLane(SumOfLanes(df, sum));
    }
    for (; x < in.xsize(); x += res, ++ox) {
      float sum = 0.0f;
      for (int i = -r; i <= r; ++i) {
        sum += rowp[std::max<int>(
                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
               kernelp[i];
      }
      out.Row(ox)[y] = sum;
    }
  }
  return out;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace jxl
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace jxl {

HWY_EXPORT(FastGaussian1D);
HWY_EXPORT(ConvolveXSampleAndTranspose);

void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                    const float* JXL_RESTRICT in, intptr_t width,
                    float* JXL_RESTRICT out) {
  return HWY_DYNAMIC_DISPATCH(FastGaussian1D)(rg, in, width, out);
}

HWY_EXPORT(FastGaussianVertical);  // Local function.

void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
                        float* const JXL_RESTRICT row_out, const int xsize,
                        const int radius) {
  const int lastcol = xsize - 1;
  for (int x = 1; x <= radius; ++x) {
    row_out[-x] = row_in[std::min(x, xsize - 1)];
  }
  memcpy(row_out, row_in, xsize * sizeof(row_out[0]));
  for (int x = 1; x <= radius; ++x) {
    row_out[lastcol + x] = row_in[std::max(0, lastcol - x)];
  }
}

ImageF ConvolveXSampleAndTranspose(const ImageF& in,
                                   const std::vector<float>& kernel,
                                   const size_t res) {
  return HWY_DYNAMIC_DISPATCH(ConvolveXSampleAndTranspose)(in, kernel, res);
}

Image3F ConvolveXSampleAndTranspose(const Image3F& in,
                                    const std::vector<float>& kernel,
                                    const size_t res) {
  return Image3F(ConvolveXSampleAndTranspose(in.Plane(0), kernel, res),
                 ConvolveXSampleAndTranspose(in.Plane(1), kernel, res),
                 ConvolveXSampleAndTranspose(in.Plane(2), kernel, res));
}

ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
                         const size_t res) {
  ImageF tmp = ConvolveXSampleAndTranspose(in, kernel, res);
  return ConvolveXSampleAndTranspose(tmp, kernel, res);
}

// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
// Cosine Functions" by Charalampidis [2016].
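// Outline, following the paper's equation numbers: the Gaussian is
// approximated on a window of radius N by a sum of three cosines with
// frequencies omega_k = k*pi/(2N) for k in {1, 3, 5}; each term is realized
// as a second-order IIR section (coefficients d1, n2 below), and the section
// weights beta come from solving the 3x3 linear system in (53).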
hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
  PROFILER_FUNC;
  auto rg = hwy::MakeUniqueAligned<RecursiveGaussian>();
  constexpr double kPi = 3.141592653589793238;

  const double radius = roundf(3.2795 * sigma + 0.2546);  // (57), "N"

  // Table I, first row
  const double pi_div_2r = kPi / (2.0 * radius);
  const double omega[3] = {pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r};

  // (37), k={1,3,5}
  const double p_1 = +1.0 / std::tan(0.5 * omega[0]);
  const double p_3 = -1.0 / std::tan(0.5 * omega[1]);
  const double p_5 = +1.0 / std::tan(0.5 * omega[2]);

  // (44), k={1,3,5}
  const double r_1 = +p_1 * p_1 / std::sin(omega[0]);
  const double r_3 = -p_3 * p_3 / std::sin(omega[1]);
  const double r_5 = +p_5 * p_5 / std::sin(omega[2]);

  // (50), k={1,3,5}
  const double neg_half_sigma2 = -0.5 * sigma * sigma;
  const double recip_radius = 1.0 / radius;
  double rho[3];
  for (size_t i = 0; i < 3; ++i) {
    rho[i] = std::exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
  }

  // Second part of (52), k1,k2 = 1,3; 3,5; 5,1
  const double D_13 = p_1 * r_3 - r_1 * p_3;
  const double D_35 = p_3 * r_5 - r_3 * p_5;
  const double D_51 = p_5 * r_1 - r_5 * p_1;

  // (52), k=5
  const double recip_d13 = 1.0 / D_13;
  const double zeta_15 = D_35 * recip_d13;
  const double zeta_35 = D_51 * recip_d13;

  double A[9] = {p_1,     p_3,     p_5,  //
                 r_1,     r_3,     r_5,  //  (56)
                 zeta_15, zeta_35, 1};
  JXL_CHECK(Inv3x3Matrix(A));
  const double gamma[3] = {1, radius * radius - sigma * sigma,  // (55)
                           zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]};
  double beta[3];
  MatMul(A, gamma, 3, 3, 1, beta);  // (53)

  // Sanity check: correctly solved for beta (IIR filter weights are
  // normalized).
  const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5;  // (39)
  JXL_ASSERT(std::abs(sum - 1) < 1E-12);
  (void)sum;

  rg->radius = static_cast<int>(radius);

  double n2[3];
  double d1[3];
  for (size_t i = 0; i < 3; ++i) {
    n2[i] = -beta[i] * std::cos(omega[i] * (radius + 1.0));  // (33)
    d1[i] = -2.0 * std::cos(omega[i]);                       // (33)

    for (size_t lane = 0; lane < 4; ++lane) {
      rg->n2[4 * i + lane] = static_cast<float>(n2[i]);
      rg->d1[4 * i + lane] = static_cast<float>(d1[i]);
    }

    const double d_2 = d1[i] * d1[i];

    // Obtained by expanding (35) for four consecutive outputs via sympy:
    // n, d, p, pp = symbols('n d p pp')
    // i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
    // o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
    // o0 = n*i0 - d*p - pp
    // o1 = n*i1 - d*o0 - p
    // o2 = n*i2 - d*o1 - o0
    // o3 = n*i3 - d*o2 - o1
    // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
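    // For example, o1 expands to n*i1 - d*n*i0 + (d*d - 1)*p + d*pp, which
    // matches the lane-1 entries of mul_in, mul_prev and mul_prev2 below.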
    rg->mul_prev[4 * i + 0] = -d1[i];
    rg->mul_prev[4 * i + 1] = d_2 - 1.0;
    rg->mul_prev[4 * i + 2] = -d_2 * d1[i] + 2.0 * d1[i];
    rg->mul_prev[4 * i + 3] = d_2 * d_2 - 3.0 * d_2 + 1.0;
    rg->mul_prev2[4 * i + 0] = -1.0;
    rg->mul_prev2[4 * i + 1] = d1[i];
    rg->mul_prev2[4 * i + 2] = -d_2 + 1.0;
    rg->mul_prev2[4 * i + 3] = d_2 * d1[i] - 2.0 * d1[i];
    rg->mul_in[4 * i + 0] = n2[i];
    rg->mul_in[4 * i + 1] = -d1[i] * n2[i];
    rg->mul_in[4 * i + 2] = d_2 * n2[i] - n2[i];
    rg->mul_in[4 * i + 3] = -d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i];
  }
  return rg;
}

namespace {

// Apply 1D horizontal scan to each row.
void FastGaussianHorizontal(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                            const ImageF& in, ThreadPool* pool,
                            ImageF* JXL_RESTRICT out) {
  PROFILER_FUNC;
  JXL_CHECK(SameSize(in, *out));

  const intptr_t xsize = in.xsize();
  JXL_CHECK(RunOnPool(
      pool, 0, in.ysize(), ThreadPool::NoInit,
      [&](const uint32_t task, size_t /*thread*/) {
        const size_t y = task;
        const float* row_in = in.ConstRow(y);
        float* JXL_RESTRICT row_out = out->Row(y);
        FastGaussian1D(rg, row_in, xsize, row_out);
      },
      "FastGaussianHorizontal"));
}

}  // namespace

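// Typical usage (a sketch; actual call sites live elsewhere in libjxl):
//   const auto rg = CreateRecursiveGaussian(sigma);
//   ImageF temp(in.xsize(), in.ysize());
//   FastGaussian(rg, in, /*pool=*/nullptr, &temp, &out);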
void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
                  ImageF* JXL_RESTRICT out) {
  FastGaussianHorizontal(rg, in, pool, temp);
  HWY_DYNAMIC_DISPATCH(FastGaussianVertical)(rg, *temp, pool, out);
}

}  // namespace jxl
#endif  // HWY_ONCE