1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #include "lib/jxl/dec_reconstruct.h"
7 
8 #include <atomic>
9 #include <utility>
10 
11 #include "lib/jxl/filters.h"
12 #include "lib/jxl/image_ops.h"
13 
14 #undef HWY_TARGET_INCLUDE
15 #define HWY_TARGET_INCLUDE "lib/jxl/dec_reconstruct.cc"
16 #include <hwy/foreach_target.h>
17 #include <hwy/highway.h>
18 
19 #include "lib/jxl/aux_out.h"
20 #include "lib/jxl/base/data_parallel.h"
21 #include "lib/jxl/base/profiler.h"
22 #include "lib/jxl/blending.h"
23 #include "lib/jxl/color_management.h"
24 #include "lib/jxl/common.h"
25 #include "lib/jxl/dec_noise.h"
26 #include "lib/jxl/dec_upsample.h"
27 #include "lib/jxl/dec_xyb-inl.h"
28 #include "lib/jxl/dec_xyb.h"
29 #include "lib/jxl/epf.h"
30 #include "lib/jxl/fast_math-inl.h"
31 #include "lib/jxl/frame_header.h"
32 #include "lib/jxl/loop_filter.h"
33 #include "lib/jxl/passes_state.h"
34 #include "lib/jxl/transfer_functions-inl.h"
35 HWY_BEFORE_NAMESPACE();
36 namespace jxl {
37 namespace HWY_NAMESPACE {
38 
UndoXYBInPlace(Image3F * idct,const Rect & rect,const OutputEncodingInfo & output_encoding_info)39 Status UndoXYBInPlace(Image3F* idct, const Rect& rect,
40                       const OutputEncodingInfo& output_encoding_info) {
41   PROFILER_ZONE("UndoXYB");
42 
43   // The size of `rect` might not be a multiple of Lanes(d), but is guaranteed
44   // to be a multiple of kBlockDim or at the margin of the image.
45   for (size_t y = 0; y < rect.ysize(); y++) {
46     float* JXL_RESTRICT row0 = rect.PlaneRow(idct, 0, y);
47     float* JXL_RESTRICT row1 = rect.PlaneRow(idct, 1, y);
48     float* JXL_RESTRICT row2 = rect.PlaneRow(idct, 2, y);
49 
50     const HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound) d;
51 
52     if (output_encoding_info.color_encoding.tf.IsLinear()) {
53       for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
54         const auto in_opsin_x = Load(d, row0 + x);
55         const auto in_opsin_y = Load(d, row1 + x);
56         const auto in_opsin_b = Load(d, row2 + x);
57         JXL_COMPILER_FENCE;
58         auto linear_r = Undefined(d);
59         auto linear_g = Undefined(d);
60         auto linear_b = Undefined(d);
61         XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
62                  output_encoding_info.opsin_params, &linear_r, &linear_g,
63                  &linear_b);
64 
65         Store(linear_r, d, row0 + x);
66         Store(linear_g, d, row1 + x);
67         Store(linear_b, d, row2 + x);
68       }
69     } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
70       for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
71         const auto in_opsin_x = Load(d, row0 + x);
72         const auto in_opsin_y = Load(d, row1 + x);
73         const auto in_opsin_b = Load(d, row2 + x);
74         JXL_COMPILER_FENCE;
75         auto linear_r = Undefined(d);
76         auto linear_g = Undefined(d);
77         auto linear_b = Undefined(d);
78         XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
79                  output_encoding_info.opsin_params, &linear_r, &linear_g,
80                  &linear_b);
81 
82 #if JXL_HIGH_PRECISION
83         Store(TF_SRGB().EncodedFromDisplay(d, linear_r), d, row0 + x);
84         Store(TF_SRGB().EncodedFromDisplay(d, linear_g), d, row1 + x);
85         Store(TF_SRGB().EncodedFromDisplay(d, linear_b), d, row2 + x);
86 #else
87         Store(FastLinearToSRGB(d, linear_r), d, row0 + x);
88         Store(FastLinearToSRGB(d, linear_g), d, row1 + x);
89         Store(FastLinearToSRGB(d, linear_b), d, row2 + x);
90 #endif
91       }
92     } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
93       for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
94         const auto in_opsin_x = Load(d, row0 + x);
95         const auto in_opsin_y = Load(d, row1 + x);
96         const auto in_opsin_b = Load(d, row2 + x);
97         JXL_COMPILER_FENCE;
98         auto linear_r = Undefined(d);
99         auto linear_g = Undefined(d);
100         auto linear_b = Undefined(d);
101         XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
102                  output_encoding_info.opsin_params, &linear_r, &linear_g,
103                  &linear_b);
104         Store(TF_PQ().EncodedFromDisplay(d, linear_r), d, row0 + x);
105         Store(TF_PQ().EncodedFromDisplay(d, linear_g), d, row1 + x);
106         Store(TF_PQ().EncodedFromDisplay(d, linear_b), d, row2 + x);
107       }
108     } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
109       for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
110         const auto in_opsin_x = Load(d, row0 + x);
111         const auto in_opsin_y = Load(d, row1 + x);
112         const auto in_opsin_b = Load(d, row2 + x);
113         JXL_COMPILER_FENCE;
114         auto linear_r = Undefined(d);
115         auto linear_g = Undefined(d);
116         auto linear_b = Undefined(d);
117         XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
118                  output_encoding_info.opsin_params, &linear_r, &linear_g,
119                  &linear_b);
120         Store(TF_HLG().EncodedFromDisplay(d, linear_r), d, row0 + x);
121         Store(TF_HLG().EncodedFromDisplay(d, linear_g), d, row1 + x);
122         Store(TF_HLG().EncodedFromDisplay(d, linear_b), d, row2 + x);
123       }
124     } else if (output_encoding_info.color_encoding.tf.Is709()) {
125       for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
126         const auto in_opsin_x = Load(d, row0 + x);
127         const auto in_opsin_y = Load(d, row1 + x);
128         const auto in_opsin_b = Load(d, row2 + x);
129         JXL_COMPILER_FENCE;
130         auto linear_r = Undefined(d);
131         auto linear_g = Undefined(d);
132         auto linear_b = Undefined(d);
133         XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
134                  output_encoding_info.opsin_params, &linear_r, &linear_g,
135                  &linear_b);
136         Store(TF_709().EncodedFromDisplay(d, linear_r), d, row0 + x);
137         Store(TF_709().EncodedFromDisplay(d, linear_g), d, row1 + x);
138         Store(TF_709().EncodedFromDisplay(d, linear_b), d, row2 + x);
139       }
140     } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
141                output_encoding_info.color_encoding.tf.IsDCI()) {
142       auto gamma_tf = [&](hwy::HWY_NAMESPACE::Vec<decltype(d)> v) {
143         return IfThenZeroElse(
144             v <= Set(d, 1e-5f),
145             FastPowf(d, v, Set(d, output_encoding_info.inverse_gamma)));
146       };
147       for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
148 #if MEMORY_SANITIZER
149         const auto mask = Iota(d, x) < Set(d, rect.xsize());
150         const auto in_opsin_x = IfThenElseZero(mask, Load(d, row0 + x));
151         const auto in_opsin_y = IfThenElseZero(mask, Load(d, row1 + x));
152         const auto in_opsin_b = IfThenElseZero(mask, Load(d, row2 + x));
153 #else
154         const auto in_opsin_x = Load(d, row0 + x);
155         const auto in_opsin_y = Load(d, row1 + x);
156         const auto in_opsin_b = Load(d, row2 + x);
157 #endif
158         JXL_COMPILER_FENCE;
159         auto linear_r = Undefined(d);
160         auto linear_g = Undefined(d);
161         auto linear_b = Undefined(d);
162         XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
163                  output_encoding_info.opsin_params, &linear_r, &linear_g,
164                  &linear_b);
165         Store(gamma_tf(linear_r), d, row0 + x);
166         Store(gamma_tf(linear_g), d, row1 + x);
167         Store(gamma_tf(linear_b), d, row2 + x);
168       }
169     } else {
170       // This is a programming error.
171       JXL_ABORT("Invalid target encoding");
172     }
173   }
174   return true;
175 }
176 
177 template <typename D, typename V>
StoreRGBA(D d,V r,V g,V b,V a,bool alpha,size_t n,size_t extra,uint8_t * buf)178 void StoreRGBA(D d, V r, V g, V b, V a, bool alpha, size_t n, size_t extra,
179                uint8_t* buf) {
180 #if HWY_TARGET == HWY_SCALAR
181   buf[0] = r.raw;
182   buf[1] = g.raw;
183   buf[2] = b.raw;
184   if (alpha) {
185     buf[3] = a.raw;
186   }
187 #elif HWY_TARGET == HWY_NEON
188   if (alpha) {
189     uint8x8x4_t data = {r.raw, g.raw, b.raw, a.raw};
190     if (extra >= 8) {
191       vst4_u8(buf, data);
192     } else {
193       uint8_t tmp[8 * 4];
194       vst4_u8(tmp, data);
195       memcpy(buf, tmp, n * 4);
196     }
197   } else {
198     uint8x8x3_t data = {r.raw, g.raw, b.raw};
199     if (extra >= 8) {
200       vst3_u8(buf, data);
201     } else {
202       uint8_t tmp[8 * 3];
203       vst3_u8(tmp, data);
204       memcpy(buf, tmp, n * 3);
205     }
206   }
207 #else
208   // TODO(veluca): implement this for x86.
209   size_t mul = alpha ? 4 : 3;
210   HWY_ALIGN uint8_t bytes[16];
211   Store(r, d, bytes);
212   for (size_t i = 0; i < n; i++) {
213     buf[mul * i] = bytes[i];
214   }
215   Store(g, d, bytes);
216   for (size_t i = 0; i < n; i++) {
217     buf[mul * i + 1] = bytes[i];
218   }
219   Store(b, d, bytes);
220   for (size_t i = 0; i < n; i++) {
221     buf[mul * i + 2] = bytes[i];
222   }
223   if (alpha) {
224     Store(a, d, bytes);
225     for (size_t i = 0; i < n; i++) {
226       buf[4 * i + 3] = bytes[i];
227     }
228   }
229 #endif
230 }
231 
232 // Outputs floating point image to RGBA 8-bit buffer. Does not support alpha
233 // channel in the input, but outputs opaque alpha channel for the case where the
234 // output buffer to write to is in the 4-byte per pixel RGBA format.
FloatToRGBA8(const Image3F & input,const Rect & input_rect,bool is_rgba,const ImageF * alpha_in,const Rect & alpha_rect,const Rect & output_buf_rect,uint8_t * JXL_RESTRICT output_buf,size_t stride)235 void FloatToRGBA8(const Image3F& input, const Rect& input_rect, bool is_rgba,
236                   const ImageF* alpha_in, const Rect& alpha_rect,
237                   const Rect& output_buf_rect, uint8_t* JXL_RESTRICT output_buf,
238                   size_t stride) {
239   size_t bytes = is_rgba ? 4 : 3;
240   for (size_t y = 0; y < output_buf_rect.ysize(); y++) {
241     const float* JXL_RESTRICT row_in_r = input_rect.ConstPlaneRow(input, 0, y);
242     const float* JXL_RESTRICT row_in_g = input_rect.ConstPlaneRow(input, 1, y);
243     const float* JXL_RESTRICT row_in_b = input_rect.ConstPlaneRow(input, 2, y);
244     const float* JXL_RESTRICT row_in_a =
245         alpha_in ? alpha_rect.ConstRow(*alpha_in, y) : nullptr;
246     size_t base_ptr =
247         (y + output_buf_rect.y0()) * stride + bytes * output_buf_rect.x0();
248     using D = HWY_CAPPED(float, 4);
249     const D d;
250     D::Rebind<uint32_t> du;
251     auto zero = Zero(d);
252     auto one = Set(d, 1.0f);
253     auto mul = Set(d, 255.0f);
254 #if MEMORY_SANITIZER
255     // Avoid use-of-uninitialized-value for loads past the end of the image's
256     // initialized data.
257     auto safe_load = [&](const float* ptr, size_t x) {
258       uint32_t kMask[8] = {~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0};
259       size_t n = std::min<size_t>(Lanes(d), output_buf_rect.xsize() - x);
260       auto mask = BitCast(d, LoadU(du, kMask + Lanes(d) - n));
261       return Load(d, ptr + x) & mask;
262     };
263 #else
264     auto safe_load = [](const float* ptr, size_t x) {
265       return Load(D(), ptr + x);
266     };
267 #endif
268     for (size_t x = 0; x < output_buf_rect.xsize(); x += Lanes(d)) {
269       auto rf = Clamp(zero, safe_load(row_in_r, x), one) * mul;
270       auto gf = Clamp(zero, safe_load(row_in_g, x), one) * mul;
271       auto bf = Clamp(zero, safe_load(row_in_b, x), one) * mul;
272       auto af = row_in_a ? Clamp(zero, safe_load(row_in_a, x), one) * mul
273                          : Set(d, 255.0f);
274       auto r8 = U8FromU32(BitCast(du, NearestInt(rf)));
275       auto g8 = U8FromU32(BitCast(du, NearestInt(gf)));
276       auto b8 = U8FromU32(BitCast(du, NearestInt(bf)));
277       auto a8 = U8FromU32(BitCast(du, NearestInt(af)));
278       size_t n = output_buf_rect.xsize() - x;
279       if (JXL_LIKELY(n >= Lanes(d))) {
280         StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, Lanes(d), n,
281                   output_buf + base_ptr + bytes * x);
282       } else {
283         StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, n, n,
284                   output_buf + base_ptr + bytes * x);
285       }
286     }
287   }
288 }
289 
290 // NOLINTNEXTLINE(google-readability-namespace-comments)
291 }  // namespace HWY_NAMESPACE
292 }  // namespace jxl
293 HWY_AFTER_NAMESPACE();
294 
295 #if HWY_ONCE
296 namespace jxl {
297 
298 HWY_EXPORT(UndoXYBInPlace);
299 HWY_EXPORT(FloatToRGBA8);
300 
UndoXYB(const Image3F & src,Image3F * dst,const OutputEncodingInfo & output_info,ThreadPool * pool)301 void UndoXYB(const Image3F& src, Image3F* dst,
302              const OutputEncodingInfo& output_info, ThreadPool* pool) {
303   CopyImageTo(src, dst);
304   pool->Run(0, src.ysize(), ThreadPool::SkipInit(), [&](int y, int /*thread*/) {
305     JXL_CHECK(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(dst, Rect(*dst).Line(y),
306                                                    output_info));
307   });
308 }
309 
310 namespace {
ScaleRectForEC(Rect in,const FrameHeader & frame_header,size_t ec)311 Rect ScaleRectForEC(Rect in, const FrameHeader& frame_header, size_t ec) {
312   auto s = [&](size_t x) {
313     return DivCeil(x * frame_header.upsampling,
314                    frame_header.extra_channel_upsampling[ec]);
315   };
316   return Rect(s(in.x0()), s(in.y0()), s(in.xsize()), s(in.ysize()));
317 }
318 
319 // Implements EnsurePaddingInPlace, but allows processing data one row at a
320 // time.
321 class EnsurePaddingInPlaceRowByRow {
Init(const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)322   void Init(const Rect& rect, const Rect& image_rect, size_t image_xsize,
323             size_t image_ysize, size_t xpadding, size_t ypadding, ssize_t* y0,
324             ssize_t* y1) {
325     // coordinates relative to rect.
326     JXL_DASSERT(SameSize(rect, image_rect));
327     *y0 = -std::min(image_rect.y0(), ypadding);
328     *y1 = rect.ysize() + std::min(ypadding, image_ysize - image_rect.ysize() -
329                                                 image_rect.y0());
330     if (image_rect.x0() >= xpadding &&
331         image_rect.x0() + image_rect.xsize() + xpadding <= image_xsize) {
332       // Nothing to do.
333       strategy_ = kSkip;
334     } else if (image_xsize >= 2 * xpadding) {
335       strategy_ = kFast;
336     } else {
337       strategy_ = kSlow;
338     }
339     y0_ = rect.y0();
340     JXL_DASSERT(rect.x0() >= xpadding);
341     x0_ = x1_ = rect.x0() - xpadding;
342     // If close to the left border - do mirroring.
343     if (image_rect.x0() < xpadding) x1_ = rect.x0() - image_rect.x0();
344     x2_ = x3_ = rect.x0() + rect.xsize() + xpadding;
345     // If close to the right border - do mirroring.
346     if (image_rect.x0() + image_rect.xsize() + xpadding > image_xsize) {
347       x2_ = rect.x0() + image_xsize - image_rect.x0();
348     }
349     JXL_DASSERT(image_xsize == (x2_ - x1_) ||
350                 (x1_ - x0_ <= x2_ - x1_ && x3_ - x2_ <= x2_ - x1_));
351   }
352 
353  public:
Init(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)354   void Init(Image3F* img, const Rect& rect, const Rect& image_rect,
355             size_t image_xsize, size_t image_ysize, size_t xpadding,
356             size_t ypadding, ssize_t* y0, ssize_t* y1) {
357     Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
358          y1);
359     img3_ = img;
360     JXL_DASSERT(x3_ <= img->xsize());
361   }
Init(ImageF * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)362   void Init(ImageF* img, const Rect& rect, const Rect& image_rect,
363             size_t image_xsize, size_t image_ysize, size_t xpadding,
364             size_t ypadding, ssize_t* y0, ssize_t* y1) {
365     Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
366          y1);
367     img_ = img;
368     JXL_DASSERT(x3_ <= img->xsize());
369   }
370   // To be called when row `y` of the input is available, for all the values in
371   // [*y0, *y1).
Process3(ssize_t y)372   void Process3(ssize_t y) {
373     JXL_DASSERT(img3_);
374     for (size_t c = 0; c < 3; c++) {
375       img_ = &img3_->Plane(c);
376       Process(y);
377     }
378   }
Process(ssize_t y)379   void Process(ssize_t y) {
380     JXL_DASSERT(img_);
381     switch (strategy_) {
382       case kSkip:
383         break;
384       case kFast: {
385         // Image is wide enough that a single Mirror() step is sufficient.
386         float* JXL_RESTRICT row = img_->Row(y + y0_);
387         for (size_t x = x0_; x < x1_; x++) {
388           row[x] = row[2 * x1_ - x - 1];
389         }
390         for (size_t x = x2_; x < x3_; x++) {
391           row[x] = row[2 * x2_ - x - 1];
392         }
393         break;
394       }
395       case kSlow: {
396         // Slow case for small images.
397         float* JXL_RESTRICT row = img_->Row(y + y0_) + x1_;
398         for (ssize_t x = x0_ - x1_; x < 0; x++) {
399           *(row + x) = row[Mirror(x, x2_ - x1_)];
400         }
401         for (size_t x = x2_ - x1_; x < x3_ - x1_; x++) {
402           *(row + x) = row[Mirror(x, x2_ - x1_)];
403         }
404         break;
405       }
406     }
407   }
408 
409  private:
410   // Initialized to silence spurious compiler warnings.
411   Image3F* img3_ = nullptr;
412   ImageF* img_ = nullptr;
413   // Will fill [x0_, x1_) and [x2_, x3_) on every row.
414   // The [x1_, x2_) range contains valid image pixels. We guarantee that either
415   // x1_ - x0_ <= x2_ - x1_, (and similarly for x2_, x3_), or that the [x1_,
416   // x2_) contains a full horizontal line of the original image.
417   size_t x0_ = 0, x1_ = 0, x2_ = 0, x3_ = 0;
418   size_t y0_ = 0;
419   // kSlow: use calls to Mirror(), for the case where the border might be larger
420   // than the image.
421   // kFast: directly use the result of Mirror() when it can be computed in a
422   // single iteration.
423   // kSkip: do nothing.
424   enum Strategy { kFast, kSlow, kSkip };
425   Strategy strategy_ = kSkip;
426 };
427 }  // namespace
428 
EnsurePaddingInPlace(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)429 void EnsurePaddingInPlace(Image3F* img, const Rect& rect,
430                           const Rect& image_rect, size_t image_xsize,
431                           size_t image_ysize, size_t xpadding,
432                           size_t ypadding) {
433   ssize_t y0, y1;
434   EnsurePaddingInPlaceRowByRow impl;
435   impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
436             &y0, &y1);
437   for (ssize_t y = y0; y < y1; y++) {
438     impl.Process3(y);
439   }
440 }
441 
FinalizeImageRect(Image3F * input_image,const Rect & input_rect,const std::vector<std::pair<ImageF *,Rect>> & extra_channels,PassesDecoderState * dec_state,size_t thread,ImageBundle * JXL_RESTRICT output_image,const Rect & frame_rect)442 Status FinalizeImageRect(
443     Image3F* input_image, const Rect& input_rect,
444     const std::vector<std::pair<ImageF*, Rect>>& extra_channels,
445     PassesDecoderState* dec_state, size_t thread,
446     ImageBundle* JXL_RESTRICT output_image, const Rect& frame_rect) {
447   const ImageFeatures& image_features = dec_state->shared->image_features;
448   const FrameHeader& frame_header = dec_state->shared->frame_header;
449   const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
450   const LoopFilter& lf = frame_header.loop_filter;
451   const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
452   JXL_DASSERT(frame_rect.xsize() <= kApplyImageFeaturesTileDim);
453   JXL_DASSERT(frame_rect.ysize() <= kApplyImageFeaturesTileDim);
454   JXL_DASSERT(input_rect.xsize() == frame_rect.xsize());
455   JXL_DASSERT(input_rect.ysize() == frame_rect.ysize());
456   JXL_DASSERT(frame_rect.x0() % GroupBorderAssigner::kPaddingXRound == 0);
457   JXL_DASSERT(frame_rect.xsize() % GroupBorderAssigner::kPaddingXRound == 0 ||
458               frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize ||
459               frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize_padded);
460 
461   // +----------------------------- STEP 1 ------------------------------+
462   // | Compute the rects on which patches and splines will be applied.   |
463   // | In case we are applying upsampling, we need to apply patches on a |
464   // | slightly larger image.                                            |
465   // +-------------------------------------------------------------------+
466 
467   // If we are applying upsampling, we need 2 more pixels around the actual rect
468   // for border. Thus, we also need to apply patches and splines to those
469   // pixels. We compute here
470   // - The portion of image that corresponds to the area we are applying IF.
471   //   (rect_for_if)
472   // - The rect where that pixel data is stored in upsampling_input_storage.
473   //   (rect_for_if_storage)
474   // - The rect where the pixel data that we need to upsample is stored.
475   //   (rect_for_upsampling)
476   // - The source rect for the pixel data in `input_image`. It is assumed that,
477   //   if `frame_rect` is not on an image border, `input_image:input_rect` has
478   //   enough border available. (rect_for_if_input)
479 
480   Image3F* output_color =
481       dec_state->rgb_output == nullptr && dec_state->pixel_callback == nullptr
482           ? output_image->color()
483           : nullptr;
484 
485   Image3F* storage_for_if = output_color;
486   Rect rect_for_if = frame_rect;
487   Rect rect_for_if_storage = frame_rect;
488   Rect rect_for_upsampling = frame_rect;
489   Rect rect_for_if_input = input_rect;
490   size_t extra_rows_t = 0;
491   size_t extra_rows_b = 0;
492   if (frame_header.upsampling != 1) {
493     size_t ifbx0 = 0;
494     size_t ifbx1 = 0;
495     size_t ifby0 = 0;
496     size_t ifby1 = 0;
497     if (frame_rect.x0() >= 2) {
498       JXL_DASSERT(input_rect.x0() >= 2);
499       ifbx0 = 2;
500     }
501     if (frame_rect.y0() >= 2) {
502       JXL_DASSERT(input_rect.y0() >= 2);
503       extra_rows_t = ifby0 = 2;
504     }
505     for (size_t extra : {1, 2}) {
506       if (frame_rect.x0() + frame_rect.xsize() + extra <=
507           dec_state->shared->frame_dim.xsize_padded) {
508         JXL_DASSERT(input_rect.x0() + input_rect.xsize() + extra <=
509                     input_image->xsize());
510         ifbx1 = extra;
511       }
512       if (frame_rect.y0() + frame_rect.ysize() + extra <=
513           dec_state->shared->frame_dim.ysize_padded) {
514         JXL_DASSERT(input_rect.y0() + input_rect.ysize() + extra <=
515                     input_image->ysize());
516         extra_rows_b = ifby1 = extra;
517       }
518     }
519     rect_for_if = Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
520                        frame_rect.xsize() + ifbx0 + ifbx1,
521                        frame_rect.ysize() + ifby0 + ifby1);
522     // Storage for pixel data does not necessarily start at (0, 0) as we need to
523     // have the left border of upsampling_rect aligned to a multiple of
524     // GroupBorderAssigner::kPaddingXRound.
525     rect_for_if_storage =
526         Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound) -
527                  ifbx0,
528              kBlockDim, rect_for_if.xsize(), rect_for_if.ysize());
529     rect_for_upsampling =
530         Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound),
531              kBlockDim + ifby0, frame_rect.xsize(), frame_rect.ysize());
532     rect_for_if_input =
533         Rect(input_rect.x0() - ifbx0, input_rect.y0() - ifby0,
534              rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
535     storage_for_if = &dec_state->upsampling_input_storage[thread];
536   }
537 
538   // Variables for upsampling and filtering.
539   Rect upsampled_frame_rect(frame_rect.x0() * frame_header.upsampling,
540                             frame_rect.y0() * frame_header.upsampling,
541                             frame_rect.xsize() * frame_header.upsampling,
542                             frame_rect.ysize() * frame_header.upsampling);
543   Rect full_frame_rect(0, 0, frame_dim.xsize_upsampled,
544                        frame_dim.ysize_upsampled);
545   upsampled_frame_rect = upsampled_frame_rect.Crop(full_frame_rect);
546   EnsurePaddingInPlaceRowByRow ensure_padding_upsampling;
547   ssize_t ensure_padding_upsampling_y0 = 0;
548   ssize_t ensure_padding_upsampling_y1 = 0;
549 
550   EnsurePaddingInPlaceRowByRow ensure_padding_filter;
551   FilterPipeline* fp = nullptr;
552   ssize_t ensure_padding_filter_y0 = 0;
553   ssize_t ensure_padding_filter_y1 = 0;
554   Rect image_padded_rect;
555   if (lf.epf_iters != 0 || lf.gab) {
556     fp = &dec_state->filter_pipelines[thread];
557     size_t xextra =
558         rect_for_if_input.x0() % GroupBorderAssigner::kPaddingXRound;
559     image_padded_rect = Rect(rect_for_if.x0() - xextra, rect_for_if.y0(),
560                              rect_for_if.xsize() + xextra, rect_for_if.ysize());
561   }
562 
563   // +----------------------------- STEP 2 ------------------------------+
564   // | Change rects and buffer to not use `output_image` if direct       |
565   // | output to rgb8 is requested.                                      |
566   // +-------------------------------------------------------------------+
567   Image3F* output_pixel_data_storage = output_color;
568   Rect upsampled_frame_rect_for_storage = upsampled_frame_rect;
569   if (dec_state->rgb_output || dec_state->pixel_callback) {
570     size_t log2_upsampling = CeilLog2Nonzero(frame_header.upsampling);
571     if (storage_for_if == output_color) {
572       storage_for_if =
573           &dec_state->output_pixel_data_storage[log2_upsampling][thread];
574       rect_for_if_storage =
575           Rect(0, 0, rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
576     }
577     output_pixel_data_storage =
578         &dec_state->output_pixel_data_storage[log2_upsampling][thread];
579     upsampled_frame_rect_for_storage =
580         Rect(0, 0, upsampled_frame_rect.xsize(), upsampled_frame_rect.ysize());
581     if (frame_header.upsampling == 1 && fp == nullptr) {
582       upsampled_frame_rect_for_storage = rect_for_if_storage =
583           rect_for_if_input;
584       output_pixel_data_storage = storage_for_if = input_image;
585     }
586   }
587   // Set up alpha channel.
588   const size_t ec =
589       metadata.Find(ExtraChannel::kAlpha) - metadata.extra_channel_info.data();
590   const ImageF* alpha = nullptr;
591   Rect alpha_rect = upsampled_frame_rect;
592   if (ec < metadata.extra_channel_info.size()) {
593     JXL_ASSERT(ec < extra_channels.size());
594     if (frame_header.extra_channel_upsampling[ec] == 1) {
595       alpha = extra_channels[ec].first;
596       alpha_rect = extra_channels[ec].second;
597     } else {
598       alpha = &output_image->extra_channels()[ec];
599       alpha_rect = upsampled_frame_rect;
600     }
601   }
602 
603   // +----------------------------- STEP 3 ------------------------------+
604   // | Set up upsampling and upsample extra channels.                    |
605   // +-------------------------------------------------------------------+
606   Upsampler* color_upsampler = nullptr;
607   if (frame_header.upsampling != 1) {
608     color_upsampler =
609         &dec_state->upsamplers[CeilLog2Nonzero(frame_header.upsampling) - 1];
610     ensure_padding_upsampling.Init(
611         storage_for_if, rect_for_upsampling, frame_rect, frame_dim.xsize_padded,
612         frame_dim.ysize_padded, 2, 2, &ensure_padding_upsampling_y0,
613         &ensure_padding_upsampling_y1);
614   }
615 
616   std::vector<std::pair<ImageF*, Rect>> extra_channels_for_patches;
617   std::vector<EnsurePaddingInPlaceRowByRow> ec_padding;
618 
619   bool late_ec_upsample = frame_header.upsampling != 1;
620   for (auto ecups : frame_header.extra_channel_upsampling) {
621     if (ecups != frame_header.upsampling) {
622       // If patches are applied, either frame_header.upsampling == 1 or
623       // late_ec_upsample is true.
624       late_ec_upsample = false;
625     }
626   }
627 
628   ssize_t ensure_padding_upsampling_ec_y0 = 0;
629   ssize_t ensure_padding_upsampling_ec_y1 = 0;
630 
631   // TODO(veluca) do not upsample extra channels to a full-image-sized buffer if
632   // we are not outputting to an ImageBundle.
633   if (!late_ec_upsample) {
634     // Upsample extra channels first if not all channels have the same
635     // upsampling factor.
636     for (size_t ec = 0; ec < extra_channels.size(); ec++) {
637       size_t ecups = frame_header.extra_channel_upsampling[ec];
638       if (ecups == 1) {
639         extra_channels_for_patches.push_back(extra_channels[ec]);
640         continue;
641       }
642       ssize_t ensure_padding_y0, ensure_padding_y1;
643       EnsurePaddingInPlaceRowByRow ensure_padding;
644       Rect ec_image_rect = ScaleRectForEC(frame_rect, frame_header, ec);
645       size_t ecxs = DivCeil(frame_dim.xsize_upsampled,
646                             frame_header.extra_channel_upsampling[ec]);
647       size_t ecys = DivCeil(frame_dim.ysize_upsampled,
648                             frame_header.extra_channel_upsampling[ec]);
649       ensure_padding.Init(extra_channels[ec].first, extra_channels[ec].second,
650                           ec_image_rect, ecxs, ecys, 2, 2, &ensure_padding_y0,
651                           &ensure_padding_y1);
652       for (ssize_t y = ensure_padding_y0; y < ensure_padding_y1; y++) {
653         ensure_padding.Process(y);
654       }
655       Upsampler& upsampler =
656           dec_state->upsamplers[CeilLog2Nonzero(
657                                     frame_header.extra_channel_upsampling[ec]) -
658                                 1];
659       upsampler.UpsampleRect(
660           *extra_channels[ec].first, extra_channels[ec].second,
661           &output_image->extra_channels()[ec], upsampled_frame_rect,
662           static_cast<ssize_t>(ec_image_rect.y0()) -
663               static_cast<ssize_t>(extra_channels[ec].second.y0()),
664           ecys);
665       extra_channels_for_patches.emplace_back(
666           &output_image->extra_channels()[ec], upsampled_frame_rect);
667     }
668   } else {
669     // Upsample extra channels last if color channels are upsampled and all the
670     // extra channels have the same upsampling as them.
671     ec_padding.resize(extra_channels.size());
672     for (size_t ec = 0; ec < extra_channels.size(); ec++) {
673       // Add a border to the extra channel rect for when patches are applied.
674       // This ensures that the correct row is accessed (y values for patches are
675       // relative to rect_for_if, not to input_rect).
676       // As the rect is extended by 0 or 2 pixels, and the patches input has,
677       // accordingly, the same padding, this is safe.
678       Rect r(extra_channels[ec].second.x0() + rect_for_upsampling.x0() -
679                  rect_for_if_storage.x0(),
680              extra_channels[ec].second.y0() + rect_for_upsampling.y0() -
681                  rect_for_if_storage.y0(),
682              extra_channels[ec].second.xsize() + rect_for_if_storage.xsize() -
683                  rect_for_upsampling.xsize(),
684              extra_channels[ec].second.ysize() + rect_for_if_storage.ysize() -
685                  rect_for_upsampling.ysize());
686       extra_channels_for_patches.emplace_back(extra_channels[ec].first, r);
687       ec_padding[ec].Init(extra_channels[ec].first, extra_channels[ec].second,
688                           frame_rect, frame_dim.xsize, frame_dim.ysize, 2, 2,
689                           &ensure_padding_upsampling_ec_y0,
690                           &ensure_padding_upsampling_ec_y1);
691     }
692   }
693 
694   // Initialized to a valid non-null ptr to avoid UB if arithmetic is done with
695   // the pointer value (which would then not be used).
696   std::vector<float*> ec_ptrs_for_patches(extra_channels.size(),
697                                           input_image->PlaneRow(0, 0));
698 
699   // +----------------------------- STEP 4 ------------------------------+
700   // | Set up the filter pipeline.                                       |
701   // +-------------------------------------------------------------------+
702   if (fp) {
703     // If `rect_for_if_input` does not start at a multiple of
704     // GroupBorderAssigner::kPaddingXRound, we extend the rect we run EPF on by
705     // one full padding length to ensure sigma is handled correctly. We also
706     // extend the output and image rects accordingly. To do this, we need 2x the
707     // border.
708     size_t xextra =
709         rect_for_if_input.x0() % GroupBorderAssigner::kPaddingXRound;
710     Rect filter_input_padded_rect(
711         rect_for_if_input.x0() - xextra, rect_for_if_input.y0(),
712         rect_for_if_input.xsize() + xextra, rect_for_if_input.ysize());
713     ensure_padding_filter.Init(
714         input_image, rect_for_if_input, rect_for_if, frame_dim.xsize_padded,
715         frame_dim.ysize_padded, lf.Padding(), lf.Padding(),
716         &ensure_padding_filter_y0, &ensure_padding_filter_y1);
717     Rect filter_output_padded_rect(
718         rect_for_if_storage.x0() - xextra, rect_for_if_storage.y0(),
719         rect_for_if_storage.xsize() + xextra, rect_for_if_storage.ysize());
720     fp = PrepareFilterPipeline(dec_state, image_padded_rect, *input_image,
721                                filter_input_padded_rect, frame_dim.ysize_padded,
722                                thread, storage_for_if,
723                                filter_output_padded_rect);
724   }
725 
726   // +----------------------------- STEP 5 ------------------------------+
727   // | Run the prepared pipeline of operations.                          |
728   // +-------------------------------------------------------------------+
729 
730   // y values are relative to rect_for_if.
731   // Automatic mirroring in fp->ApplyFiltersRow() implies that we should ensure
732   // that padding for the first lines of the image is already present before
733   // calling ApplyFiltersRow() with "virtual" rows.
734   // Here we rely on the fact that virtual rows at the beginning of the image
735   // are only present if input_rect.y0() == 0.
736   ssize_t first_ensure_padding_y = ensure_padding_filter_y0;
737   if (frame_rect.y0() == 0) {
738     JXL_DASSERT(ensure_padding_filter_y0 == 0);
739     first_ensure_padding_y =
740         std::min<ssize_t>(lf.Padding(), ensure_padding_filter_y1);
741     for (ssize_t y = 0; y < first_ensure_padding_y; y++) {
742       ensure_padding_filter.Process3(y);
743     }
744   }
745 
746   for (ssize_t y = -lf.Padding();
747        y < static_cast<ssize_t>(lf.Padding() + rect_for_if.ysize()); y++) {
748     if (fp) {
749       if (y >= first_ensure_padding_y && y < ensure_padding_filter_y1) {
750         ensure_padding_filter.Process3(y);
751       }
752       fp->ApplyFiltersRow(lf, dec_state->filter_weights, image_padded_rect, y);
753     } else if (output_pixel_data_storage != input_image) {
754       for (size_t c = 0; c < 3; c++) {
755         memcpy(rect_for_if_storage.PlaneRow(storage_for_if, c, y),
756                rect_for_if_input.ConstPlaneRow(*input_image, c, y),
757                rect_for_if_input.xsize() * sizeof(float));
758       }
759     }
760     if (y < static_cast<ssize_t>(lf.Padding())) continue;
761     // At this point, row `y - lf.Padding()` of `rect_for_if` has been produced
762     // by the filters.
763     ssize_t available_y = y - lf.Padding();
764     if (frame_header.upsampling == 1) {
765       for (size_t i = 0; i < extra_channels.size(); i++) {
766         ec_ptrs_for_patches[i] = extra_channels_for_patches[i].second.Row(
767             extra_channels_for_patches[i].first, available_y);
768       }
769     }
770     JXL_RETURN_IF_ERROR(image_features.patches.AddTo(
771         storage_for_if, rect_for_if_storage.Line(available_y),
772         ec_ptrs_for_patches.data(), rect_for_if.Line(available_y)));
773     JXL_RETURN_IF_ERROR(image_features.splines.AddTo(
774         storage_for_if, rect_for_if_storage.Line(available_y),
775         rect_for_if.Line(available_y), dec_state->shared->cmap));
776     size_t num_ys = 1;
777     if (frame_header.upsampling != 1) {
778       // Upsampling `y` values are relative to `rect_for_upsampling`, not to
779       // `rect_for_if`.
780       ssize_t shifted_y = available_y - extra_rows_t;
781       if (shifted_y >= ensure_padding_upsampling_y0 &&
782           shifted_y < ensure_padding_upsampling_y1) {
783         ensure_padding_upsampling.Process3(shifted_y);
784       }
785       if (late_ec_upsample && shifted_y >= ensure_padding_upsampling_ec_y0 &&
786           shifted_y < ensure_padding_upsampling_ec_y1) {
787         for (size_t ec = 0; ec < extra_channels.size(); ec++) {
788           ec_padding[ec].Process(shifted_y);
789         }
790       }
791       // Upsampling will access two rows of border, so the first upsampling
792       // output will be available after shifted_y is at least 2, *unless* image
793       // height is <= 2.
794       if (shifted_y < 2 &&
795           shifted_y + 1 != static_cast<ssize_t>(frame_dim.ysize_padded)) {
796         continue;
797       }
798       // Value relative to upsampled_frame_rect.
799       size_t input_y = std::max<ssize_t>(shifted_y - 2, 0);
800       size_t upsampled_available_y = frame_header.upsampling * input_y;
801       size_t num_input_rows = 1;
802       // If we are going to mirror the last output rows, then we already have 3
803       // input lines ready. This happens iff we did not extend rect_for_if on
804       // the bottom *and* we are at the last `y` value.
805       if (extra_rows_b != 2 &&
806           static_cast<size_t>(y) + 1 == lf.Padding() + rect_for_if.ysize()) {
807         num_input_rows = 3;
808       }
809       num_input_rows = std::min(num_input_rows, frame_dim.ysize_padded);
810       num_ys = num_input_rows * frame_header.upsampling;
811 
812       if (static_cast<size_t>(upsampled_available_y) >=
813           upsampled_frame_rect.ysize()) {
814         continue;
815       }
816 
817       if (upsampled_available_y + num_ys >= upsampled_frame_rect.ysize()) {
818         num_ys = upsampled_frame_rect.ysize() - upsampled_available_y;
819       }
820 
821       Rect upsample_input_rect = rect_for_upsampling.Lines(input_y, 1);
822       color_upsampler->UpsampleRect(
823           *storage_for_if, upsample_input_rect, output_pixel_data_storage,
824           upsampled_frame_rect_for_storage.Lines(upsampled_available_y, num_ys),
825           static_cast<ssize_t>(frame_rect.y0()) -
826               static_cast<ssize_t>(rect_for_upsampling.y0()),
827           frame_dim.ysize_padded);
828       if (late_ec_upsample) {
829         for (size_t ec = 0; ec < extra_channels.size(); ec++) {
830           color_upsampler->UpsampleRect(
831               *extra_channels[ec].first,
832               extra_channels[ec].second.Lines(input_y, num_input_rows),
833               &output_image->extra_channels()[ec],
834               upsampled_frame_rect.Lines(upsampled_available_y, num_ys),
835               static_cast<ssize_t>(frame_rect.y0()) -
836                   static_cast<ssize_t>(extra_channels[ec].second.y0()),
837               frame_dim.ysize);
838         }
839       }
840       available_y = upsampled_available_y;
841     }
842 
843     if (static_cast<size_t>(available_y) >= upsampled_frame_rect.ysize()) {
844       continue;
845     }
846 
847     // The image data is now unconditionally in
848     // `output_image_storage:upsampled_frame_rect_for_storage`.
849     if (frame_header.flags & FrameHeader::kNoise) {
850       PROFILER_ZONE("AddNoise");
851       AddNoise(image_features.noise_params,
852                upsampled_frame_rect.Lines(available_y, num_ys),
853                dec_state->noise,
854                upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
855                dec_state->shared_storage.cmap, output_pixel_data_storage);
856     }
857 
858     if (dec_state->pre_color_transform_frame.xsize() != 0) {
859       for (size_t c = 0; c < 3; c++) {
860         for (size_t y = available_y; y < available_y + num_ys; y++) {
861           float* JXL_RESTRICT row_out = upsampled_frame_rect.PlaneRow(
862               &dec_state->pre_color_transform_frame, c, y);
863           const float* JXL_RESTRICT row_in =
864               upsampled_frame_rect_for_storage.ConstPlaneRow(
865                   *output_pixel_data_storage, c, y);
866           memcpy(row_out, row_in,
867                  upsampled_frame_rect.xsize() * sizeof(*row_in));
868         }
869       }
870     }
871 
872     // We skip the color transform entirely if save_before_color_transform and
873     // the frame is not supposed to be displayed.
874 
875     if (dec_state->fast_xyb_srgb8_conversion) {
876       FastXYBTosRGB8(
877           *output_pixel_data_storage,
878           upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
879           upsampled_frame_rect.Lines(available_y, num_ys)
880               .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)),
881           alpha, alpha_rect.Lines(available_y, num_ys),
882           dec_state->rgb_output_is_rgba, dec_state->rgb_output, frame_dim.xsize,
883           dec_state->rgb_stride);
884     } else {
885       if (frame_header.needs_color_transform()) {
886         if (frame_header.color_transform == ColorTransform::kXYB) {
887           JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(
888               output_pixel_data_storage,
889               upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
890               dec_state->output_encoding_info));
891         } else if (frame_header.color_transform == ColorTransform::kYCbCr) {
892           YcbcrToRgb(
893               *output_pixel_data_storage, output_pixel_data_storage,
894               upsampled_frame_rect_for_storage.Lines(available_y, num_ys));
895         }
896       }
897 
898       // TODO(veluca): all blending should happen here.
899 
900       if (dec_state->rgb_output != nullptr) {
901         HWY_DYNAMIC_DISPATCH(FloatToRGBA8)
902         (*output_pixel_data_storage,
903          upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
904          dec_state->rgb_output_is_rgba, alpha,
905          alpha_rect.Lines(available_y, num_ys),
906          upsampled_frame_rect.Lines(available_y, num_ys)
907              .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)),
908          dec_state->rgb_output, dec_state->rgb_stride);
909       }
910       if (dec_state->pixel_callback != nullptr) {
911         Rect alpha_line_rect = alpha_rect.Lines(available_y, num_ys);
912         Rect color_input_line_rect =
913             upsampled_frame_rect_for_storage.Lines(available_y, num_ys);
914         Rect image_line_rect =
915             upsampled_frame_rect.Lines(available_y, num_ys)
916                 .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize));
917         const float* line_buffers[4];
918         for (size_t iy = 0; iy < image_line_rect.ysize(); iy++) {
919           for (size_t c = 0; c < 3; c++) {
920             line_buffers[c] = color_input_line_rect.ConstPlaneRow(
921                 *output_pixel_data_storage, c, iy);
922           }
923           if (alpha) {
924             line_buffers[3] = alpha_line_rect.ConstRow(*alpha, iy);
925           } else {
926             line_buffers[3] = dec_state->opaque_alpha.data();
927           }
928           std::vector<float>& interleaved =
929               dec_state->pixel_callback_rows[thread];
930           size_t j = 0;
931           for (size_t i = 0; i < image_line_rect.xsize(); i++) {
932             interleaved[j++] = line_buffers[0][i];
933             interleaved[j++] = line_buffers[1][i];
934             interleaved[j++] = line_buffers[2][i];
935             if (dec_state->rgb_output_is_rgba) {
936               interleaved[j++] = line_buffers[3][i];
937             }
938           }
939           dec_state->pixel_callback(interleaved.data(), image_line_rect.x0(),
940                                     image_line_rect.y0() + iy,
941                                     image_line_rect.xsize());
942         }
943       }
944     }
945   }
946 
947   return true;
948 }
949 
FinalizeFrameDecoding(ImageBundle * decoded,PassesDecoderState * dec_state,ThreadPool * pool,bool force_fir,bool skip_blending)950 Status FinalizeFrameDecoding(ImageBundle* decoded,
951                              PassesDecoderState* dec_state, ThreadPool* pool,
952                              bool force_fir, bool skip_blending) {
953   const LoopFilter& lf = dec_state->shared->frame_header.loop_filter;
954   const FrameHeader& frame_header = dec_state->shared->frame_header;
955   const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
956 
957   const Image3F* finalize_image_rect_input = &dec_state->decoded;
958   Image3F chroma_upsampled_image;
959   // If we used chroma subsampling, we upsample chroma now and run
960   // ApplyImageFeatures after.
961   // TODO(veluca): this should part of the FinalizeImageRect() pipeline.
962   if (!frame_header.chroma_subsampling.Is444()) {
963     chroma_upsampled_image = CopyImage(dec_state->decoded);
964     finalize_image_rect_input = &chroma_upsampled_image;
965     for (size_t c = 0; c < 3; c++) {
966       ImageF& plane = const_cast<ImageF&>(chroma_upsampled_image.Plane(c));
967       plane.ShrinkTo(DivCeil(frame_dim.xsize_padded,
968                              1 << frame_header.chroma_subsampling.HShift(c)),
969                      DivCeil(frame_dim.ysize_padded,
970                              1 << frame_header.chroma_subsampling.VShift(c)));
971       for (size_t i = 0; i < frame_header.chroma_subsampling.HShift(c); i++) {
972         plane.InitializePaddingForUnalignedAccesses();
973         const size_t output_xsize =
974             DivCeil(frame_dim.xsize_padded,
975                     1 << (frame_header.chroma_subsampling.HShift(c) - i - 1));
976         plane = UpsampleH2(plane, output_xsize, pool);
977       }
978       for (size_t i = 0; i < frame_header.chroma_subsampling.VShift(c); i++) {
979         plane.InitializePaddingForUnalignedAccesses();
980         plane = UpsampleV2(plane, pool);
981       }
982       plane.ShrinkTo(frame_dim.xsize_padded, frame_dim.ysize_padded);
983       JXL_DASSERT(plane.PixelsPerRow() == dec_state->decoded.PixelsPerRow());
984     }
985   }
986   // FinalizeImageRect was not yet run, or we are forcing a run.
987   if (!dec_state->EagerFinalizeImageRect() || force_fir) {
988     if (lf.epf_iters > 0 && frame_header.encoding == FrameEncoding::kModular) {
989       FillImage(kInvSigmaNum / lf.epf_sigma_for_modular,
990                 &dec_state->filter_weights.sigma);
991     }
992     std::vector<Rect> rects_to_process;
993     for (size_t y = 0; y < frame_dim.ysize_padded; y += kGroupDim) {
994       for (size_t x = 0; x < frame_dim.xsize_padded; x += kGroupDim) {
995         Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize_padded,
996                   frame_dim.ysize_padded);
997         if (rect.xsize() == 0 || rect.ysize() == 0) continue;
998         rects_to_process.push_back(rect);
999       }
1000     }
1001     const auto allocate_storage = [&](size_t num_threads) {
1002       dec_state->EnsureStorage(num_threads);
1003       return true;
1004     };
1005 
1006     {
1007       std::vector<ImageF> ecs;
1008       const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
1009       for (size_t i = 0; i < metadata.num_extra_channels; i++) {
1010         if (frame_header.extra_channel_upsampling[i] == 1) {
1011           ecs.push_back(std::move(dec_state->extra_channels[i]));
1012         } else {
1013           ecs.emplace_back(frame_dim.xsize_upsampled_padded,
1014                            frame_dim.ysize_upsampled_padded);
1015         }
1016       }
1017       decoded->SetExtraChannels(std::move(ecs));
1018     }
1019 
1020     std::atomic<bool> apply_features_ok{true};
1021     auto run_apply_features = [&](size_t rect_id, size_t thread) {
1022       size_t xstart = kBlockDim + dec_state->group_border_assigner.PaddingX(
1023                                       dec_state->FinalizeRectPadding());
1024       size_t ystart = dec_state->FinalizeRectPadding();
1025       Rect group_data_rect(xstart, ystart, rects_to_process[rect_id].xsize(),
1026                            rects_to_process[rect_id].ysize());
1027       CopyImageToWithPadding(rects_to_process[rect_id],
1028                              *finalize_image_rect_input,
1029                              dec_state->FinalizeRectPadding(), group_data_rect,
1030                              &dec_state->group_data[thread]);
1031       std::vector<std::pair<ImageF*, Rect>> ec_rects;
1032       ec_rects.reserve(decoded->extra_channels().size());
1033       for (size_t i = 0; i < decoded->extra_channels().size(); i++) {
1034         Rect r = ScaleRectForEC(rects_to_process[rect_id], frame_header, i);
1035         if (frame_header.extra_channel_upsampling[i] != 1) {
1036           Rect ec_input_rect(kBlockDim, 2, r.xsize(), r.ysize());
1037           auto eti =
1038               &dec_state
1039                    ->ec_temp_images[thread * decoded->extra_channels().size() +
1040                                     i];
1041           CopyImageToWithPadding(r, dec_state->extra_channels[i],
1042                                  /*padding=*/2, ec_input_rect, eti);
1043           ec_rects.emplace_back(eti, ec_input_rect);
1044         } else {
1045           ec_rects.emplace_back(&decoded->extra_channels()[i], r);
1046         }
1047       }
1048       if (!FinalizeImageRect(&dec_state->group_data[thread], group_data_rect,
1049                              ec_rects, dec_state, thread, decoded,
1050                              rects_to_process[rect_id])) {
1051         apply_features_ok = false;
1052       }
1053     };
1054 
1055     RunOnPool(pool, 0, rects_to_process.size(), allocate_storage,
1056               run_apply_features, "ApplyFeatures");
1057 
1058     if (!apply_features_ok) {
1059       return JXL_FAILURE("FinalizeImageRect failed");
1060     }
1061   }
1062 
1063   const size_t xsize = frame_dim.xsize_upsampled;
1064   const size_t ysize = frame_dim.ysize_upsampled;
1065 
1066   decoded->ShrinkTo(xsize, ysize);
1067   if (dec_state->pre_color_transform_frame.xsize() != 0) {
1068     dec_state->pre_color_transform_frame.ShrinkTo(xsize, ysize);
1069   }
1070 
1071   if (!skip_blending && ImageBlender::NeedsBlending(dec_state)) {
1072     if (dec_state->pre_color_transform_frame.xsize() != 0) {
1073       // Extra channels are going to be modified. Make a copy.
1074       dec_state->pre_color_transform_ec.clear();
1075       for (const auto& ec : decoded->extra_channels()) {
1076         dec_state->pre_color_transform_ec.emplace_back(CopyImage(ec));
1077       }
1078     }
1079     ImageBlender blender;
1080     ImageBundle foreground = std::move(*decoded);
1081     JXL_RETURN_IF_ERROR(blender.PrepareBlending(
1082         dec_state, foreground.origin, foreground.xsize(), foreground.ysize(),
1083         foreground.c_current(), /*output=*/decoded));
1084 
1085     std::vector<Rect> rects_to_process;
1086     for (size_t y = 0; y < frame_dim.ysize; y += kGroupDim) {
1087       for (size_t x = 0; x < frame_dim.xsize; x += kGroupDim) {
1088         Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize, frame_dim.ysize);
1089         if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1090         rects_to_process.push_back(rect);
1091       }
1092     }
1093 
1094     std::atomic<bool> blending_ok{true};
1095     JXL_RETURN_IF_ERROR(RunOnPool(
1096         pool, 0, rects_to_process.size(), ThreadPool::SkipInit(),
1097         [&](size_t i, size_t /*thread*/) {
1098           const Rect& rect = rects_to_process[i];
1099           auto rect_blender = blender.PrepareRect(
1100               rect, *foreground.color(), foreground.extra_channels(), rect);
1101           for (size_t y = 0; y < rect.ysize(); ++y) {
1102             if (!rect_blender.DoBlending(y)) {
1103               blending_ok = false;
1104               return;
1105             }
1106           }
1107         },
1108         "Blend"));
1109     JXL_RETURN_IF_ERROR(blending_ok.load());
1110   }
1111 
1112   return true;
1113 }
1114 
1115 }  // namespace jxl
1116 #endif  // HWY_ONCE
1117