1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #include "lib/jxl/dec_reconstruct.h"
7 
8 #include <atomic>
9 #include <utility>
10 
11 #include "lib/jxl/filters.h"
12 #include "lib/jxl/image_ops.h"
13 
14 #undef HWY_TARGET_INCLUDE
15 #define HWY_TARGET_INCLUDE "lib/jxl/dec_reconstruct.cc"
16 #include <hwy/foreach_target.h>
17 #include <hwy/highway.h>
18 
19 #include "lib/jxl/aux_out.h"
20 #include "lib/jxl/base/data_parallel.h"
21 #include "lib/jxl/base/profiler.h"
22 #include "lib/jxl/blending.h"
23 #include "lib/jxl/color_management.h"
24 #include "lib/jxl/common.h"
25 #include "lib/jxl/dec_noise.h"
26 #include "lib/jxl/dec_upsample.h"
27 #include "lib/jxl/dec_xyb-inl.h"
28 #include "lib/jxl/dec_xyb.h"
29 #include "lib/jxl/epf.h"
30 #include "lib/jxl/fast_math-inl.h"
31 #include "lib/jxl/frame_header.h"
32 #include "lib/jxl/loop_filter.h"
33 #include "lib/jxl/passes_state.h"
34 #include "lib/jxl/sanitizers.h"
35 #include "lib/jxl/transfer_functions-inl.h"
36 HWY_BEFORE_NAMESPACE();
37 namespace jxl {
38 namespace HWY_NAMESPACE {
39 
40 template <typename Op>
DoUndoXYBInPlace(Image3F * idct,const Rect & rect,Op op,const OutputEncodingInfo & output_encoding_info)41 void DoUndoXYBInPlace(Image3F* idct, const Rect& rect, Op op,
42                       const OutputEncodingInfo& output_encoding_info) {
43   // TODO(eustas): should it still be capped?
44   const HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound) d;
45   const size_t xsize = rect.xsize();
46   const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
47   // The size of `rect` might not be a multiple of Lanes(d), but is guaranteed
48   // to be a multiple of kBlockDim or at the margin of the image.
49   for (size_t y = 0; y < rect.ysize(); y++) {
50     float* JXL_RESTRICT row0 = rect.PlaneRow(idct, 0, y);
51     float* JXL_RESTRICT row1 = rect.PlaneRow(idct, 1, y);
52     float* JXL_RESTRICT row2 = rect.PlaneRow(idct, 2, y);
53     // All calculations are lane-wise, still some might require value-dependent
54     // behaviour (e.g. NearestInt). Temporary unposion last vector tail.
55     msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
56     msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
57     msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
58     for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
59       const auto in_opsin_x = Load(d, row0 + x);
60       const auto in_opsin_y = Load(d, row1 + x);
61       const auto in_opsin_b = Load(d, row2 + x);
62       JXL_COMPILER_FENCE;
63       auto linear_r = Undefined(d);
64       auto linear_g = Undefined(d);
65       auto linear_b = Undefined(d);
66       XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
67                output_encoding_info.opsin_params, &linear_r, &linear_g,
68                &linear_b);
69       Store(op.Transform(d, linear_r), d, row0 + x);
70       Store(op.Transform(d, linear_g), d, row1 + x);
71       Store(op.Transform(d, linear_b), d, row2 + x);
72     }
73     msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
74     msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
75     msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
76   }
77 }
78 
79 struct OpLinear {
80   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpLinear81   T Transform(D d, const T& linear) {
82     return linear;
83   }
84 };
85 
86 struct OpRgb {
87   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpRgb88   T Transform(D d, const T& linear) {
89 #if JXL_HIGH_PRECISION
90     return TF_SRGB().EncodedFromDisplay(d, linear);
91 #else
92     return FastLinearToSRGB(d, linear);
93 #endif
94   }
95 };
96 
97 struct OpPq {
98   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpPq99   T Transform(D d, const T& linear) {
100     return TF_PQ().EncodedFromDisplay(d, linear);
101   }
102 };
103 
104 struct OpHlg {
105   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpHlg106   T Transform(D d, const T& linear) {
107     return TF_HLG().EncodedFromDisplay(d, linear);
108   }
109 };
110 
111 struct Op709 {
112   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::Op709113   T Transform(D d, const T& linear) {
114     return TF_709().EncodedFromDisplay(d, linear);
115   }
116 };
117 
118 struct OpGamma {
119   const float inverse_gamma;
120   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpGamma121   T Transform(D d, const T& linear) {
122     return IfThenZeroElse(linear <= Set(d, 1e-5f),
123                           FastPowf(d, linear, Set(d, inverse_gamma)));
124   }
125 };
126 
UndoXYBInPlace(Image3F * idct,const Rect & rect,const OutputEncodingInfo & output_encoding_info)127 Status UndoXYBInPlace(Image3F* idct, const Rect& rect,
128                       const OutputEncodingInfo& output_encoding_info) {
129   PROFILER_ZONE("UndoXYB");
130 
131   if (output_encoding_info.color_encoding.tf.IsLinear()) {
132     DoUndoXYBInPlace(idct, rect, OpLinear(), output_encoding_info);
133   } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
134     DoUndoXYBInPlace(idct, rect, OpRgb(), output_encoding_info);
135   } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
136     DoUndoXYBInPlace(idct, rect, OpPq(), output_encoding_info);
137   } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
138     DoUndoXYBInPlace(idct, rect, OpHlg(), output_encoding_info);
139   } else if (output_encoding_info.color_encoding.tf.Is709()) {
140     DoUndoXYBInPlace(idct, rect, Op709(), output_encoding_info);
141   } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
142              output_encoding_info.color_encoding.tf.IsDCI()) {
143     OpGamma op = {output_encoding_info.inverse_gamma};
144     DoUndoXYBInPlace(idct, rect, op, output_encoding_info);
145   } else {
146     // This is a programming error.
147     JXL_ABORT("Invalid target encoding");
148   }
149   return true;
150 }
151 
152 template <typename D, typename V>
StoreRGBA(D d,V r,V g,V b,V a,bool alpha,size_t n,size_t extra,uint8_t * buf)153 void StoreRGBA(D d, V r, V g, V b, V a, bool alpha, size_t n, size_t extra,
154                uint8_t* buf) {
155 #if HWY_TARGET == HWY_SCALAR
156   buf[0] = r.raw;
157   buf[1] = g.raw;
158   buf[2] = b.raw;
159   if (alpha) {
160     buf[3] = a.raw;
161   }
162 #elif HWY_TARGET == HWY_NEON
163   if (alpha) {
164     uint8x8x4_t data = {r.raw, g.raw, b.raw, a.raw};
165     if (extra >= 8) {
166       vst4_u8(buf, data);
167     } else {
168       uint8_t tmp[8 * 4];
169       vst4_u8(tmp, data);
170       memcpy(buf, tmp, n * 4);
171     }
172   } else {
173     uint8x8x3_t data = {r.raw, g.raw, b.raw};
174     if (extra >= 8) {
175       vst3_u8(buf, data);
176     } else {
177       uint8_t tmp[8 * 3];
178       vst3_u8(tmp, data);
179       memcpy(buf, tmp, n * 3);
180     }
181   }
182 #else
183   // TODO(veluca): implement this for x86.
184   size_t mul = alpha ? 4 : 3;
185   HWY_ALIGN uint8_t bytes[16];
186   Store(r, d, bytes);
187   for (size_t i = 0; i < n; i++) {
188     buf[mul * i] = bytes[i];
189   }
190   Store(g, d, bytes);
191   for (size_t i = 0; i < n; i++) {
192     buf[mul * i + 1] = bytes[i];
193   }
194   Store(b, d, bytes);
195   for (size_t i = 0; i < n; i++) {
196     buf[mul * i + 2] = bytes[i];
197   }
198   if (alpha) {
199     Store(a, d, bytes);
200     for (size_t i = 0; i < n; i++) {
201       buf[4 * i + 3] = bytes[i];
202     }
203   }
204 #endif
205 }
206 
207 // Outputs floating point image to RGBA 8-bit buffer. Does not support alpha
208 // channel in the input, but outputs opaque alpha channel for the case where the
209 // output buffer to write to is in the 4-byte per pixel RGBA format.
FloatToRGBA8(const Image3F & input,const Rect & input_rect,bool is_rgba,const ImageF * alpha_in,const Rect & alpha_rect,const Rect & output_buf_rect,uint8_t * JXL_RESTRICT output_buf,size_t stride)210 void FloatToRGBA8(const Image3F& input, const Rect& input_rect, bool is_rgba,
211                   const ImageF* alpha_in, const Rect& alpha_rect,
212                   const Rect& output_buf_rect, uint8_t* JXL_RESTRICT output_buf,
213                   size_t stride) {
214   size_t bytes = is_rgba ? 4 : 3;
215   for (size_t y = 0; y < output_buf_rect.ysize(); y++) {
216     const float* JXL_RESTRICT row_in_r = input_rect.ConstPlaneRow(input, 0, y);
217     const float* JXL_RESTRICT row_in_g = input_rect.ConstPlaneRow(input, 1, y);
218     const float* JXL_RESTRICT row_in_b = input_rect.ConstPlaneRow(input, 2, y);
219     const float* JXL_RESTRICT row_in_a =
220         alpha_in ? alpha_rect.ConstRow(*alpha_in, y) : nullptr;
221     size_t base_ptr =
222         (y + output_buf_rect.y0()) * stride + bytes * output_buf_rect.x0();
223     using D = HWY_CAPPED(float, 4);
224     const D d;
225     D::Rebind<uint32_t> du;
226     auto zero = Zero(d);
227     auto one = Set(d, 1.0f);
228     auto mul = Set(d, 255.0f);
229 
230     // All calculations are lane-wise, still some might require value-dependent
231     // behaviour (e.g. NearestInt). Temporary unposion last vector tail.
232     size_t xsize = output_buf_rect.xsize();
233     size_t xsize_v = RoundUpTo(xsize, Lanes(d));
234     msan::UnpoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize));
235     msan::UnpoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize));
236     msan::UnpoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize));
237     if (row_in_a)
238       msan::UnpoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize));
239     for (size_t x = 0; x < xsize; x += Lanes(d)) {
240       auto rf = Clamp(zero, Load(d, row_in_r + x), one) * mul;
241       auto gf = Clamp(zero, Load(d, row_in_g + x), one) * mul;
242       auto bf = Clamp(zero, Load(d, row_in_b + x), one) * mul;
243       auto af = row_in_a ? Clamp(zero, Load(d, row_in_a + x), one) * mul
244                          : Set(d, 255.0f);
245       auto r8 = U8FromU32(BitCast(du, NearestInt(rf)));
246       auto g8 = U8FromU32(BitCast(du, NearestInt(gf)));
247       auto b8 = U8FromU32(BitCast(du, NearestInt(bf)));
248       auto a8 = U8FromU32(BitCast(du, NearestInt(af)));
249       size_t n = output_buf_rect.xsize() - x;
250       if (JXL_LIKELY(n >= Lanes(d))) {
251         StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, Lanes(d), n,
252                   output_buf + base_ptr + bytes * x);
253       } else {
254         StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, n, n,
255                   output_buf + base_ptr + bytes * x);
256       }
257     }
258     msan::PoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize));
259     msan::PoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize));
260     msan::PoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize));
261     if (row_in_a)
262       msan::PoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize));
263   }
264 }
265 
266 // Upsample in horizonal (if hs=1) and vertical (if vs=1) the plane_in image
267 // to the output plane_out image.
268 // The output region "rect" in plane_out and a border around it of lf.Padding()
269 // will be generated, as long as those pixels fall inside the image frame.
270 // Otherwise the border pixels that fall outside the image frame in plane_out
271 // are undefined.
272 // "rect" is an area inside the plane_out image which corresponds to the
273 // "frame_rect" area in the frame. plane_in and plane_out both are expected to
274 // have a padding of kGroupDataXBorder and kGroupDataYBorder on either side of
275 // X and Y coordinates. This means that when upsampling vertically the plane_out
276 // row `kGroupDataXBorder + N` will be generated from the plane_in row
277 // `kGroupDataXBorder + N / 2` (and a previous or next row).
DoYCbCrUpsampling(size_t hs,size_t vs,ImageF * plane_in,const Rect & rect,const Rect & frame_rect,const FrameDimensions & frame_dim,ImageF * plane_out,const LoopFilter & lf,ImageF * temp)278 void DoYCbCrUpsampling(size_t hs, size_t vs, ImageF* plane_in, const Rect& rect,
279                        const Rect& frame_rect, const FrameDimensions& frame_dim,
280                        ImageF* plane_out, const LoopFilter& lf, ImageF* temp) {
281   JXL_DASSERT(SameSize(rect, frame_rect));
282   JXL_DASSERT(hs <= 1 && vs <= 1);
283   // The pixel in (xoff, yoff) is the origin of the downsampling coordinate
284   // system.
285   size_t xoff = PassesDecoderState::kGroupDataXBorder;
286   size_t yoff = PassesDecoderState::kGroupDataYBorder;
287 
288   // This X,Y range is the intersection between the requested "rect" expanded
289   // with a lf.Padding() all around and the image frame translated to the
290   // coordinate system used by plane_out.
291   // All the pixels in the [x0, x1) x [y0, y1) range must be defined in the
292   // plane_out output at the end.
293   const size_t y0 = rect.y0() - std::min<size_t>(lf.Padding(), frame_rect.y0());
294   const size_t y1 = rect.y0() +
295                     std::min(frame_rect.y0() + rect.ysize() + lf.Padding(),
296                              frame_dim.ysize_padded) -
297                     frame_rect.y0();
298 
299   const size_t x0 = rect.x0() - std::min<size_t>(lf.Padding(), frame_rect.x0());
300   const size_t x1 = rect.x0() +
301                     std::min(frame_rect.x0() + rect.xsize() + lf.Padding(),
302                              frame_dim.xsize_padded) -
303                     frame_rect.x0();
304 
305   if (hs == 0 && vs == 0) {
306     Rect r(x0, y0, x1 - x0, y1 - y0);
307     JXL_CHECK_IMAGE_INITIALIZED(*plane_in, r);
308     CopyImageTo(r, *plane_in, r, plane_out);
309     return;
310   }
311   // Prepare padding if we are on a border.
312   // Copy the whole row/column here: it is likely similarly fast and ensures
313   // that we don't forget some parts of padding.
314   if (frame_rect.x0() == 0) {
315     for (size_t y = 0; y < plane_in->ysize(); y++) {
316       plane_in->Row(y)[rect.x0() - 1] = plane_in->Row(y)[rect.x0()];
317     }
318   }
319   if (frame_rect.x0() + x1 - rect.x0() >= frame_dim.xsize_padded) {
320     ssize_t borderx = static_cast<ssize_t>(x1 - xoff + hs) / (1 << hs) + xoff;
321     for (size_t y = 0; y < plane_in->ysize(); y++) {
322       plane_in->Row(y)[borderx] = plane_in->Row(y)[borderx - 1];
323     }
324   }
325   if (frame_rect.y0() == 0) {
326     memcpy(plane_in->Row(rect.y0() - 1), plane_in->Row(rect.y0()),
327            plane_in->xsize() * sizeof(float));
328   }
329   if (frame_rect.y0() + y1 - rect.y0() >= frame_dim.ysize_padded) {
330     ssize_t bordery = static_cast<ssize_t>(y1 - yoff + vs) / (1 << vs) + yoff;
331     memcpy(plane_in->Row(bordery), plane_in->Row(bordery - 1),
332            plane_in->xsize() * sizeof(float));
333   }
334   if (hs == 1) {
335     // Limited to 4 for Interleave*.
336     HWY_CAPPED(float, 4) d;
337     auto threefour = Set(d, 0.75f);
338     auto onefour = Set(d, 0.25f);
339     size_t orig_y0 = y0;
340     size_t orig_y1 = y1;
341     if (vs != 0) {
342       orig_y0 = (y0 >> 1) + (yoff >> 1) - 1;
343       orig_y1 = (y1 >> 1) + (yoff >> 1) + 1;
344     }
345     for (size_t y = orig_y0; y < orig_y1; y++) {
346       const float* in = plane_in->Row(y);
347       float* out = temp->Row(y);
348       for (size_t x = x0 / (2 * Lanes(d)) * 2 * Lanes(d);
349            x < RoundUpTo(x1, 2 * Lanes(d)); x += 2 * Lanes(d)) {
350         size_t ox = (x >> 1) + (xoff >> 1);
351         auto current = Load(d, in + ox) * threefour;
352         auto prev = LoadU(d, in + ox - 1);
353         auto next = LoadU(d, in + ox + 1);
354         auto left = MulAdd(onefour, prev, current);
355         auto right = MulAdd(onefour, next, current);
356 #if HWY_TARGET == HWY_SCALAR
357         Store(left, d, out + x);
358         Store(right, d, out + x + 1);
359 #else
360         Store(InterleaveLower(left, right), d, out + x);
361         Store(InterleaveUpper(left, right), d, out + x + Lanes(d));
362 #endif
363       }
364     }
365   } else {
366     CopyImageTo(*plane_in, temp);
367   }
368   if (vs == 1) {
369     HWY_FULL(float) d;
370     auto threefour = Set(d, 0.75f);
371     auto onefour = Set(d, 0.25f);
372     for (size_t y = y0; y < y1; y++) {
373       size_t oy1 = (y >> 1) + (yoff >> 1);
374       if ((y & 1) == 1) oy1++;
375       size_t oy0 = oy1 - 1;
376       const float* in0 = temp->Row(oy0);
377       const float* in1 = temp->Row(oy1);
378       float* out = plane_out->Row(y);
379       if ((y & 1) == 1) {
380         for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d));
381              x += Lanes(d)) {
382           auto i0 = Load(d, in0 + x);
383           auto i1 = Load(d, in1 + x);
384           auto o = MulAdd(i0, threefour, i1 * onefour);
385           Store(o, d, out + x);
386         }
387       } else {
388         for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d));
389              x += Lanes(d)) {
390           auto i0 = Load(d, in0 + x);
391           auto i1 = Load(d, in1 + x);
392           auto o = MulAdd(i0, onefour, i1 * threefour);
393           Store(o, d, out + x);
394         }
395       }
396     }
397   } else {
398     CopyImageTo(*temp, plane_out);
399   }
400 
401   // The output must be initialized including the lf.Padding() around the image
402   // for all the pixels that fall inside the image frame.
403   JXL_CHECK_IMAGE_INITIALIZED(*plane_out, Rect(x0, y0, x1 - x0, y1 - y0));
404 }
405 
406 // NOLINTNEXTLINE(google-readability-namespace-comments)
407 }  // namespace HWY_NAMESPACE
408 }  // namespace jxl
409 HWY_AFTER_NAMESPACE();
410 
411 #if HWY_ONCE
412 namespace jxl {
413 
414 HWY_EXPORT(UndoXYBInPlace);
415 HWY_EXPORT(FloatToRGBA8);
416 HWY_EXPORT(DoYCbCrUpsampling);
417 
UndoXYB(const Image3F & src,Image3F * dst,const OutputEncodingInfo & output_info,ThreadPool * pool)418 void UndoXYB(const Image3F& src, Image3F* dst,
419              const OutputEncodingInfo& output_info, ThreadPool* pool) {
420   CopyImageTo(src, dst);
421   pool->Run(0, src.ysize(), ThreadPool::SkipInit(), [&](int y, int /*thread*/) {
422     JXL_CHECK(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(dst, Rect(*dst).Line(y),
423                                                    output_info));
424   });
425 }
426 
427 namespace {
ScaleRectForEC(Rect in,const FrameHeader & frame_header,size_t ec)428 Rect ScaleRectForEC(Rect in, const FrameHeader& frame_header, size_t ec) {
429   auto s = [&](size_t x) {
430     return DivCeil(x * frame_header.upsampling,
431                    frame_header.extra_channel_upsampling[ec]);
432   };
433   // For x0 and y0 the DivCeil is actually an exact division.
434   return Rect(s(in.x0()), s(in.y0()), s(in.xsize()), s(in.ysize()));
435 }
436 
437 // Implements EnsurePaddingInPlace, but allows processing data one row at a
438 // time.
439 class EnsurePaddingInPlaceRowByRow {
Init(const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)440   void Init(const Rect& rect, const Rect& image_rect, size_t image_xsize,
441             size_t image_ysize, size_t xpadding, size_t ypadding, ssize_t* y0,
442             ssize_t* y1) {
443     // coordinates relative to rect.
444     JXL_ASSERT(SameSize(rect, image_rect));
445     JXL_ASSERT(image_rect.x0() + image_rect.xsize() <= image_xsize);
446     JXL_ASSERT(image_rect.y0() + image_rect.ysize() <= image_ysize);
447     *y0 = -std::min(image_rect.y0(), ypadding);
448     *y1 = rect.ysize() + std::min(ypadding, image_ysize - image_rect.ysize() -
449                                                 image_rect.y0());
450     if (image_rect.x0() >= xpadding &&
451         image_rect.x0() + image_rect.xsize() + xpadding <= image_xsize) {
452       // Nothing to do.
453       strategy_ = kSkip;
454     } else if (image_xsize >= 2 * xpadding) {
455       strategy_ = kFast;
456     } else {
457       strategy_ = kSlow;
458     }
459     y0_ = rect.y0();
460     JXL_ASSERT(rect.x0() >= xpadding);
461     x0_ = x1_ = rect.x0() - xpadding;
462     // If close to the left border - do mirroring.
463     if (image_rect.x0() < xpadding) x1_ = rect.x0() - image_rect.x0();
464     x2_ = x3_ = rect.x0() + rect.xsize() + xpadding;
465     // If close to the right border - do mirroring.
466     if (image_rect.x0() + image_rect.xsize() + xpadding > image_xsize) {
467       x2_ = rect.x0() + image_xsize - image_rect.x0();
468     }
469     JXL_ASSERT(x0_ <= x1_);
470     JXL_ASSERT(x1_ <= x2_);
471     JXL_ASSERT(x2_ <= x3_);
472     JXL_ASSERT(image_xsize == (x2_ - x1_) ||
473                (x1_ - x0_ <= x2_ - x1_ && x3_ - x2_ <= x2_ - x1_));
474   }
475 
476  public:
Init(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)477   void Init(Image3F* img, const Rect& rect, const Rect& image_rect,
478             size_t image_xsize, size_t image_ysize, size_t xpadding,
479             size_t ypadding, ssize_t* y0, ssize_t* y1) {
480     Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
481          y1);
482     img3_ = img;
483     JXL_DASSERT(x3_ <= img->xsize());
484   }
Init(ImageF * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)485   void Init(ImageF* img, const Rect& rect, const Rect& image_rect,
486             size_t image_xsize, size_t image_ysize, size_t xpadding,
487             size_t ypadding, ssize_t* y0, ssize_t* y1) {
488     Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
489          y1);
490     img_ = img;
491     JXL_DASSERT(x3_ <= img->xsize());
492   }
493   // To be called when row `y` of the input is available, for all the values in
494   // [*y0, *y1).
Process3(ssize_t y)495   void Process3(ssize_t y) {
496     JXL_DASSERT(img3_);
497     for (size_t c = 0; c < 3; c++) {
498       img_ = &img3_->Plane(c);
499       Process(y);
500     }
501   }
Process(ssize_t y)502   void Process(ssize_t y) {
503     JXL_DASSERT(img_);
504     switch (strategy_) {
505       case kSkip:
506         break;
507       case kFast: {
508         // Image is wide enough that a single Mirror() step is sufficient.
509         float* JXL_RESTRICT row = img_->Row(y + y0_);
510         for (size_t x = x0_; x < x1_; x++) {
511           row[x] = row[2 * x1_ - x - 1];
512         }
513         for (size_t x = x2_; x < x3_; x++) {
514           row[x] = row[2 * x2_ - x - 1];
515         }
516         break;
517       }
518       case kSlow: {
519         // Slow case for small images.
520         float* JXL_RESTRICT row = img_->Row(y + y0_) + x1_;
521         for (ssize_t x = x0_ - x1_; x < 0; x++) {
522           *(row + x) = row[Mirror(x, x2_ - x1_)];
523         }
524         for (size_t x = x2_ - x1_; x < x3_ - x1_; x++) {
525           *(row + x) = row[Mirror(x, x2_ - x1_)];
526         }
527         break;
528       }
529     }
530   }
531 
532  private:
533   // Initialized to silence spurious compiler warnings.
534   Image3F* img3_ = nullptr;
535   ImageF* img_ = nullptr;
536   // Will fill [x0_, x1_) and [x2_, x3_) on every row.
537   // The [x1_, x2_) range contains valid image pixels. We guarantee that either
538   // x1_ - x0_ <= x2_ - x1_, (and similarly for x2_, x3_), or that the [x1_,
539   // x2_) contains a full horizontal line of the original image.
540   size_t x0_ = 0, x1_ = 0, x2_ = 0, x3_ = 0;
541   size_t y0_ = 0;
542   // kSlow: use calls to Mirror(), for the case where the border might be larger
543   // than the image.
544   // kFast: directly use the result of Mirror() when it can be computed in a
545   // single iteration.
546   // kSkip: do nothing.
547   enum Strategy { kFast, kSlow, kSkip };
548   Strategy strategy_ = kSkip;
549 };
550 }  // namespace
551 
EnsurePaddingInPlace(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)552 void EnsurePaddingInPlace(Image3F* img, const Rect& rect,
553                           const Rect& image_rect, size_t image_xsize,
554                           size_t image_ysize, size_t xpadding,
555                           size_t ypadding) {
556   ssize_t y0, y1;
557   EnsurePaddingInPlaceRowByRow impl;
558   impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
559             &y0, &y1);
560   for (ssize_t y = y0; y < y1; y++) {
561     impl.Process3(y);
562   }
563 }
564 
EnsurePaddingInPlace(ImageF * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)565 void EnsurePaddingInPlace(ImageF* img, const Rect& rect, const Rect& image_rect,
566                           size_t image_xsize, size_t image_ysize,
567                           size_t xpadding, size_t ypadding) {
568   ssize_t y0, y1;
569   EnsurePaddingInPlaceRowByRow impl;
570   impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
571             &y0, &y1);
572   for (ssize_t y = y0; y < y1; y++) {
573     impl.Process(y);
574   }
575 }
576 
FinalizeImageRect(Image3F * input_image,const Rect & input_rect,const std::vector<std::pair<ImageF *,Rect>> & extra_channels,PassesDecoderState * dec_state,size_t thread,ImageBundle * JXL_RESTRICT output_image,const Rect & frame_rect)577 Status FinalizeImageRect(
578     Image3F* input_image, const Rect& input_rect,
579     const std::vector<std::pair<ImageF*, Rect>>& extra_channels,
580     PassesDecoderState* dec_state, size_t thread,
581     ImageBundle* JXL_RESTRICT output_image, const Rect& frame_rect) {
582   const ImageFeatures& image_features = dec_state->shared->image_features;
583   const FrameHeader& frame_header = dec_state->shared->frame_header;
584   const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
585   const LoopFilter& lf = frame_header.loop_filter;
586   const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
587   JXL_DASSERT(frame_rect.xsize() <= kApplyImageFeaturesTileDim);
588   JXL_DASSERT(frame_rect.ysize() <= kApplyImageFeaturesTileDim);
589   JXL_DASSERT(input_rect.xsize() == frame_rect.xsize());
590   JXL_DASSERT(input_rect.ysize() == frame_rect.ysize());
591   JXL_DASSERT(frame_rect.x0() % GroupBorderAssigner::kPaddingXRound == 0);
592   JXL_DASSERT(frame_rect.xsize() % GroupBorderAssigner::kPaddingXRound == 0 ||
593               frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize ||
594               frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize_padded);
595 
596   // +----------------------------- STEP 1 ------------------------------+
597   // | Compute the rects on which patches and splines will be applied.   |
598   // | In case we are applying upsampling, we need to apply patches on a |
599   // | slightly larger image.                                            |
600   // +-------------------------------------------------------------------+
601 
602   // If we are applying upsampling, we need 2 more pixels around the actual rect
603   // for border. Thus, we also need to apply patches and splines to those
604   // pixels. We compute here
605   // - The portion of image that corresponds to the area we are applying IF.
606   //   (rect_for_if)
607   // - The rect where that pixel data is stored in upsampling_input_storage.
608   //   (rect_for_if_storage)
609   // - The rect where the pixel data that we need to upsample is stored.
610   //   (rect_for_upsampling)
611   // - The source rect for the pixel data in `input_image`. It is assumed that,
612   //   if `frame_rect` is not on an image border, `input_image:input_rect` has
613   //   enough border available. (rect_for_if_input)
614 
615   Image3F* output_color =
616       dec_state->rgb_output == nullptr && dec_state->pixel_callback == nullptr
617           ? output_image->color()
618           : nullptr;
619 
620   Image3F* storage_for_if = output_color;
621   Rect rect_for_if = frame_rect;
622   Rect rect_for_if_storage = frame_rect;
623   Rect rect_for_upsampling = frame_rect;
624   Rect rect_for_if_input = input_rect;
625   // The same as rect_for_if_input but in the frame coordinates.
626   Rect frame_rect_for_ycbcr_upsampling = frame_rect;
627   size_t extra_rows_t = 0;
628   size_t extra_rows_b = 0;
629   if (frame_header.upsampling != 1) {
630     size_t ifbx0 = 0;
631     size_t ifbx1 = 0;
632     size_t ifby0 = 0;
633     size_t ifby1 = 0;
634     if (frame_rect.x0() >= 2) {
635       JXL_DASSERT(input_rect.x0() >= 2);
636       ifbx0 = 2;
637     }
638     if (frame_rect.y0() >= 2) {
639       JXL_DASSERT(input_rect.y0() >= 2);
640       extra_rows_t = ifby0 = 2;
641     }
642     for (size_t extra : {1, 2}) {
643       if (frame_rect.x0() + frame_rect.xsize() + extra <=
644           dec_state->shared->frame_dim.xsize_padded) {
645         JXL_DASSERT(input_rect.x0() + input_rect.xsize() + extra <=
646                     input_image->xsize());
647         ifbx1 = extra;
648       }
649       if (frame_rect.y0() + frame_rect.ysize() + extra <=
650           dec_state->shared->frame_dim.ysize_padded) {
651         JXL_DASSERT(input_rect.y0() + input_rect.ysize() + extra <=
652                     input_image->ysize());
653         extra_rows_b = ifby1 = extra;
654       }
655     }
656     rect_for_if = Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
657                        frame_rect.xsize() + ifbx0 + ifbx1,
658                        frame_rect.ysize() + ifby0 + ifby1);
659     // Storage for pixel data does not necessarily start at (0, 0) as we need to
660     // have the left border of upsampling_rect aligned to a multiple of
661     // GroupBorderAssigner::kPaddingXRound.
662     rect_for_if_storage =
663         Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound) -
664                  ifbx0,
665              kBlockDim, rect_for_if.xsize(), rect_for_if.ysize());
666     rect_for_upsampling =
667         Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound),
668              kBlockDim + ifby0, frame_rect.xsize(), frame_rect.ysize());
669     rect_for_if_input =
670         Rect(input_rect.x0() - ifbx0, input_rect.y0() - ifby0,
671              rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
672     frame_rect_for_ycbcr_upsampling =
673         Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
674              rect_for_if_input.xsize(), rect_for_if_input.ysize());
675     storage_for_if = &dec_state->upsampling_input_storage[thread];
676   }
677 
678   // +--------------------------- STEP 1.5 ------------------------------+
679   // | Perform YCbCr upsampling if needed.                               |
680   // +-------------------------------------------------------------------+
681 
682   Image3F* input = input_image;
683   if (!frame_header.chroma_subsampling.Is444()) {
684     for (size_t c = 0; c < 3; c++) {
685       size_t vs = frame_header.chroma_subsampling.VShift(c);
686       size_t hs = frame_header.chroma_subsampling.HShift(c);
687       // The per-thread output is used for the first time here. Poison the temp
688       // image on this thread to prevent leaking initialized data from a
689       // previous run in this thread in msan builds.
690       msan::PoisonImage(dec_state->ycbcr_out_images[thread].Plane(c));
691       HWY_DYNAMIC_DISPATCH(DoYCbCrUpsampling)
692       (hs, vs, &input_image->Plane(c), rect_for_if_input,
693        frame_rect_for_ycbcr_upsampling, frame_dim,
694        &dec_state->ycbcr_out_images[thread].Plane(c), lf,
695        &dec_state->ycbcr_temp_images[thread]);
696     }
697     input = &dec_state->ycbcr_out_images[thread];
698   }
699 
700   // Variables for upsampling and filtering.
701   Rect upsampled_frame_rect(frame_rect.x0() * frame_header.upsampling,
702                             frame_rect.y0() * frame_header.upsampling,
703                             frame_rect.xsize() * frame_header.upsampling,
704                             frame_rect.ysize() * frame_header.upsampling);
705   Rect full_frame_rect(0, 0, frame_dim.xsize_upsampled,
706                        frame_dim.ysize_upsampled);
707   upsampled_frame_rect = upsampled_frame_rect.Crop(full_frame_rect);
708   EnsurePaddingInPlaceRowByRow ensure_padding_upsampling;
709   ssize_t ensure_padding_upsampling_y0 = 0;
710   ssize_t ensure_padding_upsampling_y1 = 0;
711 
712   EnsurePaddingInPlaceRowByRow ensure_padding_filter;
713   FilterPipeline* fp = nullptr;
714   ssize_t ensure_padding_filter_y0 = 0;
715   ssize_t ensure_padding_filter_y1 = 0;
716   if (lf.epf_iters != 0 || lf.gab) {
717     fp = &dec_state->filter_pipelines[thread];
718   }
719 
720   // +----------------------------- STEP 2 ------------------------------+
721   // | Change rects and buffer to not use `output_image` if direct       |
722   // | output to rgb8 is requested.                                      |
723   // +-------------------------------------------------------------------+
724   Image3F* output_pixel_data_storage = output_color;
725   Rect upsampled_frame_rect_for_storage = upsampled_frame_rect;
726   if (dec_state->rgb_output || dec_state->pixel_callback) {
727     size_t log2_upsampling = CeilLog2Nonzero(frame_header.upsampling);
728     if (storage_for_if == output_color) {
729       storage_for_if =
730           &dec_state->output_pixel_data_storage[log2_upsampling][thread];
731       rect_for_if_storage =
732           Rect(0, 0, rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
733     }
734     output_pixel_data_storage =
735         &dec_state->output_pixel_data_storage[log2_upsampling][thread];
736     upsampled_frame_rect_for_storage =
737         Rect(0, 0, upsampled_frame_rect.xsize(), upsampled_frame_rect.ysize());
738     if (frame_header.upsampling == 1 && fp == nullptr) {
739       upsampled_frame_rect_for_storage = rect_for_if_storage =
740           rect_for_if_input;
741       output_pixel_data_storage = storage_for_if = input;
742     }
743   }
744   // Set up alpha channel.
745   const size_t ec =
746       metadata.Find(ExtraChannel::kAlpha) - metadata.extra_channel_info.data();
747   const ImageF* alpha = nullptr;
748   Rect alpha_rect = upsampled_frame_rect;
749   if (ec < metadata.extra_channel_info.size()) {
750     JXL_ASSERT(ec < extra_channels.size());
751     if (frame_header.extra_channel_upsampling[ec] == 1) {
752       alpha = extra_channels[ec].first;
753       alpha_rect = extra_channels[ec].second;
754     } else {
755       alpha = &output_image->extra_channels()[ec];
756       alpha_rect = upsampled_frame_rect;
757     }
758   }
759 
760   // +----------------------------- STEP 3 ------------------------------+
761   // | Set up upsampling and upsample extra channels.                    |
762   // +-------------------------------------------------------------------+
763   Upsampler* color_upsampler = nullptr;
764   if (frame_header.upsampling != 1) {
765     color_upsampler =
766         &dec_state->upsamplers[CeilLog2Nonzero(frame_header.upsampling) - 1];
767     ensure_padding_upsampling.Init(
768         storage_for_if, rect_for_upsampling, frame_rect, frame_dim.xsize_padded,
769         frame_dim.ysize_padded, 2, 2, &ensure_padding_upsampling_y0,
770         &ensure_padding_upsampling_y1);
771   }
772 
773   std::vector<std::pair<ImageF*, Rect>> extra_channels_for_patches;
774   std::vector<EnsurePaddingInPlaceRowByRow> ec_padding;
775 
776   bool late_ec_upsample = frame_header.upsampling != 1;
777   for (auto ecups : frame_header.extra_channel_upsampling) {
778     if (ecups != frame_header.upsampling) {
779       // If patches are applied, either frame_header.upsampling == 1 or
780       // late_ec_upsample is true.
781       late_ec_upsample = false;
782     }
783   }
784 
785   ssize_t ensure_padding_upsampling_ec_y0 = 0;
786   ssize_t ensure_padding_upsampling_ec_y1 = 0;
787 
788   // TODO(veluca) do not upsample extra channels to a full-image-sized buffer if
789   // we are not outputting to an ImageBundle.
790   if (!late_ec_upsample) {
791     // Upsample extra channels first if not all channels have the same
792     // upsampling factor.
793     for (size_t ec = 0; ec < extra_channels.size(); ec++) {
794       size_t ecups = frame_header.extra_channel_upsampling[ec];
795       if (ecups == 1) {
796         extra_channels_for_patches.push_back(extra_channels[ec]);
797         continue;
798       }
799       ssize_t ensure_padding_y0, ensure_padding_y1;
800       EnsurePaddingInPlaceRowByRow ensure_padding;
801       // frame_rect can go up to frame_dim.xsize_padded, in VarDCT mode.
802       Rect ec_image_rect = ScaleRectForEC(
803           frame_rect.Crop(frame_dim.xsize, frame_dim.ysize), frame_header, ec);
804       size_t ecxs = DivCeil(frame_dim.xsize_upsampled,
805                             frame_header.extra_channel_upsampling[ec]);
806       size_t ecys = DivCeil(frame_dim.ysize_upsampled,
807                             frame_header.extra_channel_upsampling[ec]);
808       ensure_padding.Init(extra_channels[ec].first, extra_channels[ec].second,
809                           ec_image_rect, ecxs, ecys, 2, 2, &ensure_padding_y0,
810                           &ensure_padding_y1);
811       for (ssize_t y = ensure_padding_y0; y < ensure_padding_y1; y++) {
812         ensure_padding.Process(y);
813       }
814       Upsampler& upsampler =
815           dec_state->upsamplers[CeilLog2Nonzero(
816                                     frame_header.extra_channel_upsampling[ec]) -
817                                 1];
818       upsampler.UpsampleRect(
819           *extra_channels[ec].first, extra_channels[ec].second,
820           &output_image->extra_channels()[ec], upsampled_frame_rect,
821           static_cast<ssize_t>(ec_image_rect.y0()) -
822               static_cast<ssize_t>(extra_channels[ec].second.y0()),
823           ecys, dec_state->upsampler_storage[thread].get());
824       extra_channels_for_patches.emplace_back(
825           &output_image->extra_channels()[ec], upsampled_frame_rect);
826     }
827   } else {
828     // Upsample extra channels last if color channels are upsampled and all the
829     // extra channels have the same upsampling as them.
830     ec_padding.resize(extra_channels.size());
831     for (size_t ec = 0; ec < extra_channels.size(); ec++) {
832       // Add a border to the extra channel rect for when patches are applied.
833       // This ensures that the correct row is accessed (y values for patches are
834       // relative to rect_for_if, not to input_rect).
835       // As the rect is extended by 0 or 2 pixels, and the patches input has,
836       // accordingly, the same padding, this is safe.
837       Rect r(extra_channels[ec].second.x0() + rect_for_upsampling.x0() -
838                  rect_for_if_storage.x0(),
839              extra_channels[ec].second.y0() + rect_for_upsampling.y0() -
840                  rect_for_if_storage.y0(),
841              extra_channels[ec].second.xsize() + rect_for_if_storage.xsize() -
842                  rect_for_upsampling.xsize(),
843              extra_channels[ec].second.ysize() + rect_for_if_storage.ysize() -
844                  rect_for_upsampling.ysize());
845       extra_channels_for_patches.emplace_back(extra_channels[ec].first, r);
846       // frame_rect can go up to frame_dim.xsize_padded, in VarDCT mode.
847       ec_padding[ec].Init(extra_channels[ec].first, extra_channels[ec].second,
848                           frame_rect.Crop(frame_dim.xsize, frame_dim.ysize),
849                           frame_dim.xsize, frame_dim.ysize, 2, 2,
850                           &ensure_padding_upsampling_ec_y0,
851                           &ensure_padding_upsampling_ec_y1);
852     }
853   }
854 
855   // Initialized to a valid non-null ptr to avoid UB if arithmetic is done with
856   // the pointer value (which would then not be used).
857   std::vector<float*> ec_ptrs_for_patches(extra_channels.size(),
858                                           input->PlaneRow(0, 0));
859 
860   // +----------------------------- STEP 4 ------------------------------+
861   // | Set up the filter pipeline.                                       |
862   // +-------------------------------------------------------------------+
863   if (fp) {
864     ensure_padding_filter.Init(
865         input, rect_for_if_input, rect_for_if, frame_dim.xsize_padded,
866         frame_dim.ysize_padded, lf.Padding(), lf.Padding(),
867         &ensure_padding_filter_y0, &ensure_padding_filter_y1);
868 
869     fp = PrepareFilterPipeline(dec_state, rect_for_if, *input,
870                                rect_for_if_input, frame_dim.ysize_padded,
871                                thread, storage_for_if, rect_for_if_storage);
872   }
873 
874   // +----------------------------- STEP 5 ------------------------------+
875   // | Run the prepared pipeline of operations.                          |
876   // +-------------------------------------------------------------------+
877 
878   // y values are relative to rect_for_if.
879   // Automatic mirroring in fp->ApplyFiltersRow() implies that we should ensure
880   // that padding for the first lines of the image is already present before
881   // calling ApplyFiltersRow() with "virtual" rows.
882   // Here we rely on the fact that virtual rows at the beginning of the image
883   // are only present if input_rect.y0() == 0.
884   ssize_t first_ensure_padding_y = ensure_padding_filter_y0;
885   if (frame_rect.y0() == 0) {
886     JXL_DASSERT(ensure_padding_filter_y0 == 0);
887     first_ensure_padding_y =
888         std::min<ssize_t>(lf.Padding(), ensure_padding_filter_y1);
889     for (ssize_t y = 0; y < first_ensure_padding_y; y++) {
890       ensure_padding_filter.Process3(y);
891     }
892   }
893 
894   for (ssize_t y = -lf.Padding();
895        y < static_cast<ssize_t>(lf.Padding() + rect_for_if.ysize()); y++) {
896     if (fp) {
897       if (y >= first_ensure_padding_y && y < ensure_padding_filter_y1) {
898         ensure_padding_filter.Process3(y);
899       }
900       fp->ApplyFiltersRow(lf, dec_state->filter_weights, y);
901     } else if (output_pixel_data_storage != input) {
902       for (size_t c = 0; c < 3; c++) {
903         memcpy(rect_for_if_storage.PlaneRow(storage_for_if, c, y),
904                rect_for_if_input.ConstPlaneRow(*input, c, y),
905                rect_for_if_input.xsize() * sizeof(float));
906       }
907     }
908     if (y < static_cast<ssize_t>(lf.Padding())) continue;
909     // At this point, row `y - lf.Padding()` of `rect_for_if` has been produced
910     // by the filters.
911     ssize_t available_y = y - lf.Padding();
912     if (frame_header.upsampling == 1) {
913       for (size_t i = 0; i < extra_channels.size(); i++) {
914         ec_ptrs_for_patches[i] = extra_channels_for_patches[i].second.Row(
915             extra_channels_for_patches[i].first, available_y);
916       }
917     }
918     JXL_RETURN_IF_ERROR(image_features.patches.AddTo(
919         storage_for_if, rect_for_if_storage.Line(available_y),
920         ec_ptrs_for_patches.data(), rect_for_if.Line(available_y)));
921     image_features.splines.AddTo(storage_for_if,
922                                  rect_for_if_storage.Line(available_y),
923                                  rect_for_if.Line(available_y));
924     size_t num_ys = 1;
925     if (frame_header.upsampling != 1) {
926       // Upsampling `y` values are relative to `rect_for_upsampling`, not to
927       // `rect_for_if`.
928       ssize_t shifted_y = available_y - extra_rows_t;
929       if (shifted_y >= ensure_padding_upsampling_y0 &&
930           shifted_y < ensure_padding_upsampling_y1) {
931         ensure_padding_upsampling.Process3(shifted_y);
932       }
933       if (late_ec_upsample && shifted_y >= ensure_padding_upsampling_ec_y0 &&
934           shifted_y < ensure_padding_upsampling_ec_y1) {
935         for (size_t ec = 0; ec < extra_channels.size(); ec++) {
936           ec_padding[ec].Process(shifted_y);
937         }
938       }
939       // Upsampling will access two rows of border, so the first upsampling
940       // output will be available after shifted_y is at least 2, *unless* image
941       // height is <= 2.
942       if (shifted_y < 2 &&
943           shifted_y + 1 != static_cast<ssize_t>(frame_rect.ysize())) {
944         continue;
945       }
946       // Value relative to upsampled_frame_rect.
947       size_t input_y = std::max<ssize_t>(shifted_y - 2, 0);
948       size_t upsampled_available_y = frame_header.upsampling * input_y;
949       size_t num_input_rows = 1;
950       // If we are going to mirror the last output rows, then we already have 3
951       // input lines ready. This happens iff we did not extend rect_for_if on
952       // the bottom *and* we are at the last `y` value.
953       if (extra_rows_b != 2 &&
954           static_cast<size_t>(y) + 1 == lf.Padding() + rect_for_if.ysize()) {
955         num_input_rows = 3;
956       }
957       num_input_rows = std::min(num_input_rows, frame_dim.ysize_padded);
958       num_ys = num_input_rows * frame_header.upsampling;
959 
960       if (static_cast<size_t>(upsampled_available_y) >=
961           upsampled_frame_rect.ysize()) {
962         continue;
963       }
964 
965       if (upsampled_available_y + num_ys >= upsampled_frame_rect.ysize()) {
966         num_ys = upsampled_frame_rect.ysize() - upsampled_available_y;
967       }
968 
969       // Upsampler takes care of mirroring, and checks "physical" boundaries.
970       Rect upsample_input_rect = rect_for_upsampling.Lines(input_y, 1);
971       color_upsampler->UpsampleRect(
972           *storage_for_if, upsample_input_rect, output_pixel_data_storage,
973           upsampled_frame_rect_for_storage.Lines(upsampled_available_y, num_ys),
974           static_cast<ssize_t>(frame_rect.y0()) -
975               static_cast<ssize_t>(rect_for_upsampling.y0()),
976           frame_dim.ysize_padded, dec_state->upsampler_storage[thread].get());
977       if (late_ec_upsample) {
978         for (size_t ec = 0; ec < extra_channels.size(); ec++) {
979           // Upsampler takes care of mirroring, and checks "physical"
980           // boundaries.
981           Rect upsample_ec_input_rect =
982               extra_channels[ec].second.Lines(input_y, 1);
983           color_upsampler->UpsampleRect(
984               *extra_channels[ec].first, upsample_ec_input_rect,
985               &output_image->extra_channels()[ec],
986               upsampled_frame_rect.Lines(upsampled_available_y, num_ys),
987               static_cast<ssize_t>(frame_rect.y0()) -
988                   static_cast<ssize_t>(extra_channels[ec].second.y0()),
989               frame_dim.ysize, dec_state->upsampler_storage[thread].get());
990         }
991       }
992       available_y = upsampled_available_y;
993     }
994 
995     if (static_cast<size_t>(available_y) >= upsampled_frame_rect.ysize()) {
996       continue;
997     }
998 
999     // The image data is now unconditionally in
1000     // `output_image_storage:upsampled_frame_rect_for_storage`.
1001     if (frame_header.flags & FrameHeader::kNoise) {
1002       PROFILER_ZONE("AddNoise");
1003       AddNoise(image_features.noise_params,
1004                upsampled_frame_rect.Lines(available_y, num_ys),
1005                dec_state->noise,
1006                upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1007                dec_state->shared_storage.cmap, output_pixel_data_storage);
1008     }
1009 
1010     if (dec_state->pre_color_transform_frame.xsize() != 0) {
1011       for (size_t c = 0; c < 3; c++) {
1012         for (size_t y = available_y; y < available_y + num_ys; y++) {
1013           float* JXL_RESTRICT row_out = upsampled_frame_rect.PlaneRow(
1014               &dec_state->pre_color_transform_frame, c, y);
1015           const float* JXL_RESTRICT row_in =
1016               upsampled_frame_rect_for_storage.ConstPlaneRow(
1017                   *output_pixel_data_storage, c, y);
1018           memcpy(row_out, row_in,
1019                  upsampled_frame_rect.xsize() * sizeof(*row_in));
1020         }
1021       }
1022     }
1023 
1024     // We skip the color transform entirely if save_before_color_transform and
1025     // the frame is not supposed to be displayed.
1026 
1027     if (dec_state->fast_xyb_srgb8_conversion) {
1028       FastXYBTosRGB8(
1029           *output_pixel_data_storage,
1030           upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1031           upsampled_frame_rect.Lines(available_y, num_ys)
1032               .Crop(Rect(0, 0, frame_dim.xsize_upsampled,
1033                          frame_dim.ysize_upsampled)),
1034           alpha, alpha_rect.Lines(available_y, num_ys),
1035           dec_state->rgb_output_is_rgba, dec_state->rgb_output, frame_dim.xsize,
1036           dec_state->rgb_stride);
1037     } else {
1038       if (frame_header.needs_color_transform()) {
1039         if (frame_header.color_transform == ColorTransform::kXYB) {
1040           JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(
1041               output_pixel_data_storage,
1042               upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1043               dec_state->output_encoding_info));
1044         } else if (frame_header.color_transform == ColorTransform::kYCbCr) {
1045           YcbcrToRgb(
1046               *output_pixel_data_storage, output_pixel_data_storage,
1047               upsampled_frame_rect_for_storage.Lines(available_y, num_ys));
1048         }
1049       }
1050 
1051       // TODO(veluca): all blending should happen here.
1052 
1053       if (dec_state->rgb_output != nullptr) {
1054         HWY_DYNAMIC_DISPATCH(FloatToRGBA8)
1055         (*output_pixel_data_storage,
1056          upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1057          dec_state->rgb_output_is_rgba, alpha,
1058          alpha_rect.Lines(available_y, num_ys),
1059          upsampled_frame_rect.Lines(available_y, num_ys)
1060              .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)),
1061          dec_state->rgb_output, dec_state->rgb_stride);
1062       }
1063       if (dec_state->pixel_callback != nullptr) {
1064         Rect alpha_line_rect = alpha_rect.Lines(available_y, num_ys);
1065         Rect color_input_line_rect =
1066             upsampled_frame_rect_for_storage.Lines(available_y, num_ys);
1067         Rect image_line_rect = upsampled_frame_rect.Lines(available_y, num_ys)
1068                                    .Crop(Rect(0, 0, frame_dim.xsize_upsampled,
1069                                               frame_dim.ysize_upsampled));
1070         const float* line_buffers[4];
1071         for (size_t iy = 0; iy < image_line_rect.ysize(); iy++) {
1072           for (size_t c = 0; c < 3; c++) {
1073             line_buffers[c] = color_input_line_rect.ConstPlaneRow(
1074                 *output_pixel_data_storage, c, iy);
1075           }
1076           if (alpha) {
1077             line_buffers[3] = alpha_line_rect.ConstRow(*alpha, iy);
1078           } else {
1079             line_buffers[3] = dec_state->opaque_alpha.data();
1080           }
1081           std::vector<float>& interleaved =
1082               dec_state->pixel_callback_rows[thread];
1083           size_t j = 0;
1084           for (size_t i = 0; i < image_line_rect.xsize(); i++) {
1085             interleaved[j++] = line_buffers[0][i];
1086             interleaved[j++] = line_buffers[1][i];
1087             interleaved[j++] = line_buffers[2][i];
1088             if (dec_state->rgb_output_is_rgba) {
1089               interleaved[j++] = line_buffers[3][i];
1090             }
1091           }
1092           dec_state->pixel_callback(interleaved.data(), image_line_rect.x0(),
1093                                     image_line_rect.y0() + iy,
1094                                     image_line_rect.xsize());
1095         }
1096       }
1097     }
1098   }
1099 
1100   return true;
1101 }
1102 
FinalizeFrameDecoding(ImageBundle * decoded,PassesDecoderState * dec_state,ThreadPool * pool,bool force_fir,bool skip_blending)1103 Status FinalizeFrameDecoding(ImageBundle* decoded,
1104                              PassesDecoderState* dec_state, ThreadPool* pool,
1105                              bool force_fir, bool skip_blending) {
1106   const FrameHeader& frame_header = dec_state->shared->frame_header;
1107   const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
1108 
1109   // FinalizeImageRect was not yet run, or we are forcing a run.
1110   if (!dec_state->EagerFinalizeImageRect() || force_fir) {
1111     std::vector<Rect> rects_to_process;
1112     for (size_t y = 0; y < frame_dim.ysize_padded; y += kGroupDim) {
1113       for (size_t x = 0; x < frame_dim.xsize_padded; x += kGroupDim) {
1114         Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize_padded,
1115                   frame_dim.ysize_padded);
1116         if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1117         rects_to_process.push_back(rect);
1118       }
1119     }
1120     const auto allocate_storage = [&](size_t num_threads) {
1121       dec_state->EnsureStorage(num_threads);
1122       return true;
1123     };
1124 
1125     {
1126       std::vector<ImageF> ecs;
1127       const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
1128       for (size_t i = 0; i < metadata.num_extra_channels; i++) {
1129         if (frame_header.extra_channel_upsampling[i] == 1) {
1130           ecs.push_back(std::move(dec_state->extra_channels[i]));
1131         } else {
1132           ecs.emplace_back(frame_dim.xsize_upsampled_padded,
1133                            frame_dim.ysize_upsampled_padded);
1134         }
1135       }
1136       decoded->SetExtraChannels(std::move(ecs));
1137     }
1138 
1139     std::atomic<bool> apply_features_ok{true};
1140     auto run_apply_features = [&](size_t rect_id, size_t thread) {
1141       size_t xstart = PassesDecoderState::kGroupDataXBorder;
1142       size_t ystart = PassesDecoderState::kGroupDataYBorder;
1143       for (size_t c = 0; c < 3; c++) {
1144         Rect rh(rects_to_process[rect_id].x0() >>
1145                     frame_header.chroma_subsampling.HShift(c),
1146                 rects_to_process[rect_id].y0() >>
1147                     frame_header.chroma_subsampling.VShift(c),
1148                 rects_to_process[rect_id].xsize() >>
1149                     frame_header.chroma_subsampling.HShift(c),
1150                 rects_to_process[rect_id].ysize() >>
1151                     frame_header.chroma_subsampling.VShift(c));
1152         Rect group_data_rect(xstart, ystart, rh.xsize(), rh.ysize());
1153         // Poison the image in this thread to prevent leaking initialized data
1154         // from a previous run in this thread in msan builds.
1155         msan::PoisonImage(dec_state->group_data[thread].Plane(c));
1156         CopyImageToWithPadding(
1157             rh, dec_state->decoded.Plane(c), dec_state->FinalizeRectPadding(),
1158             group_data_rect, &dec_state->group_data[thread].Plane(c));
1159       }
1160       Rect group_data_rect(xstart, ystart, rects_to_process[rect_id].xsize(),
1161                            rects_to_process[rect_id].ysize());
1162       std::vector<std::pair<ImageF*, Rect>> ec_rects;
1163       ec_rects.reserve(decoded->extra_channels().size());
1164       for (size_t i = 0; i < decoded->extra_channels().size(); i++) {
1165         Rect r = ScaleRectForEC(
1166             rects_to_process[rect_id].Crop(frame_dim.xsize, frame_dim.ysize),
1167             frame_header, i);
1168         if (frame_header.extra_channel_upsampling[i] != 1) {
1169           Rect ec_input_rect(kBlockDim, 2, r.xsize(), r.ysize());
1170           auto eti =
1171               &dec_state
1172                    ->ec_temp_images[thread * decoded->extra_channels().size() +
1173                                     i];
1174           // Poison the temp image on this thread to prevent leaking initialized
1175           // data from a previous run in this thread in msan builds.
1176           msan::PoisonImage(*eti);
1177           JXL_CHECK_IMAGE_INITIALIZED(dec_state->extra_channels[i], r);
1178           CopyImageToWithPadding(r, dec_state->extra_channels[i],
1179                                  /*padding=*/2, ec_input_rect, eti);
1180           ec_rects.emplace_back(eti, ec_input_rect);
1181         } else {
1182           JXL_CHECK_IMAGE_INITIALIZED(decoded->extra_channels()[i], r);
1183           ec_rects.emplace_back(&decoded->extra_channels()[i], r);
1184         }
1185       }
1186       if (!FinalizeImageRect(&dec_state->group_data[thread], group_data_rect,
1187                              ec_rects, dec_state, thread, decoded,
1188                              rects_to_process[rect_id])) {
1189         apply_features_ok = false;
1190       }
1191     };
1192 
1193     RunOnPool(pool, 0, rects_to_process.size(), allocate_storage,
1194               run_apply_features, "ApplyFeatures");
1195 
1196     if (!apply_features_ok) {
1197       return JXL_FAILURE("FinalizeImageRect failed");
1198     }
1199   }
1200 
1201   const size_t xsize = frame_dim.xsize_upsampled;
1202   const size_t ysize = frame_dim.ysize_upsampled;
1203 
1204   decoded->ShrinkTo(xsize, ysize);
1205   if (dec_state->pre_color_transform_frame.xsize() != 0) {
1206     dec_state->pre_color_transform_frame.ShrinkTo(xsize, ysize);
1207   }
1208 
1209   if (!skip_blending && ImageBlender::NeedsBlending(dec_state)) {
1210     if (dec_state->pre_color_transform_frame.xsize() != 0) {
1211       // Extra channels are going to be modified. Make a copy.
1212       dec_state->pre_color_transform_ec.clear();
1213       for (const auto& ec : decoded->extra_channels()) {
1214         dec_state->pre_color_transform_ec.emplace_back(CopyImage(ec));
1215       }
1216     }
1217     ImageBlender blender;
1218     ImageBundle foreground = std::move(*decoded);
1219     decoded->SetFromImage(Image3F(frame_header.nonserialized_metadata->xsize(),
1220                                   frame_header.nonserialized_metadata->ysize()),
1221                           foreground.c_current());
1222     std::vector<Rect> extra_channels_rects;
1223     decoded->extra_channels().reserve(foreground.extra_channels().size());
1224     extra_channels_rects.reserve(foreground.extra_channels().size());
1225     for (size_t i = 0; i < foreground.extra_channels().size(); ++i) {
1226       decoded->extra_channels().emplace_back(
1227           frame_header.nonserialized_metadata->xsize(),
1228           frame_header.nonserialized_metadata->ysize());
1229       extra_channels_rects.emplace_back(decoded->extra_channels().back());
1230     }
1231     JXL_RETURN_IF_ERROR(blender.PrepareBlending(
1232         dec_state, foreground.origin, foreground.xsize(), foreground.ysize(),
1233         &frame_header.nonserialized_metadata->m.extra_channel_info,
1234         foreground.c_current(), Rect(*decoded->color()),
1235         /*output=*/decoded->color(), Rect(*decoded->color()),
1236         &decoded->extra_channels(), std::move(extra_channels_rects)));
1237 
1238     std::vector<Rect> rects_to_process;
1239     for (size_t y = 0; y < frame_dim.ysize; y += kGroupDim) {
1240       for (size_t x = 0; x < frame_dim.xsize; x += kGroupDim) {
1241         Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize, frame_dim.ysize);
1242         if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1243         rects_to_process.push_back(rect);
1244       }
1245     }
1246 
1247     std::atomic<bool> blending_ok{true};
1248     JXL_RETURN_IF_ERROR(RunOnPool(
1249         pool, 0, rects_to_process.size(), ThreadPool::SkipInit(),
1250         [&](size_t i, size_t /*thread*/) {
1251           const Rect& rect = rects_to_process[i];
1252           auto rect_blender = blender.PrepareRect(
1253               rect, *foreground.color(), foreground.extra_channels(), rect);
1254           for (size_t y = 0; y < rect.ysize(); ++y) {
1255             if (!rect_blender.DoBlending(y)) {
1256               blending_ok = false;
1257               return;
1258             }
1259           }
1260         },
1261         "Blend"));
1262     JXL_RETURN_IF_ERROR(blending_ok.load());
1263   }
1264 
1265   return true;
1266 }
1267 
1268 }  // namespace jxl
1269 #endif  // HWY_ONCE
1270