1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 #include "lib/jxl/dec_reconstruct.h"
7 
8 #include <atomic>
9 #include <utility>
10 
11 #include "lib/jxl/filters.h"
12 #include "lib/jxl/image_ops.h"
13 
14 #undef HWY_TARGET_INCLUDE
15 #define HWY_TARGET_INCLUDE "lib/jxl/dec_reconstruct.cc"
16 #include <hwy/foreach_target.h>
17 #include <hwy/highway.h>
18 
19 #include "lib/jxl/aux_out.h"
20 #include "lib/jxl/base/data_parallel.h"
21 #include "lib/jxl/base/profiler.h"
22 #include "lib/jxl/blending.h"
23 #include "lib/jxl/color_management.h"
24 #include "lib/jxl/common.h"
25 #include "lib/jxl/dec_noise.h"
26 #include "lib/jxl/dec_upsample.h"
27 #include "lib/jxl/dec_xyb-inl.h"
28 #include "lib/jxl/dec_xyb.h"
29 #include "lib/jxl/epf.h"
30 #include "lib/jxl/fast_math-inl.h"
31 #include "lib/jxl/frame_header.h"
32 #include "lib/jxl/loop_filter.h"
33 #include "lib/jxl/passes_state.h"
34 #include "lib/jxl/sanitizers.h"
35 #include "lib/jxl/transfer_functions-inl.h"
36 HWY_BEFORE_NAMESPACE();
37 namespace jxl {
38 namespace HWY_NAMESPACE {
39 
40 template <typename Op>
DoUndoXYBInPlace(Image3F * idct,const Rect & rect,Op op,const OutputEncodingInfo & output_encoding_info)41 void DoUndoXYBInPlace(Image3F* idct, const Rect& rect, Op op,
42                       const OutputEncodingInfo& output_encoding_info) {
43   // TODO(eustas): should it still be capped?
44   const HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound) d;
45   const size_t xsize = rect.xsize();
46   const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
47   // The size of `rect` might not be a multiple of Lanes(d), but is guaranteed
48   // to be a multiple of kBlockDim or at the margin of the image.
49   for (size_t y = 0; y < rect.ysize(); y++) {
50     float* JXL_RESTRICT row0 = rect.PlaneRow(idct, 0, y);
51     float* JXL_RESTRICT row1 = rect.PlaneRow(idct, 1, y);
52     float* JXL_RESTRICT row2 = rect.PlaneRow(idct, 2, y);
53     // All calculations are lane-wise, still some might require value-dependent
54     // behaviour (e.g. NearestInt). Temporary unposion last vector tail.
55     msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
56     msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
57     msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
58     for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
59       const auto in_opsin_x = Load(d, row0 + x);
60       const auto in_opsin_y = Load(d, row1 + x);
61       const auto in_opsin_b = Load(d, row2 + x);
62       auto r = Undefined(d);
63       auto g = Undefined(d);
64       auto b = Undefined(d);
65       XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
66                output_encoding_info.opsin_params, &r, &g, &b);
67       op.Transform(d, &r, &g, &b);
68       Store(r, d, row0 + x);
69       Store(g, d, row1 + x);
70       Store(b, d, row2 + x);
71     }
72     msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
73     msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
74     msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
75   }
76 }
77 
78 template <typename Op>
79 struct PerChannelOp {
80   template <typename... Args>
PerChannelOpjxl::HWY_NAMESPACE::PerChannelOp81   explicit PerChannelOp(Args&&... args) : op(std::forward<Args>(args)...) {}
82   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::PerChannelOp83   void Transform(D d, T* r, T* g, T* b) {
84     *r = op.Transform(d, *r);
85     *g = op.Transform(d, *g);
86     *b = op.Transform(d, *b);
87   }
88 
89   Op op;
90 };
91 
92 struct OpLinear {
93   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpLinear94   T Transform(D d, const T& linear) {
95     return linear;
96   }
97 };
98 
99 struct OpRgb {
100   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpRgb101   T Transform(D d, const T& linear) {
102 #if JXL_HIGH_PRECISION
103     return TF_SRGB().EncodedFromDisplay(d, linear);
104 #else
105     return FastLinearToSRGB(d, linear);
106 #endif
107   }
108 };
109 
110 struct OpPq {
111   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpPq112   T Transform(D d, const T& linear) {
113     return TF_PQ().EncodedFromDisplay(d, linear);
114   }
115 };
116 
117 struct OpHlg {
OpHlgjxl::HWY_NAMESPACE::OpHlg118   explicit OpHlg(const float luminances[3], const float intensity_target)
119       : luminances(luminances) {
120     if (295 <= intensity_target && intensity_target <= 305) {
121       apply_inverse_ootf = false;
122       return;
123     }
124     exponent =
125         (1 / 1.2f) * std::pow(1.111f, -std::log2(intensity_target * 1e-3f)) - 1;
126   }
127   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpHlg128   void Transform(D d, T* r, T* g, T* b) {
129     if (apply_inverse_ootf) {
130       const T luminance = Set(d, luminances[0]) * *r +
131                           Set(d, luminances[1]) * *g +
132                           Set(d, luminances[2]) * *b;
133       const T ratio =
134           Min(FastPowf(d, luminance, Set(d, exponent)), Set(d, 1e9));
135       *r *= ratio;
136       *g *= ratio;
137       *b *= ratio;
138     }
139     *r = TF_HLG().EncodedFromDisplay(d, *r);
140     *g = TF_HLG().EncodedFromDisplay(d, *g);
141     *b = TF_HLG().EncodedFromDisplay(d, *b);
142   }
143   bool apply_inverse_ootf = true;
144   const float* luminances;
145   float exponent;
146 };
147 
148 struct Op709 {
149   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::Op709150   T Transform(D d, const T& linear) {
151     return TF_709().EncodedFromDisplay(d, linear);
152   }
153 };
154 
155 struct OpGamma {
OpGammajxl::HWY_NAMESPACE::OpGamma156   explicit OpGamma(const float inverse_gamma) : inverse_gamma(inverse_gamma) {}
157   const float inverse_gamma;
158   template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpGamma159   T Transform(D d, const T& linear) {
160     return IfThenZeroElse(linear <= Set(d, 1e-5f),
161                           FastPowf(d, linear, Set(d, inverse_gamma)));
162   }
163 };
164 
UndoXYBInPlace(Image3F * idct,const Rect & rect,const OutputEncodingInfo & output_encoding_info)165 Status UndoXYBInPlace(Image3F* idct, const Rect& rect,
166                       const OutputEncodingInfo& output_encoding_info) {
167   PROFILER_ZONE("UndoXYB");
168 
169   if (output_encoding_info.color_encoding.tf.IsLinear()) {
170     DoUndoXYBInPlace(idct, rect, PerChannelOp<OpLinear>(),
171                      output_encoding_info);
172   } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
173     DoUndoXYBInPlace(idct, rect, PerChannelOp<OpRgb>(), output_encoding_info);
174   } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
175     DoUndoXYBInPlace(idct, rect, PerChannelOp<OpPq>(), output_encoding_info);
176   } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
177     DoUndoXYBInPlace(idct, rect,
178                      OpHlg(output_encoding_info.luminances,
179                            output_encoding_info.intensity_target),
180                      output_encoding_info);
181   } else if (output_encoding_info.color_encoding.tf.Is709()) {
182     DoUndoXYBInPlace(idct, rect, PerChannelOp<Op709>(), output_encoding_info);
183   } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
184              output_encoding_info.color_encoding.tf.IsDCI()) {
185     DoUndoXYBInPlace(idct, rect,
186                      PerChannelOp<OpGamma>(output_encoding_info.inverse_gamma),
187                      output_encoding_info);
188   } else {
189     // This is a programming error.
190     JXL_ABORT("Invalid target encoding");
191   }
192   return true;
193 }
194 
195 template <typename D, typename V>
StoreRGBA(D d,V r,V g,V b,V a,bool alpha,size_t n,size_t extra,uint8_t * buf)196 void StoreRGBA(D d, V r, V g, V b, V a, bool alpha, size_t n, size_t extra,
197                uint8_t* buf) {
198 #if HWY_TARGET == HWY_SCALAR
199   buf[0] = r.raw;
200   buf[1] = g.raw;
201   buf[2] = b.raw;
202   if (alpha) {
203     buf[3] = a.raw;
204   }
205 #elif HWY_TARGET == HWY_NEON
206   if (alpha) {
207     uint8x8x4_t data = {r.raw, g.raw, b.raw, a.raw};
208     if (extra >= 8) {
209       vst4_u8(buf, data);
210     } else {
211       uint8_t tmp[8 * 4];
212       vst4_u8(tmp, data);
213       memcpy(buf, tmp, n * 4);
214     }
215   } else {
216     uint8x8x3_t data = {r.raw, g.raw, b.raw};
217     if (extra >= 8) {
218       vst3_u8(buf, data);
219     } else {
220       uint8_t tmp[8 * 3];
221       vst3_u8(tmp, data);
222       memcpy(buf, tmp, n * 3);
223     }
224   }
225 #else
226   // TODO(veluca): implement this for x86.
227   size_t mul = alpha ? 4 : 3;
228   HWY_ALIGN uint8_t bytes[16];
229   Store(r, d, bytes);
230   for (size_t i = 0; i < n; i++) {
231     buf[mul * i] = bytes[i];
232   }
233   Store(g, d, bytes);
234   for (size_t i = 0; i < n; i++) {
235     buf[mul * i + 1] = bytes[i];
236   }
237   Store(b, d, bytes);
238   for (size_t i = 0; i < n; i++) {
239     buf[mul * i + 2] = bytes[i];
240   }
241   if (alpha) {
242     Store(a, d, bytes);
243     for (size_t i = 0; i < n; i++) {
244       buf[4 * i + 3] = bytes[i];
245     }
246   }
247 #endif
248 }
249 
250 // Outputs floating point image to RGBA 8-bit buffer. Does not support alpha
251 // channel in the input, but outputs opaque alpha channel for the case where the
252 // output buffer to write to is in the 4-byte per pixel RGBA format.
FloatToRGBA8(const Image3F & input,const Rect & input_rect,bool is_rgba,const ImageF * alpha_in,const Rect & alpha_rect,const Rect & output_buf_rect,uint8_t * JXL_RESTRICT output_buf,size_t stride)253 void FloatToRGBA8(const Image3F& input, const Rect& input_rect, bool is_rgba,
254                   const ImageF* alpha_in, const Rect& alpha_rect,
255                   const Rect& output_buf_rect, uint8_t* JXL_RESTRICT output_buf,
256                   size_t stride) {
257   size_t bytes = is_rgba ? 4 : 3;
258   for (size_t y = 0; y < output_buf_rect.ysize(); y++) {
259     const float* JXL_RESTRICT row_in_r = input_rect.ConstPlaneRow(input, 0, y);
260     const float* JXL_RESTRICT row_in_g = input_rect.ConstPlaneRow(input, 1, y);
261     const float* JXL_RESTRICT row_in_b = input_rect.ConstPlaneRow(input, 2, y);
262     const float* JXL_RESTRICT row_in_a =
263         alpha_in ? alpha_rect.ConstRow(*alpha_in, y) : nullptr;
264     size_t base_ptr =
265         (y + output_buf_rect.y0()) * stride + bytes * output_buf_rect.x0();
266     using D = HWY_CAPPED(float, 4);
267     const D d;
268     D::Rebind<uint32_t> du;
269     auto zero = Zero(d);
270     auto one = Set(d, 1.0f);
271     auto mul = Set(d, 255.0f);
272 
273     // All calculations are lane-wise, still some might require value-dependent
274     // behaviour (e.g. NearestInt). Temporary unposion last vector tail.
275     size_t xsize = output_buf_rect.xsize();
276     size_t xsize_v = RoundUpTo(xsize, Lanes(d));
277     msan::UnpoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize));
278     msan::UnpoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize));
279     msan::UnpoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize));
280     if (row_in_a)
281       msan::UnpoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize));
282     for (size_t x = 0; x < xsize; x += Lanes(d)) {
283       auto rf = Clamp(zero, Load(d, row_in_r + x), one) * mul;
284       auto gf = Clamp(zero, Load(d, row_in_g + x), one) * mul;
285       auto bf = Clamp(zero, Load(d, row_in_b + x), one) * mul;
286       auto af = row_in_a ? Clamp(zero, Load(d, row_in_a + x), one) * mul
287                          : Set(d, 255.0f);
288       auto r8 = U8FromU32(BitCast(du, NearestInt(rf)));
289       auto g8 = U8FromU32(BitCast(du, NearestInt(gf)));
290       auto b8 = U8FromU32(BitCast(du, NearestInt(bf)));
291       auto a8 = U8FromU32(BitCast(du, NearestInt(af)));
292       size_t n = output_buf_rect.xsize() - x;
293       if (JXL_LIKELY(n >= Lanes(d))) {
294         StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, Lanes(d), n,
295                   output_buf + base_ptr + bytes * x);
296       } else {
297         StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, n, n,
298                   output_buf + base_ptr + bytes * x);
299       }
300     }
301     msan::PoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize));
302     msan::PoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize));
303     msan::PoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize));
304     if (row_in_a)
305       msan::PoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize));
306   }
307 }
308 
309 // Upsample in horizonal (if hs=1) and vertical (if vs=1) the plane_in image
310 // to the output plane_out image.
311 // The output region "rect" in plane_out and a border around it of lf.Padding()
312 // will be generated, as long as those pixels fall inside the image frame.
313 // Otherwise the border pixels that fall outside the image frame in plane_out
314 // are undefined.
315 // "rect" is an area inside the plane_out image which corresponds to the
316 // "frame_rect" area in the frame. plane_in and plane_out both are expected to
317 // have a padding of kGroupDataXBorder and kGroupDataYBorder on either side of
318 // X and Y coordinates. This means that when upsampling vertically the plane_out
319 // row `kGroupDataXBorder + N` will be generated from the plane_in row
320 // `kGroupDataXBorder + N / 2` (and a previous or next row).
DoYCbCrUpsampling(size_t hs,size_t vs,ImageF * plane_in,const Rect & rect,const Rect & frame_rect,const FrameDimensions & frame_dim,ImageF * plane_out,const LoopFilter & lf,ImageF * temp)321 void DoYCbCrUpsampling(size_t hs, size_t vs, ImageF* plane_in, const Rect& rect,
322                        const Rect& frame_rect, const FrameDimensions& frame_dim,
323                        ImageF* plane_out, const LoopFilter& lf, ImageF* temp) {
324   JXL_DASSERT(SameSize(rect, frame_rect));
325   JXL_DASSERT(hs <= 1 && vs <= 1);
326   // The pixel in (xoff, yoff) is the origin of the downsampling coordinate
327   // system.
328   size_t xoff = PassesDecoderState::kGroupDataXBorder;
329   size_t yoff = PassesDecoderState::kGroupDataYBorder;
330 
331   // This X,Y range is the intersection between the requested "rect" expanded
332   // with a lf.Padding() all around and the image frame translated to the
333   // coordinate system used by plane_out.
334   // All the pixels in the [x0, x1) x [y0, y1) range must be defined in the
335   // plane_out output at the end.
336   const size_t y0 = rect.y0() - std::min<size_t>(lf.Padding(), frame_rect.y0());
337   const size_t y1 = rect.y0() +
338                     std::min(frame_rect.y0() + rect.ysize() + lf.Padding(),
339                              frame_dim.ysize_padded) -
340                     frame_rect.y0();
341 
342   const size_t x0 = rect.x0() - std::min<size_t>(lf.Padding(), frame_rect.x0());
343   const size_t x1 = rect.x0() +
344                     std::min(frame_rect.x0() + rect.xsize() + lf.Padding(),
345                              frame_dim.xsize_padded) -
346                     frame_rect.x0();
347 
348   if (hs == 0 && vs == 0) {
349     Rect r(x0, y0, x1 - x0, y1 - y0);
350     JXL_CHECK_IMAGE_INITIALIZED(*plane_in, r);
351     CopyImageTo(r, *plane_in, r, plane_out);
352     return;
353   }
354   // Prepare padding if we are on a border.
355   // Copy the whole row/column here: it is likely similarly fast and ensures
356   // that we don't forget some parts of padding.
357   if (frame_rect.x0() == 0) {
358     for (size_t y = 0; y < plane_in->ysize(); y++) {
359       plane_in->Row(y)[rect.x0() - 1] = plane_in->Row(y)[rect.x0()];
360     }
361   }
362   if (frame_rect.x0() + x1 - rect.x0() >= frame_dim.xsize_padded) {
363     ssize_t borderx = static_cast<ssize_t>(x1 - xoff + hs) / (1 << hs) + xoff;
364     for (size_t y = 0; y < plane_in->ysize(); y++) {
365       plane_in->Row(y)[borderx] = plane_in->Row(y)[borderx - 1];
366     }
367   }
368   if (frame_rect.y0() == 0) {
369     memcpy(plane_in->Row(rect.y0() - 1), plane_in->Row(rect.y0()),
370            plane_in->xsize() * sizeof(float));
371   }
372   if (frame_rect.y0() + y1 - rect.y0() >= frame_dim.ysize_padded) {
373     ssize_t bordery = static_cast<ssize_t>(y1 - yoff + vs) / (1 << vs) + yoff;
374     memcpy(plane_in->Row(bordery), plane_in->Row(bordery - 1),
375            plane_in->xsize() * sizeof(float));
376   }
377   if (hs == 1) {
378     // Limited to 4 for Interleave*.
379     HWY_CAPPED(float, 4) d;
380     auto threefour = Set(d, 0.75f);
381     auto onefour = Set(d, 0.25f);
382     size_t orig_y0 = y0;
383     size_t orig_y1 = y1;
384     if (vs != 0) {
385       orig_y0 = (y0 >> 1) + (yoff >> 1) - 1;
386       orig_y1 = (y1 >> 1) + (yoff >> 1) + 1;
387     }
388     for (size_t y = orig_y0; y < orig_y1; y++) {
389       const float* in = plane_in->Row(y);
390       float* out = temp->Row(y);
391       for (size_t x = x0 / (2 * Lanes(d)) * 2 * Lanes(d);
392            x < RoundUpTo(x1, 2 * Lanes(d)); x += 2 * Lanes(d)) {
393         size_t ox = (x >> 1) + (xoff >> 1);
394         auto current = Load(d, in + ox) * threefour;
395         auto prev = LoadU(d, in + ox - 1);
396         auto next = LoadU(d, in + ox + 1);
397         auto left = MulAdd(onefour, prev, current);
398         auto right = MulAdd(onefour, next, current);
399 #if HWY_TARGET == HWY_SCALAR
400         Store(left, d, out + x);
401         Store(right, d, out + x + 1);
402 #else
403         Store(InterleaveLower(d, left, right), d, out + x);
404         Store(InterleaveUpper(d, left, right), d, out + x + Lanes(d));
405 #endif
406       }
407     }
408   } else {
409     CopyImageTo(*plane_in, temp);
410   }
411   if (vs == 1) {
412     HWY_FULL(float) d;
413     auto threefour = Set(d, 0.75f);
414     auto onefour = Set(d, 0.25f);
415     for (size_t y = y0; y < y1; y++) {
416       size_t oy1 = (y >> 1) + (yoff >> 1);
417       if ((y & 1) == 1) oy1++;
418       size_t oy0 = oy1 - 1;
419       const float* in0 = temp->Row(oy0);
420       const float* in1 = temp->Row(oy1);
421       float* out = plane_out->Row(y);
422       if ((y & 1) == 1) {
423         for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d));
424              x += Lanes(d)) {
425           auto i0 = Load(d, in0 + x);
426           auto i1 = Load(d, in1 + x);
427           auto o = MulAdd(i0, threefour, i1 * onefour);
428           Store(o, d, out + x);
429         }
430       } else {
431         for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d));
432              x += Lanes(d)) {
433           auto i0 = Load(d, in0 + x);
434           auto i1 = Load(d, in1 + x);
435           auto o = MulAdd(i0, onefour, i1 * threefour);
436           Store(o, d, out + x);
437         }
438       }
439     }
440   } else {
441     CopyImageTo(*temp, plane_out);
442   }
443 
444   // The output must be initialized including the lf.Padding() around the image
445   // for all the pixels that fall inside the image frame.
446   JXL_CHECK_IMAGE_INITIALIZED(*plane_out, Rect(x0, y0, x1 - x0, y1 - y0));
447 }
448 
449 // NOLINTNEXTLINE(google-readability-namespace-comments)
450 }  // namespace HWY_NAMESPACE
451 }  // namespace jxl
452 HWY_AFTER_NAMESPACE();
453 
454 #if HWY_ONCE
455 namespace jxl {
456 
457 HWY_EXPORT(UndoXYBInPlace);
458 HWY_EXPORT(FloatToRGBA8);
459 HWY_EXPORT(DoYCbCrUpsampling);
460 
UndoXYB(const Image3F & src,Image3F * dst,const OutputEncodingInfo & output_info,ThreadPool * pool)461 void UndoXYB(const Image3F& src, Image3F* dst,
462              const OutputEncodingInfo& output_info, ThreadPool* pool) {
463   CopyImageTo(src, dst);
464   JXL_CHECK(RunOnPool(
465       pool, 0, src.ysize(), ThreadPool::NoInit,
466       [&](const uint32_t y, size_t /*thread*/) {
467         JXL_CHECK(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(dst, Rect(*dst).Line(y),
468                                                        output_info));
469       },
470       "UndoXYB"));
471 }
472 
473 namespace {
ScaleRectForEC(Rect in,const FrameHeader & frame_header,size_t ec)474 Rect ScaleRectForEC(Rect in, const FrameHeader& frame_header, size_t ec) {
475   auto s = [&](size_t x) {
476     return DivCeil(x * frame_header.upsampling,
477                    frame_header.extra_channel_upsampling[ec]);
478   };
479   // For x0 and y0 the DivCeil is actually an exact division.
480   return Rect(s(in.x0()), s(in.y0()), s(in.xsize()), s(in.ysize()));
481 }
482 
483 // Implements EnsurePaddingInPlace, but allows processing data one row at a
484 // time.
485 class EnsurePaddingInPlaceRowByRow {
Init(const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)486   void Init(const Rect& rect, const Rect& image_rect, size_t image_xsize,
487             size_t image_ysize, size_t xpadding, size_t ypadding, ssize_t* y0,
488             ssize_t* y1) {
489     // coordinates relative to rect.
490     JXL_ASSERT(SameSize(rect, image_rect));
491     JXL_ASSERT(image_rect.x0() + image_rect.xsize() <= image_xsize);
492     JXL_ASSERT(image_rect.y0() + image_rect.ysize() <= image_ysize);
493     *y0 = -std::min(image_rect.y0(), ypadding);
494     *y1 = rect.ysize() + std::min(ypadding, image_ysize - image_rect.ysize() -
495                                                 image_rect.y0());
496     if (image_rect.x0() >= xpadding &&
497         image_rect.x0() + image_rect.xsize() + xpadding <= image_xsize) {
498       // Nothing to do.
499       strategy_ = kSkip;
500     } else if (image_xsize >= 2 * xpadding) {
501       strategy_ = kFast;
502     } else {
503       strategy_ = kSlow;
504     }
505     y0_ = rect.y0();
506     JXL_ASSERT(rect.x0() >= xpadding);
507     x0_ = x1_ = rect.x0() - xpadding;
508     // If close to the left border - do mirroring.
509     if (image_rect.x0() < xpadding) x1_ = rect.x0() - image_rect.x0();
510     x2_ = x3_ = rect.x0() + rect.xsize() + xpadding;
511     // If close to the right border - do mirroring.
512     if (image_rect.x0() + image_rect.xsize() + xpadding > image_xsize) {
513       x2_ = rect.x0() + image_xsize - image_rect.x0();
514     }
515     JXL_ASSERT(x0_ <= x1_);
516     JXL_ASSERT(x1_ <= x2_);
517     JXL_ASSERT(x2_ <= x3_);
518     JXL_ASSERT(image_xsize == (x2_ - x1_) ||
519                (x1_ - x0_ <= x2_ - x1_ && x3_ - x2_ <= x2_ - x1_));
520   }
521 
522  public:
Init(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)523   void Init(Image3F* img, const Rect& rect, const Rect& image_rect,
524             size_t image_xsize, size_t image_ysize, size_t xpadding,
525             size_t ypadding, ssize_t* y0, ssize_t* y1) {
526     Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
527          y1);
528     img3_ = img;
529     JXL_DASSERT(x3_ <= img->xsize());
530   }
Init(ImageF * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding,ssize_t * y0,ssize_t * y1)531   void Init(ImageF* img, const Rect& rect, const Rect& image_rect,
532             size_t image_xsize, size_t image_ysize, size_t xpadding,
533             size_t ypadding, ssize_t* y0, ssize_t* y1) {
534     Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
535          y1);
536     img_ = img;
537     JXL_DASSERT(x3_ <= img->xsize());
538   }
539   // To be called when row `y` of the input is available, for all the values in
540   // [*y0, *y1).
Process3(ssize_t y)541   void Process3(ssize_t y) {
542     JXL_DASSERT(img3_);
543     for (size_t c = 0; c < 3; c++) {
544       img_ = &img3_->Plane(c);
545       Process(y);
546     }
547   }
Process(ssize_t y)548   void Process(ssize_t y) {
549     JXL_DASSERT(img_);
550     switch (strategy_) {
551       case kSkip:
552         break;
553       case kFast: {
554         // Image is wide enough that a single Mirror() step is sufficient.
555         float* JXL_RESTRICT row = img_->Row(y + y0_);
556         for (size_t x = x0_; x < x1_; x++) {
557           row[x] = row[2 * x1_ - x - 1];
558         }
559         for (size_t x = x2_; x < x3_; x++) {
560           row[x] = row[2 * x2_ - x - 1];
561         }
562         break;
563       }
564       case kSlow: {
565         // Slow case for small images.
566         float* JXL_RESTRICT row = img_->Row(y + y0_) + x1_;
567         for (ssize_t x = x0_ - x1_; x < 0; x++) {
568           *(row + x) = row[Mirror(x, x2_ - x1_)];
569         }
570         for (size_t x = x2_ - x1_; x < x3_ - x1_; x++) {
571           *(row + x) = row[Mirror(x, x2_ - x1_)];
572         }
573         break;
574       }
575     }
576   }
577 
578  private:
579   // Initialized to silence spurious compiler warnings.
580   Image3F* img3_ = nullptr;
581   ImageF* img_ = nullptr;
582   // Will fill [x0_, x1_) and [x2_, x3_) on every row.
583   // The [x1_, x2_) range contains valid image pixels. We guarantee that either
584   // x1_ - x0_ <= x2_ - x1_, (and similarly for x2_, x3_), or that the [x1_,
585   // x2_) contains a full horizontal line of the original image.
586   size_t x0_ = 0, x1_ = 0, x2_ = 0, x3_ = 0;
587   size_t y0_ = 0;
588   // kSlow: use calls to Mirror(), for the case where the border might be larger
589   // than the image.
590   // kFast: directly use the result of Mirror() when it can be computed in a
591   // single iteration.
592   // kSkip: do nothing.
593   enum Strategy { kFast, kSlow, kSkip };
594   Strategy strategy_ = kSkip;
595 };
596 }  // namespace
597 
EnsurePaddingInPlace(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)598 void EnsurePaddingInPlace(Image3F* img, const Rect& rect,
599                           const Rect& image_rect, size_t image_xsize,
600                           size_t image_ysize, size_t xpadding,
601                           size_t ypadding) {
602   ssize_t y0, y1;
603   EnsurePaddingInPlaceRowByRow impl;
604   impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
605             &y0, &y1);
606   for (ssize_t y = y0; y < y1; y++) {
607     impl.Process3(y);
608   }
609 }
610 
EnsurePaddingInPlace(ImageF * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)611 void EnsurePaddingInPlace(ImageF* img, const Rect& rect, const Rect& image_rect,
612                           size_t image_xsize, size_t image_ysize,
613                           size_t xpadding, size_t ypadding) {
614   ssize_t y0, y1;
615   EnsurePaddingInPlaceRowByRow impl;
616   impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
617             &y0, &y1);
618   for (ssize_t y = y0; y < y1; y++) {
619     impl.Process(y);
620   }
621 }
622 
FinalizeImageRect(Image3F * input_image,const Rect & input_rect,const std::vector<std::pair<ImageF *,Rect>> & extra_channels,PassesDecoderState * dec_state,size_t thread,ImageBundle * JXL_RESTRICT output_image,const Rect & frame_rect)623 Status FinalizeImageRect(
624     Image3F* input_image, const Rect& input_rect,
625     const std::vector<std::pair<ImageF*, Rect>>& extra_channels,
626     PassesDecoderState* dec_state, size_t thread,
627     ImageBundle* JXL_RESTRICT output_image, const Rect& frame_rect) {
628   // Do nothing if using the rendering pipeline.
629   if (dec_state->render_pipeline) {
630     return true;
631   }
632   const ImageFeatures& image_features = dec_state->shared->image_features;
633   const FrameHeader& frame_header = dec_state->shared->frame_header;
634   const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
635   const LoopFilter& lf = frame_header.loop_filter;
636   const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
637   JXL_DASSERT(frame_rect.xsize() <= kApplyImageFeaturesTileDim);
638   JXL_DASSERT(frame_rect.ysize() <= kApplyImageFeaturesTileDim);
639   JXL_DASSERT(input_rect.xsize() == frame_rect.xsize());
640   JXL_DASSERT(input_rect.ysize() == frame_rect.ysize());
641   JXL_DASSERT(frame_rect.x0() % GroupBorderAssigner::kPaddingXRound == 0);
642   JXL_DASSERT(frame_rect.xsize() % GroupBorderAssigner::kPaddingXRound == 0 ||
643               frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize ||
644               frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize_padded);
645 
646   // +----------------------------- STEP 1 ------------------------------+
647   // | Compute the rects on which patches and splines will be applied.   |
648   // | In case we are applying upsampling, we need to apply patches on a |
649   // | slightly larger image.                                            |
650   // +-------------------------------------------------------------------+
651 
652   // If we are applying upsampling, we need 2 more pixels around the actual rect
653   // for border. Thus, we also need to apply patches and splines to those
654   // pixels. We compute here
655   // - The portion of image that corresponds to the area we are applying IF.
656   //   (rect_for_if)
657   // - The rect where that pixel data is stored in upsampling_input_storage.
658   //   (rect_for_if_storage)
659   // - The rect where the pixel data that we need to upsample is stored.
660   //   (rect_for_upsampling)
661   // - The source rect for the pixel data in `input_image`. It is assumed that,
662   //   if `frame_rect` is not on an image border, `input_image:input_rect` has
663   //   enough border available. (rect_for_if_input)
664 
665   Image3F* output_color =
666       dec_state->rgb_output == nullptr && dec_state->pixel_callback == nullptr
667           ? output_image->color()
668           : nullptr;
669 
670   Image3F* storage_for_if = output_color;
671   Rect rect_for_if = frame_rect;
672   Rect rect_for_if_storage = frame_rect;
673   Rect rect_for_upsampling = frame_rect;
674   Rect rect_for_if_input = input_rect;
675   // The same as rect_for_if_input but in the frame coordinates.
676   Rect frame_rect_for_ycbcr_upsampling = frame_rect;
677   size_t extra_rows_t = 0;
678   size_t extra_rows_b = 0;
679   if (frame_header.upsampling != 1) {
680     size_t ifbx0 = 0;
681     size_t ifbx1 = 0;
682     size_t ifby0 = 0;
683     size_t ifby1 = 0;
684     if (frame_rect.x0() >= 2) {
685       JXL_DASSERT(input_rect.x0() >= 2);
686       ifbx0 = 2;
687     }
688     if (frame_rect.y0() >= 2) {
689       JXL_DASSERT(input_rect.y0() >= 2);
690       extra_rows_t = ifby0 = 2;
691     }
692     for (size_t extra : {1, 2}) {
693       if (frame_rect.x0() + frame_rect.xsize() + extra <=
694           dec_state->shared->frame_dim.xsize_padded) {
695         JXL_DASSERT(input_rect.x0() + input_rect.xsize() + extra <=
696                     input_image->xsize());
697         ifbx1 = extra;
698       }
699       if (frame_rect.y0() + frame_rect.ysize() + extra <=
700           dec_state->shared->frame_dim.ysize_padded) {
701         JXL_DASSERT(input_rect.y0() + input_rect.ysize() + extra <=
702                     input_image->ysize());
703         extra_rows_b = ifby1 = extra;
704       }
705     }
706     rect_for_if = Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
707                        frame_rect.xsize() + ifbx0 + ifbx1,
708                        frame_rect.ysize() + ifby0 + ifby1);
709     // Storage for pixel data does not necessarily start at (0, 0) as we need to
710     // have the left border of upsampling_rect aligned to a multiple of
711     // GroupBorderAssigner::kPaddingXRound.
712     rect_for_if_storage =
713         Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound) -
714                  ifbx0,
715              kBlockDim, rect_for_if.xsize(), rect_for_if.ysize());
716     rect_for_upsampling =
717         Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound),
718              kBlockDim + ifby0, frame_rect.xsize(), frame_rect.ysize());
719     rect_for_if_input =
720         Rect(input_rect.x0() - ifbx0, input_rect.y0() - ifby0,
721              rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
722     frame_rect_for_ycbcr_upsampling =
723         Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
724              rect_for_if_input.xsize(), rect_for_if_input.ysize());
725     storage_for_if = &dec_state->upsampling_input_storage[thread];
726   }
727 
728   // +--------------------------- STEP 1.5 ------------------------------+
729   // | Perform YCbCr upsampling if needed.                               |
730   // +-------------------------------------------------------------------+
731 
732   Image3F* input = input_image;
733   if (!frame_header.chroma_subsampling.Is444()) {
734     for (size_t c = 0; c < 3; c++) {
735       size_t vs = frame_header.chroma_subsampling.VShift(c);
736       size_t hs = frame_header.chroma_subsampling.HShift(c);
737       // The per-thread output is used for the first time here. Poison the temp
738       // image on this thread to prevent leaking initialized data from a
739       // previous run in this thread in msan builds.
740       msan::PoisonImage(dec_state->ycbcr_out_images[thread].Plane(c));
741       HWY_DYNAMIC_DISPATCH(DoYCbCrUpsampling)
742       (hs, vs, &input_image->Plane(c), rect_for_if_input,
743        frame_rect_for_ycbcr_upsampling, frame_dim,
744        &dec_state->ycbcr_out_images[thread].Plane(c), lf,
745        &dec_state->ycbcr_temp_images[thread]);
746     }
747     input = &dec_state->ycbcr_out_images[thread];
748   }
749 
750   // Variables for upsampling and filtering.
751   Rect upsampled_frame_rect(frame_rect.x0() * frame_header.upsampling,
752                             frame_rect.y0() * frame_header.upsampling,
753                             frame_rect.xsize() * frame_header.upsampling,
754                             frame_rect.ysize() * frame_header.upsampling);
755   Rect full_frame_rect(0, 0, frame_dim.xsize_upsampled,
756                        frame_dim.ysize_upsampled);
757   upsampled_frame_rect = upsampled_frame_rect.Crop(full_frame_rect);
758   EnsurePaddingInPlaceRowByRow ensure_padding_upsampling;
759   ssize_t ensure_padding_upsampling_y0 = 0;
760   ssize_t ensure_padding_upsampling_y1 = 0;
761 
762   EnsurePaddingInPlaceRowByRow ensure_padding_filter;
763   FilterPipeline* fp = nullptr;
764   ssize_t ensure_padding_filter_y0 = 0;
765   ssize_t ensure_padding_filter_y1 = 0;
766   if (lf.epf_iters != 0 || lf.gab) {
767     fp = &dec_state->filter_pipelines[thread];
768   }
769 
770   // +----------------------------- STEP 2 ------------------------------+
771   // | Change rects and buffer to not use `output_image` if direct       |
772   // | output to rgb8 is requested.                                      |
773   // +-------------------------------------------------------------------+
774   Image3F* output_pixel_data_storage = output_color;
775   Rect upsampled_frame_rect_for_storage = upsampled_frame_rect;
776   if (dec_state->rgb_output || dec_state->pixel_callback) {
777     size_t log2_upsampling = CeilLog2Nonzero(frame_header.upsampling);
778     if (storage_for_if == output_color) {
779       storage_for_if =
780           &dec_state->output_pixel_data_storage[log2_upsampling][thread];
781       rect_for_if_storage =
782           Rect(0, 0, rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
783     }
784     output_pixel_data_storage =
785         &dec_state->output_pixel_data_storage[log2_upsampling][thread];
786     upsampled_frame_rect_for_storage =
787         Rect(0, 0, upsampled_frame_rect.xsize(), upsampled_frame_rect.ysize());
788     if (frame_header.upsampling == 1 && fp == nullptr) {
789       upsampled_frame_rect_for_storage = rect_for_if_storage =
790           rect_for_if_input;
791       output_pixel_data_storage = storage_for_if = input;
792     }
793   }
794   // Set up alpha channel.
795   const size_t ec =
796       metadata.Find(ExtraChannel::kAlpha) - metadata.extra_channel_info.data();
797   const ImageF* alpha = nullptr;
798   Rect alpha_rect = upsampled_frame_rect;
799   if (ec < metadata.extra_channel_info.size()) {
800     JXL_ASSERT(ec < extra_channels.size());
801     if (frame_header.extra_channel_upsampling[ec] == 1) {
802       alpha = extra_channels[ec].first;
803       alpha_rect = extra_channels[ec].second;
804     } else {
805       alpha = &output_image->extra_channels()[ec];
806       alpha_rect = upsampled_frame_rect;
807     }
808   }
809 
810   // +----------------------------- STEP 3 ------------------------------+
811   // | Set up upsampling and upsample extra channels.                    |
812   // +-------------------------------------------------------------------+
813   Upsampler* color_upsampler = nullptr;
814   if (frame_header.upsampling != 1) {
815     color_upsampler =
816         &dec_state->upsamplers[CeilLog2Nonzero(frame_header.upsampling) - 1];
817     ensure_padding_upsampling.Init(
818         storage_for_if, rect_for_upsampling, frame_rect, frame_dim.xsize_padded,
819         frame_dim.ysize_padded, 2, 2, &ensure_padding_upsampling_y0,
820         &ensure_padding_upsampling_y1);
821   }
822 
823   std::vector<std::pair<ImageF*, Rect>> extra_channels_for_patches;
824   std::vector<EnsurePaddingInPlaceRowByRow> ec_padding;
825 
826   bool late_ec_upsample = frame_header.upsampling != 1;
827   for (auto ecups : frame_header.extra_channel_upsampling) {
828     if (ecups != frame_header.upsampling) {
829       // If patches are applied, either frame_header.upsampling == 1 or
830       // late_ec_upsample is true.
831       late_ec_upsample = false;
832     }
833   }
834 
835   ssize_t ensure_padding_upsampling_ec_y0 = 0;
836   ssize_t ensure_padding_upsampling_ec_y1 = 0;
837 
838   // TODO(veluca) do not upsample extra channels to a full-image-sized buffer if
839   // we are not outputting to an ImageBundle.
840   if (!late_ec_upsample) {
841     // Upsample extra channels first if not all channels have the same
842     // upsampling factor.
843     for (size_t ec = 0; ec < extra_channels.size(); ec++) {
844       size_t ecups = frame_header.extra_channel_upsampling[ec];
845       if (ecups == 1) {
846         extra_channels_for_patches.push_back(extra_channels[ec]);
847         continue;
848       }
849       ssize_t ensure_padding_y0, ensure_padding_y1;
850       EnsurePaddingInPlaceRowByRow ensure_padding;
851       // frame_rect can go up to frame_dim.xsize_padded, in VarDCT mode.
852       Rect ec_image_rect = ScaleRectForEC(
853           frame_rect.Crop(frame_dim.xsize, frame_dim.ysize), frame_header, ec);
854       size_t ecxs = DivCeil(frame_dim.xsize_upsampled,
855                             frame_header.extra_channel_upsampling[ec]);
856       size_t ecys = DivCeil(frame_dim.ysize_upsampled,
857                             frame_header.extra_channel_upsampling[ec]);
858       ensure_padding.Init(extra_channels[ec].first, extra_channels[ec].second,
859                           ec_image_rect, ecxs, ecys, 2, 2, &ensure_padding_y0,
860                           &ensure_padding_y1);
861       for (ssize_t y = ensure_padding_y0; y < ensure_padding_y1; y++) {
862         ensure_padding.Process(y);
863       }
864       Upsampler& upsampler =
865           dec_state->upsamplers[CeilLog2Nonzero(
866                                     frame_header.extra_channel_upsampling[ec]) -
867                                 1];
868       upsampler.UpsampleRect(
869           *extra_channels[ec].first, extra_channels[ec].second,
870           &output_image->extra_channels()[ec], upsampled_frame_rect,
871           static_cast<ssize_t>(ec_image_rect.y0()) -
872               static_cast<ssize_t>(extra_channels[ec].second.y0()),
873           ecys, dec_state->upsampler_storage[thread].get());
874       extra_channels_for_patches.emplace_back(
875           &output_image->extra_channels()[ec], upsampled_frame_rect);
876     }
877   } else {
878     // Upsample extra channels last if color channels are upsampled and all the
879     // extra channels have the same upsampling as them.
880     ec_padding.resize(extra_channels.size());
881     for (size_t ec = 0; ec < extra_channels.size(); ec++) {
882       // Add a border to the extra channel rect for when patches are applied.
883       // This ensures that the correct row is accessed (y values for patches are
884       // relative to rect_for_if, not to input_rect).
885       // As the rect is extended by 0 or 2 pixels, and the patches input has,
886       // accordingly, the same padding, this is safe.
887       Rect r(extra_channels[ec].second.x0() + rect_for_upsampling.x0() -
888                  rect_for_if_storage.x0(),
889              extra_channels[ec].second.y0() + rect_for_upsampling.y0() -
890                  rect_for_if_storage.y0(),
891              extra_channels[ec].second.xsize() + rect_for_if_storage.xsize() -
892                  rect_for_upsampling.xsize(),
893              extra_channels[ec].second.ysize() + rect_for_if_storage.ysize() -
894                  rect_for_upsampling.ysize());
895       extra_channels_for_patches.emplace_back(extra_channels[ec].first, r);
896       // frame_rect can go up to frame_dim.xsize_padded, in VarDCT mode.
897       ec_padding[ec].Init(extra_channels[ec].first, extra_channels[ec].second,
898                           frame_rect.Crop(frame_dim.xsize, frame_dim.ysize),
899                           frame_dim.xsize, frame_dim.ysize, 2, 2,
900                           &ensure_padding_upsampling_ec_y0,
901                           &ensure_padding_upsampling_ec_y1);
902     }
903   }
904 
905   // Initialized to a valid non-null ptr to avoid UB if arithmetic is done with
906   // the pointer value (which would then not be used).
907   std::vector<float*> ec_ptrs_for_patches(extra_channels.size(),
908                                           input->PlaneRow(0, 0));
909 
910   // +----------------------------- STEP 4 ------------------------------+
911   // | Set up the filter pipeline.                                       |
912   // +-------------------------------------------------------------------+
913   if (fp) {
914     ensure_padding_filter.Init(
915         input, rect_for_if_input, rect_for_if, frame_dim.xsize_padded,
916         frame_dim.ysize_padded, lf.Padding(), lf.Padding(),
917         &ensure_padding_filter_y0, &ensure_padding_filter_y1);
918 
919     fp = PrepareFilterPipeline(dec_state, rect_for_if, *input,
920                                rect_for_if_input, frame_dim.ysize_padded,
921                                thread, storage_for_if, rect_for_if_storage);
922   }
923 
924   // +----------------------------- STEP 5 ------------------------------+
925   // | Run the prepared pipeline of operations.                          |
926   // +-------------------------------------------------------------------+
927 
928   // y values are relative to rect_for_if.
929   // Automatic mirroring in fp->ApplyFiltersRow() implies that we should ensure
930   // that padding for the first lines of the image is already present before
931   // calling ApplyFiltersRow() with "virtual" rows.
932   // Here we rely on the fact that virtual rows at the beginning of the image
933   // are only present if input_rect.y0() == 0.
934   ssize_t first_ensure_padding_y = ensure_padding_filter_y0;
935   if (frame_rect.y0() == 0) {
936     JXL_DASSERT(ensure_padding_filter_y0 == 0);
937     first_ensure_padding_y =
938         std::min<ssize_t>(lf.Padding(), ensure_padding_filter_y1);
939     for (ssize_t y = 0; y < first_ensure_padding_y; y++) {
940       ensure_padding_filter.Process3(y);
941     }
942   }
943 
944   for (ssize_t y = -lf.Padding();
945        y < static_cast<ssize_t>(lf.Padding() + rect_for_if.ysize()); y++) {
946     if (fp) {
947       if (y >= first_ensure_padding_y && y < ensure_padding_filter_y1) {
948         ensure_padding_filter.Process3(y);
949       }
950       fp->ApplyFiltersRow(lf, dec_state->filter_weights, y);
951     } else if (output_pixel_data_storage != input) {
952       for (size_t c = 0; c < 3; c++) {
953         memcpy(rect_for_if_storage.PlaneRow(storage_for_if, c, y),
954                rect_for_if_input.ConstPlaneRow(*input, c, y),
955                rect_for_if_input.xsize() * sizeof(float));
956       }
957     }
958     if (y < static_cast<ssize_t>(lf.Padding())) continue;
959     // At this point, row `y - lf.Padding()` of `rect_for_if` has been produced
960     // by the filters.
961     ssize_t available_y = y - lf.Padding();
962     if (frame_header.upsampling == 1) {
963       for (size_t i = 0; i < extra_channels.size(); i++) {
964         ec_ptrs_for_patches[i] = extra_channels_for_patches[i].second.Row(
965             extra_channels_for_patches[i].first, available_y);
966       }
967     }
968     image_features.patches.AddTo(
969         storage_for_if, rect_for_if_storage.Line(available_y),
970         ec_ptrs_for_patches.data(), rect_for_if.Line(available_y));
971     image_features.splines.AddTo(storage_for_if,
972                                  rect_for_if_storage.Line(available_y),
973                                  rect_for_if.Line(available_y));
974     size_t num_ys = 1;
975     if (frame_header.upsampling != 1) {
976       // Upsampling `y` values are relative to `rect_for_upsampling`, not to
977       // `rect_for_if`.
978       ssize_t shifted_y = available_y - extra_rows_t;
979       if (shifted_y >= ensure_padding_upsampling_y0 &&
980           shifted_y < ensure_padding_upsampling_y1) {
981         ensure_padding_upsampling.Process3(shifted_y);
982       }
983       if (late_ec_upsample && shifted_y >= ensure_padding_upsampling_ec_y0 &&
984           shifted_y < ensure_padding_upsampling_ec_y1) {
985         for (size_t ec = 0; ec < extra_channels.size(); ec++) {
986           ec_padding[ec].Process(shifted_y);
987         }
988       }
989       // Upsampling will access two rows of border, so the first upsampling
990       // output will be available after shifted_y is at least 2, *unless* image
991       // height is <= 2.
992       if (shifted_y < 2 &&
993           shifted_y + 1 != static_cast<ssize_t>(frame_rect.ysize())) {
994         continue;
995       }
996       // Value relative to upsampled_frame_rect.
997       size_t input_y = std::max<ssize_t>(shifted_y - 2, 0);
998       size_t upsampled_available_y = frame_header.upsampling * input_y;
999       size_t num_input_rows = 1;
1000       // If we are going to mirror the last output rows, then we already have 3
1001       // input lines ready. This happens iff we did not extend rect_for_if on
1002       // the bottom *and* we are at the last `y` value.
1003       if (extra_rows_b != 2 &&
1004           static_cast<size_t>(y) + 1 == lf.Padding() + rect_for_if.ysize()) {
1005         num_input_rows = 3;
1006       }
1007       num_input_rows = std::min(num_input_rows, frame_dim.ysize_padded);
1008       num_ys = num_input_rows * frame_header.upsampling;
1009 
1010       if (static_cast<size_t>(upsampled_available_y) >=
1011           upsampled_frame_rect.ysize()) {
1012         continue;
1013       }
1014 
1015       if (upsampled_available_y + num_ys >= upsampled_frame_rect.ysize()) {
1016         num_ys = upsampled_frame_rect.ysize() - upsampled_available_y;
1017       }
1018 
1019       // Upsampler takes care of mirroring, and checks "physical" boundaries.
1020       Rect upsample_input_rect = rect_for_upsampling.Lines(input_y, 1);
1021       color_upsampler->UpsampleRect(
1022           *storage_for_if, upsample_input_rect, output_pixel_data_storage,
1023           upsampled_frame_rect_for_storage.Lines(upsampled_available_y, num_ys),
1024           static_cast<ssize_t>(frame_rect.y0()) -
1025               static_cast<ssize_t>(rect_for_upsampling.y0()),
1026           frame_dim.ysize_padded, dec_state->upsampler_storage[thread].get());
1027       if (late_ec_upsample) {
1028         for (size_t ec = 0; ec < extra_channels.size(); ec++) {
1029           // Upsampler takes care of mirroring, and checks "physical"
1030           // boundaries.
1031           Rect upsample_ec_input_rect =
1032               extra_channels[ec].second.Lines(input_y, 1);
1033           color_upsampler->UpsampleRect(
1034               *extra_channels[ec].first, upsample_ec_input_rect,
1035               &output_image->extra_channels()[ec],
1036               upsampled_frame_rect.Lines(upsampled_available_y, num_ys),
1037               static_cast<ssize_t>(frame_rect.y0()) -
1038                   static_cast<ssize_t>(extra_channels[ec].second.y0()),
1039               frame_dim.ysize, dec_state->upsampler_storage[thread].get());
1040         }
1041       }
1042       available_y = upsampled_available_y;
1043     }
1044 
1045     if (static_cast<size_t>(available_y) >= upsampled_frame_rect.ysize()) {
1046       continue;
1047     }
1048 
1049     // The image data is now unconditionally in
1050     // `output_image_storage:upsampled_frame_rect_for_storage`.
1051     if (frame_header.flags & FrameHeader::kNoise) {
1052       PROFILER_ZONE("AddNoise");
1053       AddNoise(image_features.noise_params,
1054                upsampled_frame_rect.Lines(available_y, num_ys),
1055                dec_state->noise,
1056                upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1057                dec_state->shared_storage.cmap, output_pixel_data_storage);
1058     }
1059 
1060     if (dec_state->pre_color_transform_frame.xsize() != 0) {
1061       for (size_t c = 0; c < 3; c++) {
1062         for (size_t y = available_y; y < available_y + num_ys; y++) {
1063           float* JXL_RESTRICT row_out = upsampled_frame_rect.PlaneRow(
1064               &dec_state->pre_color_transform_frame, c, y);
1065           const float* JXL_RESTRICT row_in =
1066               upsampled_frame_rect_for_storage.ConstPlaneRow(
1067                   *output_pixel_data_storage, c, y);
1068           memcpy(row_out, row_in,
1069                  upsampled_frame_rect.xsize() * sizeof(*row_in));
1070         }
1071       }
1072     }
1073 
1074     // We skip the color transform entirely if save_before_color_transform and
1075     // the frame is not supposed to be displayed.
1076 
1077     if (dec_state->fast_xyb_srgb8_conversion) {
1078       Rect output_rect = upsampled_frame_rect.Lines(available_y, num_ys)
1079                              .Crop(Rect(0, 0, frame_dim.xsize_upsampled,
1080                                         frame_dim.ysize_upsampled));
1081       for (size_t iy = 0; iy < output_rect.ysize(); iy++) {
1082         const float* xyba[4] = {
1083             upsampled_frame_rect_for_storage.ConstPlaneRow(
1084                 *output_pixel_data_storage, 0, available_y + iy),
1085             upsampled_frame_rect_for_storage.ConstPlaneRow(
1086                 *output_pixel_data_storage, 1, available_y + iy),
1087             upsampled_frame_rect_for_storage.ConstPlaneRow(
1088                 *output_pixel_data_storage, 2, available_y + iy),
1089             nullptr};
1090         if (alpha) {
1091           xyba[3] = alpha_rect.ConstRow(*alpha, available_y + iy);
1092         }
1093         uint8_t* out_buf =
1094             dec_state->rgb_output +
1095             (iy + output_rect.y0()) * dec_state->rgb_stride +
1096             (dec_state->rgb_output_is_rgba ? 4 : 3) * output_rect.x0();
1097         FastXYBTosRGB8(xyba, out_buf, dec_state->rgb_output_is_rgba,
1098                        output_rect.xsize());
1099       }
1100     } else {
1101       if (frame_header.needs_color_transform()) {
1102         if (frame_header.color_transform == ColorTransform::kXYB) {
1103           JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(
1104               output_pixel_data_storage,
1105               upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1106               dec_state->output_encoding_info));
1107         } else if (frame_header.color_transform == ColorTransform::kYCbCr) {
1108           YcbcrToRgb(
1109               *output_pixel_data_storage, output_pixel_data_storage,
1110               upsampled_frame_rect_for_storage.Lines(available_y, num_ys));
1111         }
1112       }
1113 
1114       // TODO(veluca): all blending should happen here.
1115 
1116       Rect image_line_rect = upsampled_frame_rect.Lines(available_y, num_ys)
1117                                  .Crop(Rect(0, 0, frame_dim.xsize_upsampled,
1118                                             frame_dim.ysize_upsampled));
1119       if ((image_line_rect.xsize()) == 0 || image_line_rect.ysize() == 0) {
1120         continue;
1121       }
1122 
1123       if (dec_state->rgb_output != nullptr) {
1124         HWY_DYNAMIC_DISPATCH(FloatToRGBA8)
1125         (*output_pixel_data_storage,
1126          upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1127          dec_state->rgb_output_is_rgba, alpha,
1128          alpha_rect.Lines(available_y, num_ys), image_line_rect,
1129          dec_state->rgb_output, dec_state->rgb_stride);
1130       }
1131       if (dec_state->pixel_callback != nullptr) {
1132         Rect alpha_line_rect = alpha_rect.Lines(available_y, num_ys);
1133         Rect color_input_line_rect =
1134             upsampled_frame_rect_for_storage.Lines(available_y, num_ys);
1135         const float* line_buffers[4];
1136         for (size_t iy = 0; iy < image_line_rect.ysize(); iy++) {
1137           for (size_t c = 0; c < 3; c++) {
1138             line_buffers[c] = color_input_line_rect.ConstPlaneRow(
1139                 *output_pixel_data_storage, c, iy);
1140           }
1141           if (alpha) {
1142             line_buffers[3] = alpha_line_rect.ConstRow(*alpha, iy);
1143           } else {
1144             line_buffers[3] = dec_state->opaque_alpha.data();
1145           }
1146           std::vector<float>& interleaved =
1147               dec_state->pixel_callback_rows[thread];
1148           size_t j = 0;
1149           for (size_t i = 0; i < image_line_rect.xsize(); i++) {
1150             interleaved[j++] = line_buffers[0][i];
1151             interleaved[j++] = line_buffers[1][i];
1152             interleaved[j++] = line_buffers[2][i];
1153             if (dec_state->rgb_output_is_rgba) {
1154               interleaved[j++] = line_buffers[3][i];
1155             }
1156           }
1157           dec_state->pixel_callback(interleaved.data(), image_line_rect.x0(),
1158                                     image_line_rect.y0() + iy,
1159                                     image_line_rect.xsize());
1160         }
1161       }
1162     }
1163   }
1164 
1165   return true;
1166 }
1167 
FinalizeFrameDecoding(ImageBundle * decoded,PassesDecoderState * dec_state,ThreadPool * pool,bool force_fir,bool skip_blending,bool move_ec)1168 Status FinalizeFrameDecoding(ImageBundle* decoded,
1169                              PassesDecoderState* dec_state, ThreadPool* pool,
1170                              bool force_fir, bool skip_blending, bool move_ec) {
1171   if (dec_state->render_pipeline) {
1172     return true;
1173   }
1174   const FrameHeader& frame_header = dec_state->shared->frame_header;
1175   const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
1176 
1177   // FinalizeImageRect was not yet run, or we are forcing a run.
1178   if (!dec_state->EagerFinalizeImageRect() || force_fir) {
1179     std::vector<Rect> rects_to_process;
1180     for (size_t y = 0; y < frame_dim.ysize_padded; y += kGroupDim) {
1181       for (size_t x = 0; x < frame_dim.xsize_padded; x += kGroupDim) {
1182         Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize_padded,
1183                   frame_dim.ysize_padded);
1184         if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1185         rects_to_process.push_back(rect);
1186       }
1187     }
1188     const auto allocate_storage = [&](const size_t num_threads) {
1189       dec_state->EnsureStorage(num_threads);
1190       return true;
1191     };
1192 
1193     {
1194       std::vector<ImageF> ecs;
1195       const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
1196       for (size_t i = 0; i < metadata.num_extra_channels; i++) {
1197         if (frame_header.extra_channel_upsampling[i] == 1) {
1198           if (move_ec) {
1199             ecs.push_back(std::move(dec_state->extra_channels[i]));
1200           } else {
1201             ecs.push_back(CopyImage(dec_state->extra_channels[i]));
1202           }
1203         } else {
1204           ecs.emplace_back(frame_dim.xsize_upsampled_padded,
1205                            frame_dim.ysize_upsampled_padded);
1206         }
1207       }
1208       decoded->SetExtraChannels(std::move(ecs));
1209     }
1210 
1211     std::atomic<bool> apply_features_ok{true};
1212     auto run_apply_features = [&](const uint32_t rect_id, size_t thread) {
1213       size_t xstart = PassesDecoderState::kGroupDataXBorder;
1214       size_t ystart = PassesDecoderState::kGroupDataYBorder;
1215       for (size_t c = 0; c < 3; c++) {
1216         Rect rh(rects_to_process[rect_id].x0() >>
1217                     frame_header.chroma_subsampling.HShift(c),
1218                 rects_to_process[rect_id].y0() >>
1219                     frame_header.chroma_subsampling.VShift(c),
1220                 rects_to_process[rect_id].xsize() >>
1221                     frame_header.chroma_subsampling.HShift(c),
1222                 rects_to_process[rect_id].ysize() >>
1223                     frame_header.chroma_subsampling.VShift(c));
1224         Rect group_data_rect(xstart, ystart, rh.xsize(), rh.ysize());
1225         // Poison the image in this thread to prevent leaking initialized data
1226         // from a previous run in this thread in msan builds.
1227         msan::PoisonImage(dec_state->group_data[thread].Plane(c));
1228         CopyImageToWithPadding(
1229             rh, dec_state->decoded.Plane(c), dec_state->FinalizeRectPadding(),
1230             group_data_rect, &dec_state->group_data[thread].Plane(c));
1231       }
1232       Rect group_data_rect(xstart, ystart, rects_to_process[rect_id].xsize(),
1233                            rects_to_process[rect_id].ysize());
1234       std::vector<std::pair<ImageF*, Rect>> ec_rects;
1235       ec_rects.reserve(decoded->extra_channels().size());
1236       for (size_t i = 0; i < decoded->extra_channels().size(); i++) {
1237         Rect r = ScaleRectForEC(
1238             rects_to_process[rect_id].Crop(frame_dim.xsize, frame_dim.ysize),
1239             frame_header, i);
1240         if (frame_header.extra_channel_upsampling[i] != 1) {
1241           Rect ec_input_rect(kBlockDim, 2, r.xsize(), r.ysize());
1242           auto eti =
1243               &dec_state
1244                    ->ec_temp_images[thread * decoded->extra_channels().size() +
1245                                     i];
1246           // Poison the temp image on this thread to prevent leaking initialized
1247           // data from a previous run in this thread in msan builds.
1248           msan::PoisonImage(*eti);
1249           JXL_CHECK_IMAGE_INITIALIZED(dec_state->extra_channels[i], r);
1250           CopyImageToWithPadding(r, dec_state->extra_channels[i],
1251                                  /*padding=*/2, ec_input_rect, eti);
1252           ec_rects.emplace_back(eti, ec_input_rect);
1253         } else {
1254           JXL_CHECK_IMAGE_INITIALIZED(decoded->extra_channels()[i], r);
1255           ec_rects.emplace_back(&decoded->extra_channels()[i], r);
1256         }
1257       }
1258       if (!FinalizeImageRect(&dec_state->group_data[thread], group_data_rect,
1259                              ec_rects, dec_state, thread, decoded,
1260                              rects_to_process[rect_id])) {
1261         apply_features_ok = false;
1262       }
1263     };
1264 
1265     JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, rects_to_process.size(),
1266                                   allocate_storage, run_apply_features,
1267                                   "ApplyFeatures"));
1268 
1269     if (!apply_features_ok) {
1270       return JXL_FAILURE("FinalizeImageRect failed");
1271     }
1272   }
1273 
1274   const size_t xsize = frame_dim.xsize_upsampled;
1275   const size_t ysize = frame_dim.ysize_upsampled;
1276 
1277   decoded->ShrinkTo(xsize, ysize);
1278   if (dec_state->pre_color_transform_frame.xsize() != 0) {
1279     dec_state->pre_color_transform_frame.ShrinkTo(xsize, ysize);
1280   }
1281 
1282   if (!skip_blending && ImageBlender::NeedsBlending(dec_state)) {
1283     if (dec_state->pre_color_transform_frame.xsize() != 0) {
1284       // Extra channels are going to be modified. Make a copy.
1285       dec_state->pre_color_transform_ec.clear();
1286       for (const auto& ec : decoded->extra_channels()) {
1287         dec_state->pre_color_transform_ec.emplace_back(CopyImage(ec));
1288       }
1289     }
1290     ImageBlender blender;
1291     ImageBundle foreground = std::move(*decoded);
1292     decoded->SetFromImage(Image3F(frame_header.nonserialized_metadata->xsize(),
1293                                   frame_header.nonserialized_metadata->ysize()),
1294                           foreground.c_current());
1295     std::vector<Rect> extra_channels_rects;
1296     decoded->extra_channels().reserve(foreground.extra_channels().size());
1297     extra_channels_rects.reserve(foreground.extra_channels().size());
1298     for (size_t i = 0; i < foreground.extra_channels().size(); ++i) {
1299       decoded->extra_channels().emplace_back(
1300           frame_header.nonserialized_metadata->xsize(),
1301           frame_header.nonserialized_metadata->ysize());
1302       extra_channels_rects.emplace_back(decoded->extra_channels().back());
1303     }
1304     JXL_RETURN_IF_ERROR(blender.PrepareBlending(
1305         dec_state, foreground.origin, foreground.xsize(), foreground.ysize(),
1306         &frame_header.nonserialized_metadata->m.extra_channel_info,
1307         foreground.c_current(), Rect(*decoded->color()),
1308         /*output=*/decoded->color(), Rect(*decoded->color()),
1309         &decoded->extra_channels(), std::move(extra_channels_rects)));
1310 
1311     std::vector<Rect> rects_to_process;
1312     for (size_t y = 0; y < frame_dim.ysize_upsampled; y += kGroupDim) {
1313       for (size_t x = 0; x < frame_dim.xsize_upsampled; x += kGroupDim) {
1314         Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize_upsampled,
1315                   frame_dim.ysize_upsampled);
1316         if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1317         rects_to_process.push_back(rect);
1318       }
1319     }
1320 
1321     std::atomic<bool> blending_ok{true};
1322     JXL_RETURN_IF_ERROR(RunOnPool(
1323         pool, 0, rects_to_process.size(), ThreadPool::NoInit,
1324         [&](const uint32_t i, size_t /*thread*/) {
1325           const Rect& rect = rects_to_process[i];
1326           auto rect_blender = blender.PrepareRect(
1327               rect, *foreground.color(), foreground.extra_channels(), rect);
1328           for (size_t y = 0; y < rect.ysize(); ++y) {
1329             if (!rect_blender.DoBlending(y)) {
1330               blending_ok = false;
1331               return;
1332             }
1333           }
1334         },
1335         "Blend"));
1336     JXL_RETURN_IF_ERROR(blending_ok.load());
1337   }
1338 
1339   return true;
1340 }
1341 
1342 }  // namespace jxl
1343 #endif  // HWY_ONCE
1344