1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 #include "lib/jxl/dec_reconstruct.h"
7
8 #include <atomic>
9 #include <utility>
10
11 #include "lib/jxl/filters.h"
12 #include "lib/jxl/image_ops.h"
13
14 #undef HWY_TARGET_INCLUDE
15 #define HWY_TARGET_INCLUDE "lib/jxl/dec_reconstruct.cc"
16 #include <hwy/foreach_target.h>
17 #include <hwy/highway.h>
18
19 #include "lib/jxl/aux_out.h"
20 #include "lib/jxl/base/data_parallel.h"
21 #include "lib/jxl/base/profiler.h"
22 #include "lib/jxl/blending.h"
23 #include "lib/jxl/color_management.h"
24 #include "lib/jxl/common.h"
25 #include "lib/jxl/dec_noise.h"
26 #include "lib/jxl/dec_upsample.h"
27 #include "lib/jxl/dec_xyb-inl.h"
28 #include "lib/jxl/dec_xyb.h"
29 #include "lib/jxl/epf.h"
30 #include "lib/jxl/fast_math-inl.h"
31 #include "lib/jxl/frame_header.h"
32 #include "lib/jxl/loop_filter.h"
33 #include "lib/jxl/passes_state.h"
34 #include "lib/jxl/sanitizers.h"
35 #include "lib/jxl/transfer_functions-inl.h"
36 HWY_BEFORE_NAMESPACE();
37 namespace jxl {
38 namespace HWY_NAMESPACE {
39
// Converts the pixels of `idct` inside `rect` in place: XYB -> linear RGB via
// XybToRgb, then the per-channel transfer function applied by `op.Transform`.
// `op` is one of the Op* functors below, selected by UndoXYBInPlace.
template <typename Op>
void DoUndoXYBInPlace(Image3F* idct, const Rect& rect, Op op,
                      const OutputEncodingInfo& output_encoding_info) {
  // TODO(eustas): should it still be capped?
  const HWY_CAPPED(float, GroupBorderAssigner::kPaddingXRound) d;
  const size_t xsize = rect.xsize();
  // Width rounded up to a whole number of vector lanes; the loop below reads
  // and writes up to this column, past the nominal rect width.
  const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
  // The size of `rect` might not be a multiple of Lanes(d), but is guaranteed
  // to be a multiple of kBlockDim or at the margin of the image.
  for (size_t y = 0; y < rect.ysize(); y++) {
    float* JXL_RESTRICT row0 = rect.PlaneRow(idct, 0, y);
    float* JXL_RESTRICT row1 = rect.PlaneRow(idct, 1, y);
    float* JXL_RESTRICT row2 = rect.PlaneRow(idct, 2, y);
    // All calculations are lane-wise, still some might require value-dependent
    // behaviour (e.g. NearestInt). Temporarily unpoison the last vector tail
    // so msan does not flag loads of the (ignored) lanes past `xsize`.
    msan::UnpoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
    msan::UnpoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
    msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
    for (size_t x = 0; x < rect.xsize(); x += Lanes(d)) {
      const auto in_opsin_x = Load(d, row0 + x);
      const auto in_opsin_y = Load(d, row1 + x);
      const auto in_opsin_b = Load(d, row2 + x);
      JXL_COMPILER_FENCE;
      auto linear_r = Undefined(d);
      auto linear_g = Undefined(d);
      auto linear_b = Undefined(d);
      XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b,
               output_encoding_info.opsin_params, &linear_r, &linear_g,
               &linear_b);
      Store(op.Transform(d, linear_r), d, row0 + x);
      Store(op.Transform(d, linear_g), d, row1 + x);
      Store(op.Transform(d, linear_b), d, row2 + x);
    }
    // Re-poison the tail so later reads of uninitialized pixels past the
    // rect are still caught by msan.
    msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
    msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
    msan::PoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
  }
}
78
// Identity transfer function: the caller requested linear output, so each
// sample is returned unchanged.
struct OpLinear {
  template <typename D, typename T>
  T Transform(D d, const T& linear) {
    T unchanged = linear;
    return unchanged;
  }
};
85
// Encodes linear samples with the sRGB transfer function.
struct OpRgb {
  template <typename D, typename T>
  T Transform(D d, const T& linear) {
#if JXL_HIGH_PRECISION
    return TF_SRGB().EncodedFromDisplay(d, linear);
#else
    // Cheaper approximation used when high precision is disabled at build
    // time.
    return FastLinearToSRGB(d, linear);
#endif
  }
};
96
97 struct OpPq {
98 template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpPq99 T Transform(D d, const T& linear) {
100 return TF_PQ().EncodedFromDisplay(d, linear);
101 }
102 };
103
104 struct OpHlg {
105 template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::OpHlg106 T Transform(D d, const T& linear) {
107 return TF_HLG().EncodedFromDisplay(d, linear);
108 }
109 };
110
111 struct Op709 {
112 template <typename D, typename T>
Transformjxl::HWY_NAMESPACE::Op709113 T Transform(D d, const T& linear) {
114 return TF_709().EncodedFromDisplay(d, linear);
115 }
116 };
117
// Encodes linear samples with a pure power-law transfer function; the
// exponent `inverse_gamma` is supplied by the caller (gamma or DCI encodings).
struct OpGamma {
  const float inverse_gamma;
  template <typename D, typename T>
  T Transform(D d, const T& linear) {
    // Lanes at or below the threshold are forced to zero instead of being
    // raised to the power, matching the original lane-wise select.
    const auto tiny = linear <= Set(d, 1e-5f);
    const auto powered = FastPowf(d, linear, Set(d, inverse_gamma));
    return IfThenZeroElse(tiny, powered);
  }
};
126
UndoXYBInPlace(Image3F * idct,const Rect & rect,const OutputEncodingInfo & output_encoding_info)127 Status UndoXYBInPlace(Image3F* idct, const Rect& rect,
128 const OutputEncodingInfo& output_encoding_info) {
129 PROFILER_ZONE("UndoXYB");
130
131 if (output_encoding_info.color_encoding.tf.IsLinear()) {
132 DoUndoXYBInPlace(idct, rect, OpLinear(), output_encoding_info);
133 } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
134 DoUndoXYBInPlace(idct, rect, OpRgb(), output_encoding_info);
135 } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
136 DoUndoXYBInPlace(idct, rect, OpPq(), output_encoding_info);
137 } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
138 DoUndoXYBInPlace(idct, rect, OpHlg(), output_encoding_info);
139 } else if (output_encoding_info.color_encoding.tf.Is709()) {
140 DoUndoXYBInPlace(idct, rect, Op709(), output_encoding_info);
141 } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
142 output_encoding_info.color_encoding.tf.IsDCI()) {
143 OpGamma op = {output_encoding_info.inverse_gamma};
144 DoUndoXYBInPlace(idct, rect, op, output_encoding_info);
145 } else {
146 // This is a programming error.
147 JXL_ABORT("Invalid target encoding");
148 }
149 return true;
150 }
151
// Interleaves the four uint8 channel vectors r/g/b/(a) into `buf` as packed
// RGB(A) bytes. `n` is the number of pixels to emit; `extra` is how many
// bytes past `buf` are writable (used by the NEON path to decide whether a
// full vector store is safe). The alpha channel is written only if `alpha`.
template <typename D, typename V>
void StoreRGBA(D d, V r, V g, V b, V a, bool alpha, size_t n, size_t extra,
               uint8_t* buf) {
#if HWY_TARGET == HWY_SCALAR
  // One pixel per call on the scalar target.
  buf[0] = r.raw;
  buf[1] = g.raw;
  buf[2] = b.raw;
  if (alpha) {
    buf[3] = a.raw;
  }
#elif HWY_TARGET == HWY_NEON
  // NEON has dedicated 3/4-way interleaving stores.
  if (alpha) {
    uint8x8x4_t data = {r.raw, g.raw, b.raw, a.raw};
    if (extra >= 8) {
      vst4_u8(buf, data);
    } else {
      // Not enough room for a full 8-pixel store; bounce through a local
      // buffer and copy only the valid prefix.
      uint8_t tmp[8 * 4];
      vst4_u8(tmp, data);
      memcpy(buf, tmp, n * 4);
    }
  } else {
    uint8x8x3_t data = {r.raw, g.raw, b.raw};
    if (extra >= 8) {
      vst3_u8(buf, data);
    } else {
      uint8_t tmp[8 * 3];
      vst3_u8(tmp, data);
      memcpy(buf, tmp, n * 3);
    }
  }
#else
  // TODO(veluca): implement this for x86.
  // Generic fallback: store each channel vector to a scratch array, then
  // scatter bytes into the interleaved layout one pixel at a time.
  size_t mul = alpha ? 4 : 3;
  HWY_ALIGN uint8_t bytes[16];
  Store(r, d, bytes);
  for (size_t i = 0; i < n; i++) {
    buf[mul * i] = bytes[i];
  }
  Store(g, d, bytes);
  for (size_t i = 0; i < n; i++) {
    buf[mul * i + 1] = bytes[i];
  }
  Store(b, d, bytes);
  for (size_t i = 0; i < n; i++) {
    buf[mul * i + 2] = bytes[i];
  }
  if (alpha) {
    Store(a, d, bytes);
    for (size_t i = 0; i < n; i++) {
      buf[4 * i + 3] = bytes[i];
    }
  }
#endif
}
206
// Outputs floating point image to RGBA 8-bit buffer. Does not support alpha
// channel in the input, but outputs opaque alpha channel for the case where the
// output buffer to write to is in the 4-byte per pixel RGBA format.
// Values are clamped to [0, 1], scaled by 255 and rounded to nearest.
void FloatToRGBA8(const Image3F& input, const Rect& input_rect, bool is_rgba,
                  const ImageF* alpha_in, const Rect& alpha_rect,
                  const Rect& output_buf_rect, uint8_t* JXL_RESTRICT output_buf,
                  size_t stride) {
  size_t bytes = is_rgba ? 4 : 3;
  for (size_t y = 0; y < output_buf_rect.ysize(); y++) {
    const float* JXL_RESTRICT row_in_r = input_rect.ConstPlaneRow(input, 0, y);
    const float* JXL_RESTRICT row_in_g = input_rect.ConstPlaneRow(input, 1, y);
    const float* JXL_RESTRICT row_in_b = input_rect.ConstPlaneRow(input, 2, y);
    const float* JXL_RESTRICT row_in_a =
        alpha_in ? alpha_rect.ConstRow(*alpha_in, y) : nullptr;
    // Byte offset of the first pixel of this row in the output buffer.
    size_t base_ptr =
        (y + output_buf_rect.y0()) * stride + bytes * output_buf_rect.x0();
    // Capped to 4 lanes so U8FromU32 below packs into a matching vector.
    using D = HWY_CAPPED(float, 4);
    const D d;
    D::Rebind<uint32_t> du;
    auto zero = Zero(d);
    auto one = Set(d, 1.0f);
    auto mul = Set(d, 255.0f);

    // All calculations are lane-wise, still some might require value-dependent
    // behaviour (e.g. NearestInt). Temporarily unpoison the last vector tail.
    size_t xsize = output_buf_rect.xsize();
    size_t xsize_v = RoundUpTo(xsize, Lanes(d));
    msan::UnpoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize));
    msan::UnpoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize));
    msan::UnpoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize));
    if (row_in_a)
      msan::UnpoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize));
    for (size_t x = 0; x < xsize; x += Lanes(d)) {
      // Clamp to [0, 1] and scale to the 8-bit range. Missing alpha is
      // treated as fully opaque (255).
      auto rf = Clamp(zero, Load(d, row_in_r + x), one) * mul;
      auto gf = Clamp(zero, Load(d, row_in_g + x), one) * mul;
      auto bf = Clamp(zero, Load(d, row_in_b + x), one) * mul;
      auto af = row_in_a ? Clamp(zero, Load(d, row_in_a + x), one) * mul
                         : Set(d, 255.0f);
      // Round to nearest and narrow to uint8 lanes.
      auto r8 = U8FromU32(BitCast(du, NearestInt(rf)));
      auto g8 = U8FromU32(BitCast(du, NearestInt(gf)));
      auto b8 = U8FromU32(BitCast(du, NearestInt(bf)));
      auto a8 = U8FromU32(BitCast(du, NearestInt(af)));
      size_t n = output_buf_rect.xsize() - x;
      if (JXL_LIKELY(n >= Lanes(d))) {
        StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, Lanes(d), n,
                  output_buf + base_ptr + bytes * x);
      } else {
        // Partial final vector: emit only the remaining n pixels.
        StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, is_rgba, n, n,
                  output_buf + base_ptr + bytes * x);
      }
    }
    msan::PoisonMemory(row_in_r + xsize, sizeof(float) * (xsize_v - xsize));
    msan::PoisonMemory(row_in_g + xsize, sizeof(float) * (xsize_v - xsize));
    msan::PoisonMemory(row_in_b + xsize, sizeof(float) * (xsize_v - xsize));
    if (row_in_a)
      msan::PoisonMemory(row_in_a + xsize, sizeof(float) * (xsize_v - xsize));
  }
}
265
// Upsample in horizontal (if hs=1) and vertical (if vs=1) the plane_in image
// to the output plane_out image.
// The output region "rect" in plane_out and a border around it of lf.Padding()
// will be generated, as long as those pixels fall inside the image frame.
// Otherwise the border pixels that fall outside the image frame in plane_out
// are undefined.
// "rect" is an area inside the plane_out image which corresponds to the
// "frame_rect" area in the frame. plane_in and plane_out both are expected to
// have a padding of kGroupDataXBorder and kGroupDataYBorder on either side of
// X and Y coordinates. This means that when upsampling vertically the plane_out
// row `kGroupDataXBorder + N` will be generated from the plane_in row
// `kGroupDataXBorder + N / 2` (and a previous or next row).
void DoYCbCrUpsampling(size_t hs, size_t vs, ImageF* plane_in, const Rect& rect,
                       const Rect& frame_rect, const FrameDimensions& frame_dim,
                       ImageF* plane_out, const LoopFilter& lf, ImageF* temp) {
  JXL_DASSERT(SameSize(rect, frame_rect));
  JXL_DASSERT(hs <= 1 && vs <= 1);
  // The pixel in (xoff, yoff) is the origin of the downsampling coordinate
  // system.
  size_t xoff = PassesDecoderState::kGroupDataXBorder;
  size_t yoff = PassesDecoderState::kGroupDataYBorder;

  // This X,Y range is the intersection between the requested "rect" expanded
  // with a lf.Padding() all around and the image frame translated to the
  // coordinate system used by plane_out.
  // All the pixels in the [x0, x1) x [y0, y1) range must be defined in the
  // plane_out output at the end.
  const size_t y0 = rect.y0() - std::min<size_t>(lf.Padding(), frame_rect.y0());
  const size_t y1 = rect.y0() +
                    std::min(frame_rect.y0() + rect.ysize() + lf.Padding(),
                             frame_dim.ysize_padded) -
                    frame_rect.y0();

  const size_t x0 = rect.x0() - std::min<size_t>(lf.Padding(), frame_rect.x0());
  const size_t x1 = rect.x0() +
                    std::min(frame_rect.x0() + rect.xsize() + lf.Padding(),
                             frame_dim.xsize_padded) -
                    frame_rect.x0();

  if (hs == 0 && vs == 0) {
    // No subsampling in either direction: a plain copy suffices.
    Rect r(x0, y0, x1 - x0, y1 - y0);
    JXL_CHECK_IMAGE_INITIALIZED(*plane_in, r);
    CopyImageTo(r, *plane_in, r, plane_out);
    return;
  }
  // Prepare padding if we are on a border.
  // Copy the whole row/column here: it is likely similarly fast and ensures
  // that we don't forget some parts of padding.
  if (frame_rect.x0() == 0) {
    // Left image border: replicate the first column one pixel to the left.
    for (size_t y = 0; y < plane_in->ysize(); y++) {
      plane_in->Row(y)[rect.x0() - 1] = plane_in->Row(y)[rect.x0()];
    }
  }
  if (frame_rect.x0() + x1 - rect.x0() >= frame_dim.xsize_padded) {
    // Right image border: replicate the last valid downsampled column.
    ssize_t borderx = static_cast<ssize_t>(x1 - xoff + hs) / (1 << hs) + xoff;
    for (size_t y = 0; y < plane_in->ysize(); y++) {
      plane_in->Row(y)[borderx] = plane_in->Row(y)[borderx - 1];
    }
  }
  if (frame_rect.y0() == 0) {
    // Top image border: replicate the first row one row up.
    memcpy(plane_in->Row(rect.y0() - 1), plane_in->Row(rect.y0()),
           plane_in->xsize() * sizeof(float));
  }
  if (frame_rect.y0() + y1 - rect.y0() >= frame_dim.ysize_padded) {
    // Bottom image border: replicate the last valid downsampled row.
    ssize_t bordery = static_cast<ssize_t>(y1 - yoff + vs) / (1 << vs) + yoff;
    memcpy(plane_in->Row(bordery), plane_in->Row(bordery - 1),
           plane_in->xsize() * sizeof(float));
  }
  if (hs == 1) {
    // Horizontal 2x upsampling with a [1/4, 3/4] / [3/4, 1/4] filter,
    // writing the intermediate result into `temp`.
    // Limited to 4 for Interleave*.
    HWY_CAPPED(float, 4) d;
    auto threefour = Set(d, 0.75f);
    auto onefour = Set(d, 0.25f);
    size_t orig_y0 = y0;
    size_t orig_y1 = y1;
    if (vs != 0) {
      // When vertical upsampling follows, process the (downsampled) source
      // rows plus one row of margin on each side.
      orig_y0 = (y0 >> 1) + (yoff >> 1) - 1;
      orig_y1 = (y1 >> 1) + (yoff >> 1) + 1;
    }
    for (size_t y = orig_y0; y < orig_y1; y++) {
      const float* in = plane_in->Row(y);
      float* out = temp->Row(y);
      for (size_t x = x0 / (2 * Lanes(d)) * 2 * Lanes(d);
           x < RoundUpTo(x1, 2 * Lanes(d)); x += 2 * Lanes(d)) {
        size_t ox = (x >> 1) + (xoff >> 1);
        auto current = Load(d, in + ox) * threefour;
        auto prev = LoadU(d, in + ox - 1);
        auto next = LoadU(d, in + ox + 1);
        auto left = MulAdd(onefour, prev, current);
        auto right = MulAdd(onefour, next, current);
#if HWY_TARGET == HWY_SCALAR
        Store(left, d, out + x);
        Store(right, d, out + x + 1);
#else
        // Interleave left/right so output pixels land in x order.
        Store(InterleaveLower(left, right), d, out + x);
        Store(InterleaveUpper(left, right), d, out + x + Lanes(d));
#endif
      }
    }
  } else {
    CopyImageTo(*plane_in, temp);
  }
  if (vs == 1) {
    // Vertical 2x upsampling: each output row blends the two nearest source
    // rows with weights 3/4 and 1/4 depending on parity.
    HWY_FULL(float) d;
    auto threefour = Set(d, 0.75f);
    auto onefour = Set(d, 0.25f);
    for (size_t y = y0; y < y1; y++) {
      size_t oy1 = (y >> 1) + (yoff >> 1);
      if ((y & 1) == 1) oy1++;
      size_t oy0 = oy1 - 1;
      const float* in0 = temp->Row(oy0);
      const float* in1 = temp->Row(oy1);
      float* out = plane_out->Row(y);
      if ((y & 1) == 1) {
        for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d));
             x += Lanes(d)) {
          auto i0 = Load(d, in0 + x);
          auto i1 = Load(d, in1 + x);
          auto o = MulAdd(i0, threefour, i1 * onefour);
          Store(o, d, out + x);
        }
      } else {
        for (size_t x = x0 / Lanes(d) * Lanes(d); x < RoundUpTo(x1, Lanes(d));
             x += Lanes(d)) {
          auto i0 = Load(d, in0 + x);
          auto i1 = Load(d, in1 + x);
          auto o = MulAdd(i0, onefour, i1 * threefour);
          Store(o, d, out + x);
        }
      }
    }
  } else {
    CopyImageTo(*temp, plane_out);
  }

  // The output must be initialized including the lf.Padding() around the image
  // for all the pixels that fall inside the image frame.
  JXL_CHECK_IMAGE_INITIALIZED(*plane_out, Rect(x0, y0, x1 - x0, y1 - y0));
}
405
406 // NOLINTNEXTLINE(google-readability-namespace-comments)
407 } // namespace HWY_NAMESPACE
408 } // namespace jxl
409 HWY_AFTER_NAMESPACE();
410
411 #if HWY_ONCE
412 namespace jxl {
413
414 HWY_EXPORT(UndoXYBInPlace);
415 HWY_EXPORT(FloatToRGBA8);
416 HWY_EXPORT(DoYCbCrUpsampling);
417
UndoXYB(const Image3F & src,Image3F * dst,const OutputEncodingInfo & output_info,ThreadPool * pool)418 void UndoXYB(const Image3F& src, Image3F* dst,
419 const OutputEncodingInfo& output_info, ThreadPool* pool) {
420 CopyImageTo(src, dst);
421 pool->Run(0, src.ysize(), ThreadPool::SkipInit(), [&](int y, int /*thread*/) {
422 JXL_CHECK(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(dst, Rect(*dst).Line(y),
423 output_info));
424 });
425 }
426
427 namespace {
ScaleRectForEC(Rect in,const FrameHeader & frame_header,size_t ec)428 Rect ScaleRectForEC(Rect in, const FrameHeader& frame_header, size_t ec) {
429 auto s = [&](size_t x) {
430 return DivCeil(x * frame_header.upsampling,
431 frame_header.extra_channel_upsampling[ec]);
432 };
433 // For x0 and y0 the DivCeil is actually an exact division.
434 return Rect(s(in.x0()), s(in.y0()), s(in.xsize()), s(in.ysize()));
435 }
436
// Implements EnsurePaddingInPlace, but allows processing data one row at a
// time.
class EnsurePaddingInPlaceRowByRow {
  // Shared setup: computes the horizontal fill ranges [x0_, x1_) and
  // [x2_, x3_), picks a mirroring strategy, and reports in *y0/*y1 the
  // (rect-relative) row range the caller must feed to Process/Process3.
  void Init(const Rect& rect, const Rect& image_rect, size_t image_xsize,
            size_t image_ysize, size_t xpadding, size_t ypadding, ssize_t* y0,
            ssize_t* y1) {
    // coordinates relative to rect.
    JXL_ASSERT(SameSize(rect, image_rect));
    JXL_ASSERT(image_rect.x0() + image_rect.xsize() <= image_xsize);
    JXL_ASSERT(image_rect.y0() + image_rect.ysize() <= image_ysize);
    // Clamp the vertical padding to what actually fits inside the image.
    *y0 = -std::min(image_rect.y0(), ypadding);
    *y1 = rect.ysize() + std::min(ypadding, image_ysize - image_rect.ysize() -
                                                image_rect.y0());
    if (image_rect.x0() >= xpadding &&
        image_rect.x0() + image_rect.xsize() + xpadding <= image_xsize) {
      // Nothing to do.
      strategy_ = kSkip;
    } else if (image_xsize >= 2 * xpadding) {
      strategy_ = kFast;
    } else {
      strategy_ = kSlow;
    }
    y0_ = rect.y0();
    JXL_ASSERT(rect.x0() >= xpadding);
    x0_ = x1_ = rect.x0() - xpadding;
    // If close to the left border - do mirroring.
    if (image_rect.x0() < xpadding) x1_ = rect.x0() - image_rect.x0();
    x2_ = x3_ = rect.x0() + rect.xsize() + xpadding;
    // If close to the right border - do mirroring.
    if (image_rect.x0() + image_rect.xsize() + xpadding > image_xsize) {
      x2_ = rect.x0() + image_xsize - image_rect.x0();
    }
    JXL_ASSERT(x0_ <= x1_);
    JXL_ASSERT(x1_ <= x2_);
    JXL_ASSERT(x2_ <= x3_);
    JXL_ASSERT(image_xsize == (x2_ - x1_) ||
               (x1_ - x0_ <= x2_ - x1_ && x3_ - x2_ <= x2_ - x1_));
  }

 public:
  // Init overload for 3-plane images; rows are then padded via Process3.
  void Init(Image3F* img, const Rect& rect, const Rect& image_rect,
            size_t image_xsize, size_t image_ysize, size_t xpadding,
            size_t ypadding, ssize_t* y0, ssize_t* y1) {
    Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
         y1);
    img3_ = img;
    JXL_DASSERT(x3_ <= img->xsize());
  }
  // Init overload for single-plane images; rows are then padded via Process.
  void Init(ImageF* img, const Rect& rect, const Rect& image_rect,
            size_t image_xsize, size_t image_ysize, size_t xpadding,
            size_t ypadding, ssize_t* y0, ssize_t* y1) {
    Init(rect, image_rect, image_xsize, image_ysize, xpadding, ypadding, y0,
         y1);
    img_ = img;
    JXL_DASSERT(x3_ <= img->xsize());
  }
  // To be called when row `y` of the input is available, for all the values in
  // [*y0, *y1).
  void Process3(ssize_t y) {
    JXL_DASSERT(img3_);
    for (size_t c = 0; c < 3; c++) {
      img_ = &img3_->Plane(c);
      Process(y);
    }
  }
  // Fills the horizontal padding of row `y` (relative coordinates) according
  // to the strategy chosen in Init.
  void Process(ssize_t y) {
    JXL_DASSERT(img_);
    switch (strategy_) {
      case kSkip:
        break;
      case kFast: {
        // Image is wide enough that a single Mirror() step is sufficient.
        float* JXL_RESTRICT row = img_->Row(y + y0_);
        for (size_t x = x0_; x < x1_; x++) {
          row[x] = row[2 * x1_ - x - 1];
        }
        for (size_t x = x2_; x < x3_; x++) {
          row[x] = row[2 * x2_ - x - 1];
        }
        break;
      }
      case kSlow: {
        // Slow case for small images.
        float* JXL_RESTRICT row = img_->Row(y + y0_) + x1_;
        for (ssize_t x = x0_ - x1_; x < 0; x++) {
          *(row + x) = row[Mirror(x, x2_ - x1_)];
        }
        for (size_t x = x2_ - x1_; x < x3_ - x1_; x++) {
          *(row + x) = row[Mirror(x, x2_ - x1_)];
        }
        break;
      }
    }
  }

 private:
  // Initialized to silence spurious compiler warnings.
  Image3F* img3_ = nullptr;
  ImageF* img_ = nullptr;
  // Will fill [x0_, x1_) and [x2_, x3_) on every row.
  // The [x1_, x2_) range contains valid image pixels. We guarantee that either
  // x1_ - x0_ <= x2_ - x1_, (and similarly for x2_, x3_), or that the [x1_,
  // x2_) contains a full horizontal line of the original image.
  size_t x0_ = 0, x1_ = 0, x2_ = 0, x3_ = 0;
  size_t y0_ = 0;
  // kSlow: use calls to Mirror(), for the case where the border might be larger
  // than the image.
  // kFast: directly use the result of Mirror() when it can be computed in a
  // single iteration.
  // kSkip: do nothing.
  enum Strategy { kFast, kSlow, kSkip };
  Strategy strategy_ = kSkip;
};
550 } // namespace
551
EnsurePaddingInPlace(Image3F * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)552 void EnsurePaddingInPlace(Image3F* img, const Rect& rect,
553 const Rect& image_rect, size_t image_xsize,
554 size_t image_ysize, size_t xpadding,
555 size_t ypadding) {
556 ssize_t y0, y1;
557 EnsurePaddingInPlaceRowByRow impl;
558 impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
559 &y0, &y1);
560 for (ssize_t y = y0; y < y1; y++) {
561 impl.Process3(y);
562 }
563 }
564
EnsurePaddingInPlace(ImageF * img,const Rect & rect,const Rect & image_rect,size_t image_xsize,size_t image_ysize,size_t xpadding,size_t ypadding)565 void EnsurePaddingInPlace(ImageF* img, const Rect& rect, const Rect& image_rect,
566 size_t image_xsize, size_t image_ysize,
567 size_t xpadding, size_t ypadding) {
568 ssize_t y0, y1;
569 EnsurePaddingInPlaceRowByRow impl;
570 impl.Init(img, rect, image_rect, image_xsize, image_ysize, xpadding, ypadding,
571 &y0, &y1);
572 for (ssize_t y = y0; y < y1; y++) {
573 impl.Process(y);
574 }
575 }
576
FinalizeImageRect(Image3F * input_image,const Rect & input_rect,const std::vector<std::pair<ImageF *,Rect>> & extra_channels,PassesDecoderState * dec_state,size_t thread,ImageBundle * JXL_RESTRICT output_image,const Rect & frame_rect)577 Status FinalizeImageRect(
578 Image3F* input_image, const Rect& input_rect,
579 const std::vector<std::pair<ImageF*, Rect>>& extra_channels,
580 PassesDecoderState* dec_state, size_t thread,
581 ImageBundle* JXL_RESTRICT output_image, const Rect& frame_rect) {
582 const ImageFeatures& image_features = dec_state->shared->image_features;
583 const FrameHeader& frame_header = dec_state->shared->frame_header;
584 const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
585 const LoopFilter& lf = frame_header.loop_filter;
586 const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
587 JXL_DASSERT(frame_rect.xsize() <= kApplyImageFeaturesTileDim);
588 JXL_DASSERT(frame_rect.ysize() <= kApplyImageFeaturesTileDim);
589 JXL_DASSERT(input_rect.xsize() == frame_rect.xsize());
590 JXL_DASSERT(input_rect.ysize() == frame_rect.ysize());
591 JXL_DASSERT(frame_rect.x0() % GroupBorderAssigner::kPaddingXRound == 0);
592 JXL_DASSERT(frame_rect.xsize() % GroupBorderAssigner::kPaddingXRound == 0 ||
593 frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize ||
594 frame_rect.xsize() + frame_rect.x0() == frame_dim.xsize_padded);
595
596 // +----------------------------- STEP 1 ------------------------------+
597 // | Compute the rects on which patches and splines will be applied. |
598 // | In case we are applying upsampling, we need to apply patches on a |
599 // | slightly larger image. |
600 // +-------------------------------------------------------------------+
601
602 // If we are applying upsampling, we need 2 more pixels around the actual rect
603 // for border. Thus, we also need to apply patches and splines to those
604 // pixels. We compute here
605 // - The portion of image that corresponds to the area we are applying IF.
606 // (rect_for_if)
607 // - The rect where that pixel data is stored in upsampling_input_storage.
608 // (rect_for_if_storage)
609 // - The rect where the pixel data that we need to upsample is stored.
610 // (rect_for_upsampling)
611 // - The source rect for the pixel data in `input_image`. It is assumed that,
612 // if `frame_rect` is not on an image border, `input_image:input_rect` has
613 // enough border available. (rect_for_if_input)
614
615 Image3F* output_color =
616 dec_state->rgb_output == nullptr && dec_state->pixel_callback == nullptr
617 ? output_image->color()
618 : nullptr;
619
620 Image3F* storage_for_if = output_color;
621 Rect rect_for_if = frame_rect;
622 Rect rect_for_if_storage = frame_rect;
623 Rect rect_for_upsampling = frame_rect;
624 Rect rect_for_if_input = input_rect;
625 // The same as rect_for_if_input but in the frame coordinates.
626 Rect frame_rect_for_ycbcr_upsampling = frame_rect;
627 size_t extra_rows_t = 0;
628 size_t extra_rows_b = 0;
629 if (frame_header.upsampling != 1) {
630 size_t ifbx0 = 0;
631 size_t ifbx1 = 0;
632 size_t ifby0 = 0;
633 size_t ifby1 = 0;
634 if (frame_rect.x0() >= 2) {
635 JXL_DASSERT(input_rect.x0() >= 2);
636 ifbx0 = 2;
637 }
638 if (frame_rect.y0() >= 2) {
639 JXL_DASSERT(input_rect.y0() >= 2);
640 extra_rows_t = ifby0 = 2;
641 }
642 for (size_t extra : {1, 2}) {
643 if (frame_rect.x0() + frame_rect.xsize() + extra <=
644 dec_state->shared->frame_dim.xsize_padded) {
645 JXL_DASSERT(input_rect.x0() + input_rect.xsize() + extra <=
646 input_image->xsize());
647 ifbx1 = extra;
648 }
649 if (frame_rect.y0() + frame_rect.ysize() + extra <=
650 dec_state->shared->frame_dim.ysize_padded) {
651 JXL_DASSERT(input_rect.y0() + input_rect.ysize() + extra <=
652 input_image->ysize());
653 extra_rows_b = ifby1 = extra;
654 }
655 }
656 rect_for_if = Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
657 frame_rect.xsize() + ifbx0 + ifbx1,
658 frame_rect.ysize() + ifby0 + ifby1);
659 // Storage for pixel data does not necessarily start at (0, 0) as we need to
660 // have the left border of upsampling_rect aligned to a multiple of
661 // GroupBorderAssigner::kPaddingXRound.
662 rect_for_if_storage =
663 Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound) -
664 ifbx0,
665 kBlockDim, rect_for_if.xsize(), rect_for_if.ysize());
666 rect_for_upsampling =
667 Rect(kBlockDim + RoundUpTo(ifbx0, GroupBorderAssigner::kPaddingXRound),
668 kBlockDim + ifby0, frame_rect.xsize(), frame_rect.ysize());
669 rect_for_if_input =
670 Rect(input_rect.x0() - ifbx0, input_rect.y0() - ifby0,
671 rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
672 frame_rect_for_ycbcr_upsampling =
673 Rect(frame_rect.x0() - ifbx0, frame_rect.y0() - ifby0,
674 rect_for_if_input.xsize(), rect_for_if_input.ysize());
675 storage_for_if = &dec_state->upsampling_input_storage[thread];
676 }
677
678 // +--------------------------- STEP 1.5 ------------------------------+
679 // | Perform YCbCr upsampling if needed. |
680 // +-------------------------------------------------------------------+
681
682 Image3F* input = input_image;
683 if (!frame_header.chroma_subsampling.Is444()) {
684 for (size_t c = 0; c < 3; c++) {
685 size_t vs = frame_header.chroma_subsampling.VShift(c);
686 size_t hs = frame_header.chroma_subsampling.HShift(c);
687 // The per-thread output is used for the first time here. Poison the temp
688 // image on this thread to prevent leaking initialized data from a
689 // previous run in this thread in msan builds.
690 msan::PoisonImage(dec_state->ycbcr_out_images[thread].Plane(c));
691 HWY_DYNAMIC_DISPATCH(DoYCbCrUpsampling)
692 (hs, vs, &input_image->Plane(c), rect_for_if_input,
693 frame_rect_for_ycbcr_upsampling, frame_dim,
694 &dec_state->ycbcr_out_images[thread].Plane(c), lf,
695 &dec_state->ycbcr_temp_images[thread]);
696 }
697 input = &dec_state->ycbcr_out_images[thread];
698 }
699
700 // Variables for upsampling and filtering.
701 Rect upsampled_frame_rect(frame_rect.x0() * frame_header.upsampling,
702 frame_rect.y0() * frame_header.upsampling,
703 frame_rect.xsize() * frame_header.upsampling,
704 frame_rect.ysize() * frame_header.upsampling);
705 Rect full_frame_rect(0, 0, frame_dim.xsize_upsampled,
706 frame_dim.ysize_upsampled);
707 upsampled_frame_rect = upsampled_frame_rect.Crop(full_frame_rect);
708 EnsurePaddingInPlaceRowByRow ensure_padding_upsampling;
709 ssize_t ensure_padding_upsampling_y0 = 0;
710 ssize_t ensure_padding_upsampling_y1 = 0;
711
712 EnsurePaddingInPlaceRowByRow ensure_padding_filter;
713 FilterPipeline* fp = nullptr;
714 ssize_t ensure_padding_filter_y0 = 0;
715 ssize_t ensure_padding_filter_y1 = 0;
716 if (lf.epf_iters != 0 || lf.gab) {
717 fp = &dec_state->filter_pipelines[thread];
718 }
719
720 // +----------------------------- STEP 2 ------------------------------+
721 // | Change rects and buffer to not use `output_image` if direct |
722 // | output to rgb8 is requested. |
723 // +-------------------------------------------------------------------+
724 Image3F* output_pixel_data_storage = output_color;
725 Rect upsampled_frame_rect_for_storage = upsampled_frame_rect;
726 if (dec_state->rgb_output || dec_state->pixel_callback) {
727 size_t log2_upsampling = CeilLog2Nonzero(frame_header.upsampling);
728 if (storage_for_if == output_color) {
729 storage_for_if =
730 &dec_state->output_pixel_data_storage[log2_upsampling][thread];
731 rect_for_if_storage =
732 Rect(0, 0, rect_for_if_storage.xsize(), rect_for_if_storage.ysize());
733 }
734 output_pixel_data_storage =
735 &dec_state->output_pixel_data_storage[log2_upsampling][thread];
736 upsampled_frame_rect_for_storage =
737 Rect(0, 0, upsampled_frame_rect.xsize(), upsampled_frame_rect.ysize());
738 if (frame_header.upsampling == 1 && fp == nullptr) {
739 upsampled_frame_rect_for_storage = rect_for_if_storage =
740 rect_for_if_input;
741 output_pixel_data_storage = storage_for_if = input;
742 }
743 }
744 // Set up alpha channel.
745 const size_t ec =
746 metadata.Find(ExtraChannel::kAlpha) - metadata.extra_channel_info.data();
747 const ImageF* alpha = nullptr;
748 Rect alpha_rect = upsampled_frame_rect;
749 if (ec < metadata.extra_channel_info.size()) {
750 JXL_ASSERT(ec < extra_channels.size());
751 if (frame_header.extra_channel_upsampling[ec] == 1) {
752 alpha = extra_channels[ec].first;
753 alpha_rect = extra_channels[ec].second;
754 } else {
755 alpha = &output_image->extra_channels()[ec];
756 alpha_rect = upsampled_frame_rect;
757 }
758 }
759
760 // +----------------------------- STEP 3 ------------------------------+
761 // | Set up upsampling and upsample extra channels. |
762 // +-------------------------------------------------------------------+
763 Upsampler* color_upsampler = nullptr;
764 if (frame_header.upsampling != 1) {
765 color_upsampler =
766 &dec_state->upsamplers[CeilLog2Nonzero(frame_header.upsampling) - 1];
767 ensure_padding_upsampling.Init(
768 storage_for_if, rect_for_upsampling, frame_rect, frame_dim.xsize_padded,
769 frame_dim.ysize_padded, 2, 2, &ensure_padding_upsampling_y0,
770 &ensure_padding_upsampling_y1);
771 }
772
773 std::vector<std::pair<ImageF*, Rect>> extra_channels_for_patches;
774 std::vector<EnsurePaddingInPlaceRowByRow> ec_padding;
775
776 bool late_ec_upsample = frame_header.upsampling != 1;
777 for (auto ecups : frame_header.extra_channel_upsampling) {
778 if (ecups != frame_header.upsampling) {
779 // If patches are applied, either frame_header.upsampling == 1 or
780 // late_ec_upsample is true.
781 late_ec_upsample = false;
782 }
783 }
784
785 ssize_t ensure_padding_upsampling_ec_y0 = 0;
786 ssize_t ensure_padding_upsampling_ec_y1 = 0;
787
788 // TODO(veluca) do not upsample extra channels to a full-image-sized buffer if
789 // we are not outputting to an ImageBundle.
790 if (!late_ec_upsample) {
791 // Upsample extra channels first if not all channels have the same
792 // upsampling factor.
793 for (size_t ec = 0; ec < extra_channels.size(); ec++) {
794 size_t ecups = frame_header.extra_channel_upsampling[ec];
795 if (ecups == 1) {
796 extra_channels_for_patches.push_back(extra_channels[ec]);
797 continue;
798 }
799 ssize_t ensure_padding_y0, ensure_padding_y1;
800 EnsurePaddingInPlaceRowByRow ensure_padding;
801 // frame_rect can go up to frame_dim.xsize_padded, in VarDCT mode.
802 Rect ec_image_rect = ScaleRectForEC(
803 frame_rect.Crop(frame_dim.xsize, frame_dim.ysize), frame_header, ec);
804 size_t ecxs = DivCeil(frame_dim.xsize_upsampled,
805 frame_header.extra_channel_upsampling[ec]);
806 size_t ecys = DivCeil(frame_dim.ysize_upsampled,
807 frame_header.extra_channel_upsampling[ec]);
808 ensure_padding.Init(extra_channels[ec].first, extra_channels[ec].second,
809 ec_image_rect, ecxs, ecys, 2, 2, &ensure_padding_y0,
810 &ensure_padding_y1);
811 for (ssize_t y = ensure_padding_y0; y < ensure_padding_y1; y++) {
812 ensure_padding.Process(y);
813 }
814 Upsampler& upsampler =
815 dec_state->upsamplers[CeilLog2Nonzero(
816 frame_header.extra_channel_upsampling[ec]) -
817 1];
818 upsampler.UpsampleRect(
819 *extra_channels[ec].first, extra_channels[ec].second,
820 &output_image->extra_channels()[ec], upsampled_frame_rect,
821 static_cast<ssize_t>(ec_image_rect.y0()) -
822 static_cast<ssize_t>(extra_channels[ec].second.y0()),
823 ecys, dec_state->upsampler_storage[thread].get());
824 extra_channels_for_patches.emplace_back(
825 &output_image->extra_channels()[ec], upsampled_frame_rect);
826 }
827 } else {
828 // Upsample extra channels last if color channels are upsampled and all the
829 // extra channels have the same upsampling as them.
830 ec_padding.resize(extra_channels.size());
831 for (size_t ec = 0; ec < extra_channels.size(); ec++) {
832 // Add a border to the extra channel rect for when patches are applied.
833 // This ensures that the correct row is accessed (y values for patches are
834 // relative to rect_for_if, not to input_rect).
835 // As the rect is extended by 0 or 2 pixels, and the patches input has,
836 // accordingly, the same padding, this is safe.
837 Rect r(extra_channels[ec].second.x0() + rect_for_upsampling.x0() -
838 rect_for_if_storage.x0(),
839 extra_channels[ec].second.y0() + rect_for_upsampling.y0() -
840 rect_for_if_storage.y0(),
841 extra_channels[ec].second.xsize() + rect_for_if_storage.xsize() -
842 rect_for_upsampling.xsize(),
843 extra_channels[ec].second.ysize() + rect_for_if_storage.ysize() -
844 rect_for_upsampling.ysize());
845 extra_channels_for_patches.emplace_back(extra_channels[ec].first, r);
846 // frame_rect can go up to frame_dim.xsize_padded, in VarDCT mode.
847 ec_padding[ec].Init(extra_channels[ec].first, extra_channels[ec].second,
848 frame_rect.Crop(frame_dim.xsize, frame_dim.ysize),
849 frame_dim.xsize, frame_dim.ysize, 2, 2,
850 &ensure_padding_upsampling_ec_y0,
851 &ensure_padding_upsampling_ec_y1);
852 }
853 }
854
855 // Initialized to a valid non-null ptr to avoid UB if arithmetic is done with
856 // the pointer value (which would then not be used).
857 std::vector<float*> ec_ptrs_for_patches(extra_channels.size(),
858 input->PlaneRow(0, 0));
859
860 // +----------------------------- STEP 4 ------------------------------+
861 // | Set up the filter pipeline. |
862 // +-------------------------------------------------------------------+
863 if (fp) {
864 ensure_padding_filter.Init(
865 input, rect_for_if_input, rect_for_if, frame_dim.xsize_padded,
866 frame_dim.ysize_padded, lf.Padding(), lf.Padding(),
867 &ensure_padding_filter_y0, &ensure_padding_filter_y1);
868
869 fp = PrepareFilterPipeline(dec_state, rect_for_if, *input,
870 rect_for_if_input, frame_dim.ysize_padded,
871 thread, storage_for_if, rect_for_if_storage);
872 }
873
874 // +----------------------------- STEP 5 ------------------------------+
875 // | Run the prepared pipeline of operations. |
876 // +-------------------------------------------------------------------+
877
878 // y values are relative to rect_for_if.
879 // Automatic mirroring in fp->ApplyFiltersRow() implies that we should ensure
880 // that padding for the first lines of the image is already present before
881 // calling ApplyFiltersRow() with "virtual" rows.
882 // Here we rely on the fact that virtual rows at the beginning of the image
883 // are only present if input_rect.y0() == 0.
884 ssize_t first_ensure_padding_y = ensure_padding_filter_y0;
885 if (frame_rect.y0() == 0) {
886 JXL_DASSERT(ensure_padding_filter_y0 == 0);
887 first_ensure_padding_y =
888 std::min<ssize_t>(lf.Padding(), ensure_padding_filter_y1);
889 for (ssize_t y = 0; y < first_ensure_padding_y; y++) {
890 ensure_padding_filter.Process3(y);
891 }
892 }
893
894 for (ssize_t y = -lf.Padding();
895 y < static_cast<ssize_t>(lf.Padding() + rect_for_if.ysize()); y++) {
896 if (fp) {
897 if (y >= first_ensure_padding_y && y < ensure_padding_filter_y1) {
898 ensure_padding_filter.Process3(y);
899 }
900 fp->ApplyFiltersRow(lf, dec_state->filter_weights, y);
901 } else if (output_pixel_data_storage != input) {
902 for (size_t c = 0; c < 3; c++) {
903 memcpy(rect_for_if_storage.PlaneRow(storage_for_if, c, y),
904 rect_for_if_input.ConstPlaneRow(*input, c, y),
905 rect_for_if_input.xsize() * sizeof(float));
906 }
907 }
908 if (y < static_cast<ssize_t>(lf.Padding())) continue;
909 // At this point, row `y - lf.Padding()` of `rect_for_if` has been produced
910 // by the filters.
911 ssize_t available_y = y - lf.Padding();
912 if (frame_header.upsampling == 1) {
913 for (size_t i = 0; i < extra_channels.size(); i++) {
914 ec_ptrs_for_patches[i] = extra_channels_for_patches[i].second.Row(
915 extra_channels_for_patches[i].first, available_y);
916 }
917 }
918 JXL_RETURN_IF_ERROR(image_features.patches.AddTo(
919 storage_for_if, rect_for_if_storage.Line(available_y),
920 ec_ptrs_for_patches.data(), rect_for_if.Line(available_y)));
921 image_features.splines.AddTo(storage_for_if,
922 rect_for_if_storage.Line(available_y),
923 rect_for_if.Line(available_y));
924 size_t num_ys = 1;
925 if (frame_header.upsampling != 1) {
926 // Upsampling `y` values are relative to `rect_for_upsampling`, not to
927 // `rect_for_if`.
928 ssize_t shifted_y = available_y - extra_rows_t;
929 if (shifted_y >= ensure_padding_upsampling_y0 &&
930 shifted_y < ensure_padding_upsampling_y1) {
931 ensure_padding_upsampling.Process3(shifted_y);
932 }
933 if (late_ec_upsample && shifted_y >= ensure_padding_upsampling_ec_y0 &&
934 shifted_y < ensure_padding_upsampling_ec_y1) {
935 for (size_t ec = 0; ec < extra_channels.size(); ec++) {
936 ec_padding[ec].Process(shifted_y);
937 }
938 }
939 // Upsampling will access two rows of border, so the first upsampling
940 // output will be available after shifted_y is at least 2, *unless* image
941 // height is <= 2.
942 if (shifted_y < 2 &&
943 shifted_y + 1 != static_cast<ssize_t>(frame_rect.ysize())) {
944 continue;
945 }
946 // Value relative to upsampled_frame_rect.
947 size_t input_y = std::max<ssize_t>(shifted_y - 2, 0);
948 size_t upsampled_available_y = frame_header.upsampling * input_y;
949 size_t num_input_rows = 1;
950 // If we are going to mirror the last output rows, then we already have 3
951 // input lines ready. This happens iff we did not extend rect_for_if on
952 // the bottom *and* we are at the last `y` value.
953 if (extra_rows_b != 2 &&
954 static_cast<size_t>(y) + 1 == lf.Padding() + rect_for_if.ysize()) {
955 num_input_rows = 3;
956 }
957 num_input_rows = std::min(num_input_rows, frame_dim.ysize_padded);
958 num_ys = num_input_rows * frame_header.upsampling;
959
960 if (static_cast<size_t>(upsampled_available_y) >=
961 upsampled_frame_rect.ysize()) {
962 continue;
963 }
964
965 if (upsampled_available_y + num_ys >= upsampled_frame_rect.ysize()) {
966 num_ys = upsampled_frame_rect.ysize() - upsampled_available_y;
967 }
968
969 // Upsampler takes care of mirroring, and checks "physical" boundaries.
970 Rect upsample_input_rect = rect_for_upsampling.Lines(input_y, 1);
971 color_upsampler->UpsampleRect(
972 *storage_for_if, upsample_input_rect, output_pixel_data_storage,
973 upsampled_frame_rect_for_storage.Lines(upsampled_available_y, num_ys),
974 static_cast<ssize_t>(frame_rect.y0()) -
975 static_cast<ssize_t>(rect_for_upsampling.y0()),
976 frame_dim.ysize_padded, dec_state->upsampler_storage[thread].get());
977 if (late_ec_upsample) {
978 for (size_t ec = 0; ec < extra_channels.size(); ec++) {
979 // Upsampler takes care of mirroring, and checks "physical"
980 // boundaries.
981 Rect upsample_ec_input_rect =
982 extra_channels[ec].second.Lines(input_y, 1);
983 color_upsampler->UpsampleRect(
984 *extra_channels[ec].first, upsample_ec_input_rect,
985 &output_image->extra_channels()[ec],
986 upsampled_frame_rect.Lines(upsampled_available_y, num_ys),
987 static_cast<ssize_t>(frame_rect.y0()) -
988 static_cast<ssize_t>(extra_channels[ec].second.y0()),
989 frame_dim.ysize, dec_state->upsampler_storage[thread].get());
990 }
991 }
992 available_y = upsampled_available_y;
993 }
994
995 if (static_cast<size_t>(available_y) >= upsampled_frame_rect.ysize()) {
996 continue;
997 }
998
999 // The image data is now unconditionally in
1000 // `output_image_storage:upsampled_frame_rect_for_storage`.
1001 if (frame_header.flags & FrameHeader::kNoise) {
1002 PROFILER_ZONE("AddNoise");
1003 AddNoise(image_features.noise_params,
1004 upsampled_frame_rect.Lines(available_y, num_ys),
1005 dec_state->noise,
1006 upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1007 dec_state->shared_storage.cmap, output_pixel_data_storage);
1008 }
1009
1010 if (dec_state->pre_color_transform_frame.xsize() != 0) {
1011 for (size_t c = 0; c < 3; c++) {
1012 for (size_t y = available_y; y < available_y + num_ys; y++) {
1013 float* JXL_RESTRICT row_out = upsampled_frame_rect.PlaneRow(
1014 &dec_state->pre_color_transform_frame, c, y);
1015 const float* JXL_RESTRICT row_in =
1016 upsampled_frame_rect_for_storage.ConstPlaneRow(
1017 *output_pixel_data_storage, c, y);
1018 memcpy(row_out, row_in,
1019 upsampled_frame_rect.xsize() * sizeof(*row_in));
1020 }
1021 }
1022 }
1023
1024 // We skip the color transform entirely if save_before_color_transform and
1025 // the frame is not supposed to be displayed.
1026
1027 if (dec_state->fast_xyb_srgb8_conversion) {
1028 FastXYBTosRGB8(
1029 *output_pixel_data_storage,
1030 upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1031 upsampled_frame_rect.Lines(available_y, num_ys)
1032 .Crop(Rect(0, 0, frame_dim.xsize_upsampled,
1033 frame_dim.ysize_upsampled)),
1034 alpha, alpha_rect.Lines(available_y, num_ys),
1035 dec_state->rgb_output_is_rgba, dec_state->rgb_output, frame_dim.xsize,
1036 dec_state->rgb_stride);
1037 } else {
1038 if (frame_header.needs_color_transform()) {
1039 if (frame_header.color_transform == ColorTransform::kXYB) {
1040 JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(UndoXYBInPlace)(
1041 output_pixel_data_storage,
1042 upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1043 dec_state->output_encoding_info));
1044 } else if (frame_header.color_transform == ColorTransform::kYCbCr) {
1045 YcbcrToRgb(
1046 *output_pixel_data_storage, output_pixel_data_storage,
1047 upsampled_frame_rect_for_storage.Lines(available_y, num_ys));
1048 }
1049 }
1050
1051 // TODO(veluca): all blending should happen here.
1052
1053 if (dec_state->rgb_output != nullptr) {
1054 HWY_DYNAMIC_DISPATCH(FloatToRGBA8)
1055 (*output_pixel_data_storage,
1056 upsampled_frame_rect_for_storage.Lines(available_y, num_ys),
1057 dec_state->rgb_output_is_rgba, alpha,
1058 alpha_rect.Lines(available_y, num_ys),
1059 upsampled_frame_rect.Lines(available_y, num_ys)
1060 .Crop(Rect(0, 0, frame_dim.xsize, frame_dim.ysize)),
1061 dec_state->rgb_output, dec_state->rgb_stride);
1062 }
1063 if (dec_state->pixel_callback != nullptr) {
1064 Rect alpha_line_rect = alpha_rect.Lines(available_y, num_ys);
1065 Rect color_input_line_rect =
1066 upsampled_frame_rect_for_storage.Lines(available_y, num_ys);
1067 Rect image_line_rect = upsampled_frame_rect.Lines(available_y, num_ys)
1068 .Crop(Rect(0, 0, frame_dim.xsize_upsampled,
1069 frame_dim.ysize_upsampled));
1070 const float* line_buffers[4];
1071 for (size_t iy = 0; iy < image_line_rect.ysize(); iy++) {
1072 for (size_t c = 0; c < 3; c++) {
1073 line_buffers[c] = color_input_line_rect.ConstPlaneRow(
1074 *output_pixel_data_storage, c, iy);
1075 }
1076 if (alpha) {
1077 line_buffers[3] = alpha_line_rect.ConstRow(*alpha, iy);
1078 } else {
1079 line_buffers[3] = dec_state->opaque_alpha.data();
1080 }
1081 std::vector<float>& interleaved =
1082 dec_state->pixel_callback_rows[thread];
1083 size_t j = 0;
1084 for (size_t i = 0; i < image_line_rect.xsize(); i++) {
1085 interleaved[j++] = line_buffers[0][i];
1086 interleaved[j++] = line_buffers[1][i];
1087 interleaved[j++] = line_buffers[2][i];
1088 if (dec_state->rgb_output_is_rgba) {
1089 interleaved[j++] = line_buffers[3][i];
1090 }
1091 }
1092 dec_state->pixel_callback(interleaved.data(), image_line_rect.x0(),
1093 image_line_rect.y0() + iy,
1094 image_line_rect.xsize());
1095 }
1096 }
1097 }
1098 }
1099
1100 return true;
1101 }
1102
FinalizeFrameDecoding(ImageBundle * decoded,PassesDecoderState * dec_state,ThreadPool * pool,bool force_fir,bool skip_blending)1103 Status FinalizeFrameDecoding(ImageBundle* decoded,
1104 PassesDecoderState* dec_state, ThreadPool* pool,
1105 bool force_fir, bool skip_blending) {
1106 const FrameHeader& frame_header = dec_state->shared->frame_header;
1107 const FrameDimensions& frame_dim = dec_state->shared->frame_dim;
1108
1109 // FinalizeImageRect was not yet run, or we are forcing a run.
1110 if (!dec_state->EagerFinalizeImageRect() || force_fir) {
1111 std::vector<Rect> rects_to_process;
1112 for (size_t y = 0; y < frame_dim.ysize_padded; y += kGroupDim) {
1113 for (size_t x = 0; x < frame_dim.xsize_padded; x += kGroupDim) {
1114 Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize_padded,
1115 frame_dim.ysize_padded);
1116 if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1117 rects_to_process.push_back(rect);
1118 }
1119 }
1120 const auto allocate_storage = [&](size_t num_threads) {
1121 dec_state->EnsureStorage(num_threads);
1122 return true;
1123 };
1124
1125 {
1126 std::vector<ImageF> ecs;
1127 const ImageMetadata& metadata = frame_header.nonserialized_metadata->m;
1128 for (size_t i = 0; i < metadata.num_extra_channels; i++) {
1129 if (frame_header.extra_channel_upsampling[i] == 1) {
1130 ecs.push_back(std::move(dec_state->extra_channels[i]));
1131 } else {
1132 ecs.emplace_back(frame_dim.xsize_upsampled_padded,
1133 frame_dim.ysize_upsampled_padded);
1134 }
1135 }
1136 decoded->SetExtraChannels(std::move(ecs));
1137 }
1138
1139 std::atomic<bool> apply_features_ok{true};
1140 auto run_apply_features = [&](size_t rect_id, size_t thread) {
1141 size_t xstart = PassesDecoderState::kGroupDataXBorder;
1142 size_t ystart = PassesDecoderState::kGroupDataYBorder;
1143 for (size_t c = 0; c < 3; c++) {
1144 Rect rh(rects_to_process[rect_id].x0() >>
1145 frame_header.chroma_subsampling.HShift(c),
1146 rects_to_process[rect_id].y0() >>
1147 frame_header.chroma_subsampling.VShift(c),
1148 rects_to_process[rect_id].xsize() >>
1149 frame_header.chroma_subsampling.HShift(c),
1150 rects_to_process[rect_id].ysize() >>
1151 frame_header.chroma_subsampling.VShift(c));
1152 Rect group_data_rect(xstart, ystart, rh.xsize(), rh.ysize());
1153 // Poison the image in this thread to prevent leaking initialized data
1154 // from a previous run in this thread in msan builds.
1155 msan::PoisonImage(dec_state->group_data[thread].Plane(c));
1156 CopyImageToWithPadding(
1157 rh, dec_state->decoded.Plane(c), dec_state->FinalizeRectPadding(),
1158 group_data_rect, &dec_state->group_data[thread].Plane(c));
1159 }
1160 Rect group_data_rect(xstart, ystart, rects_to_process[rect_id].xsize(),
1161 rects_to_process[rect_id].ysize());
1162 std::vector<std::pair<ImageF*, Rect>> ec_rects;
1163 ec_rects.reserve(decoded->extra_channels().size());
1164 for (size_t i = 0; i < decoded->extra_channels().size(); i++) {
1165 Rect r = ScaleRectForEC(
1166 rects_to_process[rect_id].Crop(frame_dim.xsize, frame_dim.ysize),
1167 frame_header, i);
1168 if (frame_header.extra_channel_upsampling[i] != 1) {
1169 Rect ec_input_rect(kBlockDim, 2, r.xsize(), r.ysize());
1170 auto eti =
1171 &dec_state
1172 ->ec_temp_images[thread * decoded->extra_channels().size() +
1173 i];
1174 // Poison the temp image on this thread to prevent leaking initialized
1175 // data from a previous run in this thread in msan builds.
1176 msan::PoisonImage(*eti);
1177 JXL_CHECK_IMAGE_INITIALIZED(dec_state->extra_channels[i], r);
1178 CopyImageToWithPadding(r, dec_state->extra_channels[i],
1179 /*padding=*/2, ec_input_rect, eti);
1180 ec_rects.emplace_back(eti, ec_input_rect);
1181 } else {
1182 JXL_CHECK_IMAGE_INITIALIZED(decoded->extra_channels()[i], r);
1183 ec_rects.emplace_back(&decoded->extra_channels()[i], r);
1184 }
1185 }
1186 if (!FinalizeImageRect(&dec_state->group_data[thread], group_data_rect,
1187 ec_rects, dec_state, thread, decoded,
1188 rects_to_process[rect_id])) {
1189 apply_features_ok = false;
1190 }
1191 };
1192
1193 RunOnPool(pool, 0, rects_to_process.size(), allocate_storage,
1194 run_apply_features, "ApplyFeatures");
1195
1196 if (!apply_features_ok) {
1197 return JXL_FAILURE("FinalizeImageRect failed");
1198 }
1199 }
1200
1201 const size_t xsize = frame_dim.xsize_upsampled;
1202 const size_t ysize = frame_dim.ysize_upsampled;
1203
1204 decoded->ShrinkTo(xsize, ysize);
1205 if (dec_state->pre_color_transform_frame.xsize() != 0) {
1206 dec_state->pre_color_transform_frame.ShrinkTo(xsize, ysize);
1207 }
1208
1209 if (!skip_blending && ImageBlender::NeedsBlending(dec_state)) {
1210 if (dec_state->pre_color_transform_frame.xsize() != 0) {
1211 // Extra channels are going to be modified. Make a copy.
1212 dec_state->pre_color_transform_ec.clear();
1213 for (const auto& ec : decoded->extra_channels()) {
1214 dec_state->pre_color_transform_ec.emplace_back(CopyImage(ec));
1215 }
1216 }
1217 ImageBlender blender;
1218 ImageBundle foreground = std::move(*decoded);
1219 decoded->SetFromImage(Image3F(frame_header.nonserialized_metadata->xsize(),
1220 frame_header.nonserialized_metadata->ysize()),
1221 foreground.c_current());
1222 std::vector<Rect> extra_channels_rects;
1223 decoded->extra_channels().reserve(foreground.extra_channels().size());
1224 extra_channels_rects.reserve(foreground.extra_channels().size());
1225 for (size_t i = 0; i < foreground.extra_channels().size(); ++i) {
1226 decoded->extra_channels().emplace_back(
1227 frame_header.nonserialized_metadata->xsize(),
1228 frame_header.nonserialized_metadata->ysize());
1229 extra_channels_rects.emplace_back(decoded->extra_channels().back());
1230 }
1231 JXL_RETURN_IF_ERROR(blender.PrepareBlending(
1232 dec_state, foreground.origin, foreground.xsize(), foreground.ysize(),
1233 &frame_header.nonserialized_metadata->m.extra_channel_info,
1234 foreground.c_current(), Rect(*decoded->color()),
1235 /*output=*/decoded->color(), Rect(*decoded->color()),
1236 &decoded->extra_channels(), std::move(extra_channels_rects)));
1237
1238 std::vector<Rect> rects_to_process;
1239 for (size_t y = 0; y < frame_dim.ysize; y += kGroupDim) {
1240 for (size_t x = 0; x < frame_dim.xsize; x += kGroupDim) {
1241 Rect rect(x, y, kGroupDim, kGroupDim, frame_dim.xsize, frame_dim.ysize);
1242 if (rect.xsize() == 0 || rect.ysize() == 0) continue;
1243 rects_to_process.push_back(rect);
1244 }
1245 }
1246
1247 std::atomic<bool> blending_ok{true};
1248 JXL_RETURN_IF_ERROR(RunOnPool(
1249 pool, 0, rects_to_process.size(), ThreadPool::SkipInit(),
1250 [&](size_t i, size_t /*thread*/) {
1251 const Rect& rect = rects_to_process[i];
1252 auto rect_blender = blender.PrepareRect(
1253 rect, *foreground.color(), foreground.extra_channels(), rect);
1254 for (size_t y = 0; y < rect.ysize(); ++y) {
1255 if (!rect_blender.DoBlending(y)) {
1256 blending_ok = false;
1257 return;
1258 }
1259 }
1260 },
1261 "Blend"));
1262 JXL_RETURN_IF_ERROR(blending_ok.load());
1263 }
1264
1265 return true;
1266 }
1267
1268 } // namespace jxl
1269 #endif // HWY_ONCE
1270