// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef LIB_JXL_DEC_CACHE_H_
#define LIB_JXL_DEC_CACHE_H_

#include <stdint.h>

#include <hwy/base.h>  // HWY_ALIGN_MAX

#include "lib/jxl/ac_strategy.h"
#include "lib/jxl/base/profiler.h"
#include "lib/jxl/coeff_order.h"
#include "lib/jxl/common.h"
#include "lib/jxl/convolve.h"
#include "lib/jxl/dec_group_border.h"
#include "lib/jxl/dec_noise.h"
#include "lib/jxl/dec_upsample.h"
#include "lib/jxl/filters.h"
#include "lib/jxl/image.h"
#include "lib/jxl/passes_state.h"
#include "lib/jxl/quant_weights.h"

namespace jxl {

// Per-frame decoder state. All the images here should be accessed through a
// group rect (either with block units or pixel units).
struct PassesDecoderState {
  PassesSharedState shared_storage;
  // Allows avoiding copies for encoder loop.
  const PassesSharedState* JXL_RESTRICT shared = &shared_storage;

  // Upsamplers for all the possible upsampling factors (2, 4 and 8).
  Upsampler upsamplers[3];

  // Storage for RNG output for noise synthesis.
  Image3F noise;

  // Storage for pre-color-transform output for displayed
  // save_before_color_transform frames.
  Image3F pre_color_transform_frame;
  // Non-empty (contains originals) if extra channels were cropped.
  std::vector<ImageF> pre_color_transform_ec;

  // For ANS decoding.
  std::vector<ANSCode> code;
  std::vector<std::vector<uint8_t>> context_map;

  // Multipliers to be applied to the quant matrices of the x and b channels.
  float x_dm_multiplier;
  float b_dm_multiplier;

  // Decoded image.
  Image3F decoded;
  std::vector<ImageF> extra_channels;

  // Borders between groups. Only allocated if `decoded` is *not* allocated.
  // We also store the extremal borders for simplicity. Horizontal borders are
  // stored in an image as wide as the main frame, in top-to-bottom order (top
  // border of a group first, followed by the bottom border, followed by top
  // border of the next group). Vertical borders are similarly stored.
  Image3F borders_horizontal;
  Image3F borders_vertical;
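  // (Layout sketch, following the ordering described above: with border
  // height `bordery`, the top border of group row `g` starts at row
  // 2 * g * bordery of `borders_horizontal` and its bottom border at row
  // (2 * g + 1) * bordery; `borders_vertical` is laid out analogously
  // along x.)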

  // RGB8 output buffer. If not nullptr, image data will be written to this
  // buffer instead of being written to the output ImageBundle. The image data
  // is assumed to have the stride given by `rgb_stride`, hence row `i` starts
  // at position `i * rgb_stride`.
  uint8_t* rgb_output;
  size_t rgb_stride = 0;
  // Whether to use the fast int16-based float-XYB-to-uint8-sRGB conversion.
  bool fast_xyb_srgb8_conversion;

  // If true, rgb_output or callback output is RGBA using 4 instead of 3 bytes
  // per pixel.
  bool rgb_output_is_rgba;

  // Callback for line-by-line output.
  std::function<void(const float*, size_t, size_t, size_t)> pixel_callback;
  // Buffer of upsampling * kApplyImageFeaturesTileDim ones.
  std::vector<float> opaque_alpha;
  // One row buffer per thread.
  std::vector<std::vector<float>> pixel_callback_rows;

  // Seed for noise, to have different noise per-frame.
  size_t noise_seed = 0;

  // Keep track of the transform types used.
  std::atomic<uint32_t> used_acs{0};

  // Storage for coefficients if in "accumulate" mode.
  std::unique_ptr<ACImage> coefficients = make_unique<ACImageT<int32_t>>(0, 0);

  // Filter application pipeline used by ApplyImageFeatures. One entry is needed
  // per thread.
  std::vector<FilterPipeline> filter_pipelines;

  // Input weights used by the filters. These are shared across multiple
  // threads but are read-only for the filter application.
  FilterWeights filter_weights;

  // Manages the status of borders.
  GroupBorderAssigner group_border_assigner;

  // TODO(veluca): this should eventually become "iff no global modular
  // transform was applied".
  bool EagerFinalizeImageRect() const {
    return shared->frame_header.chroma_subsampling.Is444() &&
           shared->frame_header.encoding == FrameEncoding::kVarDCT &&
           shared->frame_header.nonserialized_metadata->m.extra_channel_info
               .empty();
  }

  // Amount of padding that will be accessed, in all directions, outside a rect
  // during a call to FinalizeImageRect().
  size_t FinalizeRectPadding() const {
    // TODO(veluca): add YCbCr upsampling here too.
    size_t padding = shared->frame_header.loop_filter.Padding();
    padding += shared->frame_header.upsampling == 1 ? 0 : 2;
    JXL_DASSERT(padding <= kMaxFinalizeRectPadding);
    for (auto ups : shared->frame_header.extra_channel_upsampling) {
      if (ups > 1) {
        padding = std::max(padding, size_t{2});
      }
    }
    return padding;
  }
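
  // (Example, as a sketch: assuming LoopFilter::Padding() is 0 when both
  // Gaborish and EPF are disabled, a frame with 2x color upsampling needs
  // 0 + 2 = 2 pixels of padding; any upsampled extra channel likewise forces
  // at least 2 pixels.)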

  // Storage for intermediate data during FinalizeRect steps.
  // TODO(veluca): these buffers are larger than strictly necessary.
  std::vector<Image3F> filter_input_storage;
  std::vector<Image3F> padded_upsampling_input_storage;
  std::vector<Image3F> upsampling_input_storage;
  // We keep four arrays, one per upsampling level, to reduce memory usage in
  // the common case of no upsampling.
  std::vector<Image3F> output_pixel_data_storage[4] = {};
  std::vector<ImageF> ec_temp_images;

  // Buffer for decoded pixel data for a group.
  std::vector<Image3F> group_data;
  static constexpr size_t kGroupDataYBorder = kMaxFinalizeRectPadding * 2;
  static constexpr size_t kGroupDataXBorder =
      RoundUpToBlockDim(kMaxFinalizeRectPadding) * 2 + kBlockDim;
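  // (The y border is twice the maximum finalize-rect padding; the x border is
  // additionally rounded up to whole blocks and extended by kBlockDim so that
  // x == 0 of the group image lands on a block-aligned offset and vector
  // loads stay inside the allocation.)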

  void EnsureStorage(size_t num_threads) {
    // We need one FilterPipeline per thread; ensure we have at least that
    // many.
    if (shared->frame_header.loop_filter.epf_iters != 0 ||
        shared->frame_header.loop_filter.gab) {
      if (filter_pipelines.size() < num_threads) {
        filter_pipelines.resize(num_threads);
      }
    }
    // We allocate filter_input_storage unconditionally to ensure that the image
    // is allocated if we need it for DC upsampling.
    for (size_t _ = filter_input_storage.size(); _ < num_threads; _++) {
      // Extra padding along the x dimension to ensure memory accesses don't
      // load out-of-bounds pixels.
      filter_input_storage.emplace_back(
          kApplyImageFeaturesTileDim + 2 * kGroupDataXBorder,
          kApplyImageFeaturesTileDim + 2 * kGroupDataYBorder);
    }
    if (shared->frame_header.upsampling != 1) {
      for (size_t _ = upsampling_input_storage.size(); _ < num_threads; _++) {
        // At this point, we only need up to 2 pixels of border per side for
        // upsampling, but we add an extra border for aligned access.
        upsampling_input_storage.emplace_back(
            kApplyImageFeaturesTileDim + 2 * kBlockDim,
            kApplyImageFeaturesTileDim + 4);
        padded_upsampling_input_storage.emplace_back(
            kApplyImageFeaturesTileDim + 2 * kBlockDim,
            kApplyImageFeaturesTileDim + 4);
      }
    }
    for (size_t _ = group_data.size(); _ < num_threads; _++) {
      group_data.emplace_back(kGroupDim + 2 * kGroupDataXBorder,
                              kGroupDim + 2 * kGroupDataYBorder);
#if MEMORY_SANITIZER
      // Avoid errors due to loading vectors on the outermost padding.
      ZeroFillImage(&group_data.back());
#endif
    }
    if (rgb_output || pixel_callback) {
      size_t log2_upsampling = CeilLog2Nonzero(shared->frame_header.upsampling);
      for (size_t _ = output_pixel_data_storage[log2_upsampling].size();
           _ < num_threads; _++) {
        output_pixel_data_storage[log2_upsampling].emplace_back(
            kApplyImageFeaturesTileDim << log2_upsampling,
            kApplyImageFeaturesTileDim << log2_upsampling);
      }
      opaque_alpha.resize(
          kApplyImageFeaturesTileDim * shared->frame_header.upsampling, 1.0f);
      if (pixel_callback) {
        pixel_callback_rows.resize(num_threads);
        for (size_t i = 0; i < pixel_callback_rows.size(); ++i) {
          pixel_callback_rows[i].resize(kApplyImageFeaturesTileDim *
                                        shared->frame_header.upsampling *
                                        (rgb_output_is_rgba ? 4 : 3));
        }
      }
    }
    if (shared->metadata->m.num_extra_channels * num_threads >
        ec_temp_images.size()) {
      ec_temp_images.resize(shared->metadata->m.num_extra_channels *
                            num_threads);
    }
    for (size_t i = 0; i < shared->metadata->m.num_extra_channels; i++) {
      if (shared->frame_header.extra_channel_upsampling[i] == 1) continue;
      // We need up to 2 pixels of padding on each side. On the x axis, we round
      // up padding so that 0 starts at a multiple of kBlockDim.
      size_t xs = kApplyImageFeaturesTileDim * shared->frame_header.upsampling /
                      shared->frame_header.extra_channel_upsampling[i] +
                  2 * kBlockDim;
      size_t ys = kApplyImageFeaturesTileDim * shared->frame_header.upsampling /
                      shared->frame_header.extra_channel_upsampling[i] +
                  4;
      for (size_t t = 0; t < num_threads; t++) {
        auto& eti =
            ec_temp_images[t * shared->metadata->m.num_extra_channels + i];
        if (eti.xsize() < xs || eti.ysize() < ys) {
          eti = ImageF(xs, ys);
        }
      }
    }
  }
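
  // (Typical call sequence, as a sketch: the frame decoder calls Init() once
  // per frame, EnsureStorage(num_threads) once the worker count is known, and
  // InitForAC(pool) after all of DC has been decoded.)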

  // Information for colour conversions.
  OutputEncodingInfo output_encoding_info;

  // Initializes decoder-specific structures using information from *shared.
  void Init() {
    x_dm_multiplier =
        std::pow(1 / (1.25f), shared->frame_header.x_qm_scale - 2.0f);
    b_dm_multiplier =
        std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f);
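    // (Worked example: x_qm_scale == 2 yields a multiplier of 1.0; each
    // further increment scales it by 1 / 1.25 = 0.8, so x_qm_scale == 3
    // yields 0.8.)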

    rgb_output = nullptr;
    pixel_callback = nullptr;
    rgb_output_is_rgba = false;
    fast_xyb_srgb8_conversion = false;
    used_acs = 0;

    group_border_assigner.Init(shared->frame_dim);
    const LoopFilter& lf = shared->frame_header.loop_filter;
    filter_weights.Init(lf, shared->frame_dim);
    for (auto& fp : filter_pipelines) {
      // De-initialize FilterPipelines.
      fp.num_filters = 0;
    }
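    // upsamplers[i] handles upsampling factor 2 << i, i.e. 2, 4 and 8.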
    for (size_t i = 0; i < 3; i++) {
      upsamplers[i].Init(2 << i, shared->metadata->transform_data);
    }
  }

  // Initialize the decoder state after all of DC is decoded.
  void InitForAC(ThreadPool* pool) {
    shared_storage.coeff_order_size = 0;
    for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
      if (((1 << o) & used_acs) == 0) continue;
      uint8_t ord = kStrategyOrder[o];
      shared_storage.coeff_order_size =
          std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize,
                   shared_storage.coeff_order_size);
    }
    size_t sz = shared_storage.frame_header.passes.num_passes *
                shared_storage.coeff_order_size;
    if (sz > shared_storage.coeff_orders.size()) {
      shared_storage.coeff_orders.resize(sz);
    }
    if (shared->frame_header.flags & FrameHeader::kNoise) {
      noise = Image3F(shared->frame_dim.xsize_upsampled_padded,
                      shared->frame_dim.ysize_upsampled_padded);
      size_t num_x_groups = DivCeil(noise.xsize(), kGroupDim);
      size_t num_y_groups = DivCeil(noise.ysize(), kGroupDim);
      PROFILER_ZONE("GenerateNoise");
      auto generate_noise = [&](int group_index, int _) {
        size_t gx = group_index % num_x_groups;
        size_t gy = group_index / num_x_groups;
        Rect rect(gx * kGroupDim, gy * kGroupDim, kGroupDim, kGroupDim,
                  noise.xsize(), noise.ysize());
        RandomImage3(noise_seed + group_index, rect, &noise);
      };
      RunOnPool(pool, 0, num_x_groups * num_y_groups, ThreadPool::SkipInit(),
                generate_noise, "Generate noise");
      {
        PROFILER_ZONE("High pass noise");
        // -4 * (1 - box kernel); the sign flip is harmless for the
        // symmetrically distributed noise.
        WeightsSymmetric5 weights{{HWY_REP4(-3.84)}, {HWY_REP4(0.16)},
                                  {HWY_REP4(0.16)},  {HWY_REP4(0.16)},
                                  {HWY_REP4(0.16)},  {HWY_REP4(0.16)}};
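        // (Derivation: a 5x5 box kernel has 25 taps of 1/25 = 0.04 each, so
        // 4 * (1 - box) would have center weight 4 * (1 - 0.04) = 3.84 and
        // -0.16 elsewhere; the weights above are its negation.)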
        // TODO(veluca): avoid copy.
        // TODO(veluca): avoid having a full copy of the image in main memory.
        ImageF noise_tmp(noise.xsize(), noise.ysize());
        for (size_t c = 0; c < 3; c++) {
          Symmetric5(noise.Plane(c), Rect(noise), weights, pool, &noise_tmp);
          std::swap(noise.Plane(c), noise_tmp);
        }
        noise_seed += shared->frame_dim.num_groups;
      }
    }
    EnsureBordersStorage();
    if (!EagerFinalizeImageRect()) {
      // decoded must be padded to a multiple of kBlockDim rows since the last
      // rows may be used by the filters even if they are outside the frame
      // dimension.
      decoded = Image3F(shared->frame_dim.xsize_padded,
                        shared->frame_dim.ysize_padded);
    }
#if MEMORY_SANITIZER
    // Avoid errors due to loading vectors on the outermost padding.
    ZeroFillImage(&decoded);
#endif
  }

  void EnsureBordersStorage() {
    if (!EagerFinalizeImageRect()) return;
    size_t padding = FinalizeRectPadding();
    size_t bordery = 2 * padding;
    size_t borderx = padding + group_border_assigner.PaddingX(padding);
    Rect horizontal = Rect(0, 0, shared->frame_dim.xsize_padded,
                           bordery * shared->frame_dim.ysize_groups * 2);
    if (!SameSize(horizontal, borders_horizontal)) {
      borders_horizontal = Image3F(horizontal.xsize(), horizontal.ysize());
    }
    Rect vertical = Rect(0, 0, borderx * shared->frame_dim.xsize_groups * 2,
                         shared->frame_dim.ysize_padded);
    if (!SameSize(vertical, borders_vertical)) {
      borders_vertical = Image3F(vertical.xsize(), vertical.ysize());
    }
  }
};

// Temp images required for decoding a single group. Reduces memory allocations
// for large images because we only initialize min(#threads, #groups) instances.
struct GroupDecCache {
  void InitOnce(size_t num_passes, size_t used_acs) {
    PROFILER_FUNC;

    for (size_t i = 0; i < num_passes; i++) {
      if (num_nzeroes[i].xsize() == 0) {
        // Allocate enough for a whole group - partial groups on the
        // right/bottom border just use a subset. The valid size is passed via
        // Rect.

        num_nzeroes[i] = Image3I(kGroupDimInBlocks, kGroupDimInBlocks);
      }
    }
    size_t max_block_area = 0;

    for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) {
      AcStrategy acs = AcStrategy::FromRawStrategy(o);
      if ((used_acs & (1 << o)) == 0) continue;
      size_t area =
          acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
      max_block_area = std::max(area, max_block_area);
    }

    if (max_block_area > max_block_area_) {
      max_block_area_ = max_block_area;
      // We need 3x float blocks for dequantized coefficients and 1x for scratch
      // space for transforms.
      float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 4);
      // We need 3x int32 or int16 blocks for quantized coefficients.
      int32_memory_ = hwy::AllocateAligned<int32_t>(max_block_area_ * 3);
      int16_memory_ = hwy::AllocateAligned<int16_t>(max_block_area_ * 3);
    }

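    // Carve the aligned allocations into the pointers used by DecGroupImpl():
    // float_memory_ holds the 3 dequantized blocks followed by 1 block of
    // scratch space; the int32 and int16 buffers each hold 3 quantized
    // blocks.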
    dec_group_block = float_memory_.get();
    scratch_space = dec_group_block + max_block_area_ * 3;
    dec_group_qblock = int32_memory_.get();
    dec_group_qblock16 = int16_memory_.get();
  }
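
  // (Usage sketch: each worker thread keeps one GroupDecCache and calls
  // InitOnce(num_passes, used_acs) before decoding its groups; repeated calls
  // only grow the buffers, so an instance can be reused across groups.)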

  // Scratch space used by DecGroupImpl().
  float* dec_group_block;
  int32_t* dec_group_qblock;
  int16_t* dec_group_qblock16;

  // For TransformToPixels.
  float* scratch_space;
  // Note that scratch_space is never used at the same time as dec_group_qblock.
  // Moreover, only one of dec_group_qblock and dec_group_qblock16 is ever
  // used.
  // TODO(veluca): figure out if we can save allocations.

  // AC decoding
  Image3I num_nzeroes[kMaxNumPasses];

 private:
  hwy::AlignedFreeUniquePtr<float[]> float_memory_;
  hwy::AlignedFreeUniquePtr<int32_t[]> int32_memory_;
  hwy::AlignedFreeUniquePtr<int16_t[]> int16_memory_;
  size_t max_block_area_ = 0;
};

}  // namespace jxl

#endif  // LIB_JXL_DEC_CACHE_H_