1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #ifndef LIB_JXL_DEC_CACHE_H_ 7 #define LIB_JXL_DEC_CACHE_H_ 8 9 #include <stdint.h> 10 11 #include <hwy/base.h> // HWY_ALIGN_MAX 12 13 #include "lib/jxl/ac_strategy.h" 14 #include "lib/jxl/base/profiler.h" 15 #include "lib/jxl/coeff_order.h" 16 #include "lib/jxl/common.h" 17 #include "lib/jxl/convolve.h" 18 #include "lib/jxl/dec_group_border.h" 19 #include "lib/jxl/dec_noise.h" 20 #include "lib/jxl/dec_upsample.h" 21 #include "lib/jxl/filters.h" 22 #include "lib/jxl/image.h" 23 #include "lib/jxl/passes_state.h" 24 #include "lib/jxl/quant_weights.h" 25 26 namespace jxl { 27 28 // Per-frame decoder state. All the images here should be accessed through a 29 // group rect (either with block units or pixel units). 30 struct PassesDecoderState { 31 PassesSharedState shared_storage; 32 // Allows avoiding copies for encoder loop. 33 const PassesSharedState* JXL_RESTRICT shared = &shared_storage; 34 35 // Upsamplers for all the possible upsampling factors (2 to 8). 36 Upsampler upsamplers[3]; 37 38 // Storage for RNG output for noise synthesis. 39 Image3F noise; 40 41 // Storage for pre-color-transform output for displayed 42 // save_before_color_transform frames. 43 Image3F pre_color_transform_frame; 44 // Non-empty (contains originals) if extra-channels were cropped. 45 std::vector<ImageF> pre_color_transform_ec; 46 47 // For ANS decoding. 48 std::vector<ANSCode> code; 49 std::vector<std::vector<uint8_t>> context_map; 50 51 // Multiplier to be applied to the quant matrices of the x channel. 52 float x_dm_multiplier; 53 float b_dm_multiplier; 54 55 // Decoded image. 56 Image3F decoded; 57 std::vector<ImageF> extra_channels; 58 59 // Borders between groups. Only allocated if `decoded` is *not* allocated. 60 // We also store the extremal borders for simplicity. Horizontal borders are 61 // stored in an image as wide as the main frame, in top-to-bottom order (top 62 // border of a group first, followed by the bottom border, followed by top 63 // border of the next group). Vertical borders are similarly stored. 64 Image3F borders_horizontal; 65 Image3F borders_vertical; 66 67 // RGB8 output buffer. If not nullptr, image data will be written to this 68 // buffer instead of being written to the output ImageBundle. The image data 69 // is assumed to have the stride given by `rgb_stride`, hence row `i` starts 70 // at position `i * rgb_stride`. 71 uint8_t* rgb_output; 72 size_t rgb_stride = 0; 73 74 // Whether to use int16 float-XYB-to-uint8-srgb conversion. 75 bool fast_xyb_srgb8_conversion; 76 77 // If true, rgb_output or callback output is RGBA using 4 instead of 3 bytes 78 // per pixel. 79 bool rgb_output_is_rgba; 80 81 // Callback for line-by-line output. 82 std::function<void(const float*, size_t, size_t, size_t)> pixel_callback; 83 // Buffer of upsampling * kApplyImageFeaturesTileDim ones. 84 std::vector<float> opaque_alpha; 85 // One row per thread 86 std::vector<std::vector<float>> pixel_callback_rows; 87 88 // Seed for noise, to have different noise per-frame. 89 size_t noise_seed = 0; 90 91 // Keep track of the transform types used. 92 std::atomic<uint32_t> used_acs{0}; 93 94 // Storage for coefficients if in "accumulate" mode. 95 std::unique_ptr<ACImage> coefficients = make_unique<ACImageT<int32_t>>(0, 0); 96 97 // Filter application pipeline used by ApplyImageFeatures. One entry is needed 98 // per thread. 99 std::vector<FilterPipeline> filter_pipelines; 100 101 // Input weights used by the filters. These are shared from multiple threads 102 // but are read-only for the filter application. 103 FilterWeights filter_weights; 104 105 // Manages the status of borders. 106 GroupBorderAssigner group_border_assigner; 107 108 // TODO(veluca): this should eventually become "iff no global modular 109 // transform was applied". EagerFinalizeImageRectPassesDecoderState110 bool EagerFinalizeImageRect() const { 111 return shared->frame_header.chroma_subsampling.Is444() && 112 shared->frame_header.encoding == FrameEncoding::kVarDCT && 113 shared->frame_header.nonserialized_metadata->m.extra_channel_info 114 .empty(); 115 } 116 117 // Amount of padding that will be accessed, in all directions, outside a rect 118 // during a call to FinalizeImageRect(). FinalizeRectPaddingPassesDecoderState119 size_t FinalizeRectPadding() const { 120 // TODO(veluca): add YCbCr upsampling here too. 121 size_t padding = shared->frame_header.loop_filter.Padding(); 122 padding += shared->frame_header.upsampling == 1 ? 0 : 2; 123 JXL_DASSERT(padding <= kMaxFinalizeRectPadding); 124 for (auto ups : shared->frame_header.extra_channel_upsampling) { 125 if (ups > 1) { 126 padding = std::max(padding, size_t{2}); 127 } 128 } 129 return padding; 130 } 131 132 // Storage for intermediate data during FinalizeRect steps. 133 // TODO(veluca): these buffers are larger than strictly necessary. 134 std::vector<Image3F> filter_input_storage; 135 std::vector<Image3F> padded_upsampling_input_storage; 136 std::vector<Image3F> upsampling_input_storage; 137 // We keep four arrays, one per upsampling level, to reduce memory usage in 138 // the common case of no upsampling. 139 std::vector<Image3F> output_pixel_data_storage[4] = {}; 140 std::vector<ImageF> ec_temp_images; 141 142 // Buffer for decoded pixel data for a group. 143 std::vector<Image3F> group_data; 144 static constexpr size_t kGroupDataYBorder = kMaxFinalizeRectPadding * 2; 145 static constexpr size_t kGroupDataXBorder = 146 RoundUpToBlockDim(kMaxFinalizeRectPadding) * 2 + kBlockDim; 147 EnsureStoragePassesDecoderState148 void EnsureStorage(size_t num_threads) { 149 // We need one filter_storage per thread, ensure we have at least that many. 150 if (shared->frame_header.loop_filter.epf_iters != 0 || 151 shared->frame_header.loop_filter.gab) { 152 if (filter_pipelines.size() < num_threads) { 153 filter_pipelines.resize(num_threads); 154 } 155 } 156 // We allocate filter_input_storage unconditionally to ensure that the image 157 // is allocated if we need it for DC upsampling. 158 for (size_t _ = filter_input_storage.size(); _ < num_threads; _++) { 159 // Extra padding along the x dimension to ensure memory accesses don't 160 // load out-of-bounds pixels. 161 filter_input_storage.emplace_back( 162 kApplyImageFeaturesTileDim + 2 * kGroupDataXBorder, 163 kApplyImageFeaturesTileDim + 2 * kGroupDataYBorder); 164 } 165 if (shared->frame_header.upsampling != 1) { 166 for (size_t _ = upsampling_input_storage.size(); _ < num_threads; _++) { 167 // At this point, we only need up to 2 pixels of border per side for 168 // upsampling, but we add an extra border for aligned access. 169 upsampling_input_storage.emplace_back( 170 kApplyImageFeaturesTileDim + 2 * kBlockDim, 171 kApplyImageFeaturesTileDim + 4); 172 padded_upsampling_input_storage.emplace_back( 173 kApplyImageFeaturesTileDim + 2 * kBlockDim, 174 kApplyImageFeaturesTileDim + 4); 175 } 176 } 177 for (size_t _ = group_data.size(); _ < num_threads; _++) { 178 group_data.emplace_back(kGroupDim + 2 * kGroupDataXBorder, 179 kGroupDim + 2 * kGroupDataYBorder); 180 #if MEMORY_SANITIZER 181 // Avoid errors due to loading vectors on the outermost padding. 182 ZeroFillImage(&group_data.back()); 183 #endif 184 } 185 if (rgb_output || pixel_callback) { 186 size_t log2_upsampling = CeilLog2Nonzero(shared->frame_header.upsampling); 187 for (size_t _ = output_pixel_data_storage[log2_upsampling].size(); 188 _ < num_threads; _++) { 189 output_pixel_data_storage[log2_upsampling].emplace_back( 190 kApplyImageFeaturesTileDim << log2_upsampling, 191 kApplyImageFeaturesTileDim << log2_upsampling); 192 } 193 opaque_alpha.resize( 194 kApplyImageFeaturesTileDim * shared->frame_header.upsampling, 1.0f); 195 if (pixel_callback) { 196 pixel_callback_rows.resize(num_threads); 197 for (size_t i = 0; i < pixel_callback_rows.size(); ++i) { 198 pixel_callback_rows[i].resize(kApplyImageFeaturesTileDim * 199 shared->frame_header.upsampling * 200 (rgb_output_is_rgba ? 4 : 3)); 201 } 202 } 203 } 204 if (shared->metadata->m.num_extra_channels * num_threads > 205 ec_temp_images.size()) { 206 ec_temp_images.resize(shared->metadata->m.num_extra_channels * 207 num_threads); 208 } 209 for (size_t i = 0; i < shared->metadata->m.num_extra_channels; i++) { 210 if (shared->frame_header.extra_channel_upsampling[i] == 1) continue; 211 // We need up to 2 pixels of padding on each side. On the x axis, we round 212 // up padding so that 0 starts at a multiple of kBlockDim. 213 size_t xs = kApplyImageFeaturesTileDim * shared->frame_header.upsampling / 214 shared->frame_header.extra_channel_upsampling[i] + 215 2 * kBlockDim; 216 size_t ys = kApplyImageFeaturesTileDim * shared->frame_header.upsampling / 217 shared->frame_header.extra_channel_upsampling[i] + 218 4; 219 for (size_t t = 0; t < num_threads; t++) { 220 auto& eti = 221 ec_temp_images[t * shared->metadata->m.num_extra_channels + i]; 222 if (eti.xsize() < xs || eti.ysize() < ys) { 223 eti = ImageF(xs, ys); 224 } 225 } 226 } 227 } 228 229 // Information for colour conversions. 230 OutputEncodingInfo output_encoding_info; 231 232 // Initializes decoder-specific structures using information from *shared. InitPassesDecoderState233 void Init() { 234 x_dm_multiplier = 235 std::pow(1 / (1.25f), shared->frame_header.x_qm_scale - 2.0f); 236 b_dm_multiplier = 237 std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f); 238 239 rgb_output = nullptr; 240 pixel_callback = nullptr; 241 rgb_output_is_rgba = false; 242 fast_xyb_srgb8_conversion = false; 243 used_acs = 0; 244 245 group_border_assigner.Init(shared->frame_dim); 246 const LoopFilter& lf = shared->frame_header.loop_filter; 247 filter_weights.Init(lf, shared->frame_dim); 248 for (auto& fp : filter_pipelines) { 249 // De-initialize FilterPipelines. 250 fp.num_filters = 0; 251 } 252 for (size_t i = 0; i < 3; i++) { 253 upsamplers[i].Init(2 << i, shared->metadata->transform_data); 254 } 255 } 256 257 // Initialize the decoder state after all of DC is decoded. InitForACPassesDecoderState258 void InitForAC(ThreadPool* pool) { 259 shared_storage.coeff_order_size = 0; 260 for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { 261 if (((1 << o) & used_acs) == 0) continue; 262 uint8_t ord = kStrategyOrder[o]; 263 shared_storage.coeff_order_size = 264 std::max(kCoeffOrderOffset[3 * (ord + 1)] * kDCTBlockSize, 265 shared_storage.coeff_order_size); 266 } 267 size_t sz = shared_storage.frame_header.passes.num_passes * 268 shared_storage.coeff_order_size; 269 if (sz > shared_storage.coeff_orders.size()) { 270 shared_storage.coeff_orders.resize(sz); 271 } 272 if (shared->frame_header.flags & FrameHeader::kNoise) { 273 noise = Image3F(shared->frame_dim.xsize_upsampled_padded, 274 shared->frame_dim.ysize_upsampled_padded); 275 size_t num_x_groups = DivCeil(noise.xsize(), kGroupDim); 276 size_t num_y_groups = DivCeil(noise.ysize(), kGroupDim); 277 PROFILER_ZONE("GenerateNoise"); 278 auto generate_noise = [&](int group_index, int _) { 279 size_t gx = group_index % num_x_groups; 280 size_t gy = group_index / num_x_groups; 281 Rect rect(gx * kGroupDim, gy * kGroupDim, kGroupDim, kGroupDim, 282 noise.xsize(), noise.ysize()); 283 RandomImage3(noise_seed + group_index, rect, &noise); 284 }; 285 RunOnPool(pool, 0, num_x_groups * num_y_groups, ThreadPool::SkipInit(), 286 generate_noise, "Generate noise"); 287 { 288 PROFILER_ZONE("High pass noise"); 289 // 4 * (1 - box kernel) 290 WeightsSymmetric5 weights{{HWY_REP4(-3.84)}, {HWY_REP4(0.16)}, 291 {HWY_REP4(0.16)}, {HWY_REP4(0.16)}, 292 {HWY_REP4(0.16)}, {HWY_REP4(0.16)}}; 293 // TODO(veluca): avoid copy. 294 // TODO(veluca): avoid having a full copy of the image in main memory. 295 ImageF noise_tmp(noise.xsize(), noise.ysize()); 296 for (size_t c = 0; c < 3; c++) { 297 Symmetric5(noise.Plane(c), Rect(noise), weights, pool, &noise_tmp); 298 std::swap(noise.Plane(c), noise_tmp); 299 } 300 noise_seed += shared->frame_dim.num_groups; 301 } 302 } 303 EnsureBordersStorage(); 304 if (!EagerFinalizeImageRect()) { 305 // decoded must be padded to a multiple of kBlockDim rows since the last 306 // rows may be used by the filters even if they are outside the frame 307 // dimension. 308 decoded = Image3F(shared->frame_dim.xsize_padded, 309 shared->frame_dim.ysize_padded); 310 } 311 #if MEMORY_SANITIZER 312 // Avoid errors due to loading vectors on the outermost padding. 313 ZeroFillImage(&decoded); 314 #endif 315 } 316 EnsureBordersStoragePassesDecoderState317 void EnsureBordersStorage() { 318 if (!EagerFinalizeImageRect()) return; 319 size_t padding = FinalizeRectPadding(); 320 size_t bordery = 2 * padding; 321 size_t borderx = padding + group_border_assigner.PaddingX(padding); 322 Rect horizontal = Rect(0, 0, shared->frame_dim.xsize_padded, 323 bordery * shared->frame_dim.ysize_groups * 2); 324 if (!SameSize(horizontal, borders_horizontal)) { 325 borders_horizontal = Image3F(horizontal.xsize(), horizontal.ysize()); 326 } 327 Rect vertical = Rect(0, 0, borderx * shared->frame_dim.xsize_groups * 2, 328 shared->frame_dim.ysize_padded); 329 if (!SameSize(vertical, borders_vertical)) { 330 borders_vertical = Image3F(vertical.xsize(), vertical.ysize()); 331 } 332 } 333 }; 334 335 // Temp images required for decoding a single group. Reduces memory allocations 336 // for large images because we only initialize min(#threads, #groups) instances. 337 struct GroupDecCache { InitOnceGroupDecCache338 void InitOnce(size_t num_passes, size_t used_acs) { 339 PROFILER_FUNC; 340 341 for (size_t i = 0; i < num_passes; i++) { 342 if (num_nzeroes[i].xsize() == 0) { 343 // Allocate enough for a whole group - partial groups on the 344 // right/bottom border just use a subset. The valid size is passed via 345 // Rect. 346 347 num_nzeroes[i] = Image3I(kGroupDimInBlocks, kGroupDimInBlocks); 348 } 349 } 350 size_t max_block_area = 0; 351 352 for (uint8_t o = 0; o < AcStrategy::kNumValidStrategies; ++o) { 353 AcStrategy acs = AcStrategy::FromRawStrategy(o); 354 if ((used_acs & (1 << o)) == 0) continue; 355 size_t area = 356 acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize; 357 max_block_area = std::max(area, max_block_area); 358 } 359 360 if (max_block_area > max_block_area_) { 361 max_block_area_ = max_block_area; 362 // We need 3x float blocks for dequantized coefficients and 1x for scratch 363 // space for transforms. 364 float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 4); 365 // We need 3x int32 or int16 blocks for quantized coefficients. 366 int32_memory_ = hwy::AllocateAligned<int32_t>(max_block_area_ * 3); 367 int16_memory_ = hwy::AllocateAligned<int16_t>(max_block_area_ * 3); 368 } 369 370 dec_group_block = float_memory_.get(); 371 scratch_space = dec_group_block + max_block_area_ * 3; 372 dec_group_qblock = int32_memory_.get(); 373 dec_group_qblock16 = int16_memory_.get(); 374 } 375 376 // Scratch space used by DecGroupImpl(). 377 float* dec_group_block; 378 int32_t* dec_group_qblock; 379 int16_t* dec_group_qblock16; 380 381 // For TransformToPixels. 382 float* scratch_space; 383 // Note that scratch_space is never used at the same time as dec_group_qblock. 384 // Moreover, only one of dec_group_qblock16 is ever used. 385 // TODO(veluca): figure out if we can save allocations. 386 387 // AC decoding 388 Image3I num_nzeroes[kMaxNumPasses]; 389 390 private: 391 hwy::AlignedFreeUniquePtr<float[]> float_memory_; 392 hwy::AlignedFreeUniquePtr<int32_t[]> int32_memory_; 393 hwy::AlignedFreeUniquePtr<int16_t[]> int16_memory_; 394 size_t max_block_area_ = 0; 395 }; 396 397 } // namespace jxl 398 399 #endif // LIB_JXL_DEC_CACHE_H_ 400