1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/tile.h"
16 
17 #include <algorithm>
18 #include <array>
19 #include <cassert>
20 #include <climits>
21 #include <cstdlib>
22 #include <cstring>
23 #include <memory>
24 #include <new>
25 #include <numeric>
26 #include <type_traits>
27 #include <utility>
28 
29 #include "src/frame_scratch_buffer.h"
30 #include "src/motion_vector.h"
31 #include "src/reconstruction.h"
32 #include "src/utils/bit_mask_set.h"
33 #include "src/utils/common.h"
34 #include "src/utils/constants.h"
35 #include "src/utils/logging.h"
36 #include "src/utils/segmentation.h"
37 #include "src/utils/stack.h"
38 
39 namespace libgav1 {
40 namespace {
41 
42 // Import all the constants in the anonymous namespace.
43 #include "src/scan_tables.inc"
44 
45 // Range above kNumQuantizerBaseLevels which the exponential golomb coding
46 // process is activated.
47 constexpr int kQuantizerCoefficientBaseRange = 12;
48 constexpr int kNumQuantizerBaseLevels = 2;
49 constexpr int kCoeffBaseRangeMaxIterations =
50     kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
51 constexpr int kEntropyContextLeft = 0;
52 constexpr int kEntropyContextTop = 1;
53 
54 constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {{1, 2, 2, 2, 3},
55                                                      {2, 4, 4, 4, 5},
56                                                      {2, 4, 4, 4, 5},
57                                                      {2, 4, 4, 4, 5},
58                                                      {3, 5, 5, 5, 6}};
59 
60 // The space complexity of DFS is O(branching_factor * max_depth). For the
61 // parameter tree, branching_factor = 4 (there could be up to 4 children for
62 // every node) and max_depth (excluding the root) = 5 (to go from a 128x128
63 // block all the way to a 4x4 block). The worse-case stack size is 16, by
64 // counting the number of 'o' nodes in the diagram:
65 //
66 //   |                    128x128  The highest level (corresponding to the
67 //   |                             root of the tree) has no node in the stack.
68 //   |-----------------+
69 //   |     |     |     |
70 //   |     o     o     o  64x64
71 //   |
72 //   |-----------------+
73 //   |     |     |     |
74 //   |     o     o     o  32x32    Higher levels have three nodes in the stack,
75 //   |                             because we pop one node off the stack before
76 //   |-----------------+           pushing its four children onto the stack.
77 //   |     |     |     |
78 //   |     o     o     o  16x16
79 //   |
80 //   |-----------------+
81 //   |     |     |     |
82 //   |     o     o     o  8x8
83 //   |
84 //   |-----------------+
85 //   |     |     |     |
86 //   o     o     o     o  4x4      Only the lowest level has four nodes in the
87 //                                 stack.
88 constexpr int kDfsStackSize = 16;
89 
90 // Mask indicating whether the transform sets contain a particular transform
91 // type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
92 constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
93     BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
94     BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
95 
96 constexpr PredictionMode
97     kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
98         kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
99         kPredictionModeD157, kPredictionModeDc};
100 
101 // Mask used to determine the index for mode_deltas lookup.
102 constexpr BitMaskSet kPredictionModeDeltasMask(
103     kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
104     kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
105     kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
106     kPredictionModeNearNewMv, kPredictionModeNewNearMv,
107     kPredictionModeNewNewMv);
108 
109 // This is computed as:
110 // min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
111 constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
112     0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
113 
114 /* clang-format off */
115 constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
116     {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
117      {0, 0, 0, 0, 0}},
118     {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
119      {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
120     {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
121      {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
122     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
123      {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
124     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
125      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
126     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
127      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
128     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
129      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
130     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
131      {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
132     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
133      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
134     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
135      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
136     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
137      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
138     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
139      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
140     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
141      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
142     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
143      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
144     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
145      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
146     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
147      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
148     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
149      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
150     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
151      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
152     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
153      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
154 /* clang-format on */
155 
156 // Extended the table size from 3 to 16 by repeating the last element to avoid
157 // the clips to row or column indices.
158 constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
159     26, 31, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
160 
161 constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
162     kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
163     kPredictionModeSmooth};
164 
165 // Number of horizontal luma samples before intra block copy can be used.
166 constexpr int kIntraBlockCopyDelayPixels = 256;
167 // Number of 64 by 64 blocks before intra block copy can be used.
168 constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
169 
170 // Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
171 // height 1 << (j + 2).
172 constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
173     {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
174      kNumTransformSizes, kNumTransformSizes},
175     {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
176      kTransformSize8x32, kNumTransformSizes},
177     {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
178      kTransformSize16x32, kTransformSize16x64},
179     {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
180      kTransformSize32x32, kTransformSize32x64},
181     {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
182      kTransformSize64x32, kTransformSize64x64}};
183 
184 // Defined in section 9.3 of the spec.
185 constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
186     kTransformTypeDctDct,   kTransformTypeDctAdst,  kTransformTypeAdstDct,
187     kTransformTypeDctDct,   kTransformTypeAdstAdst, kTransformTypeDctAdst,
188     kTransformTypeAdstDct,  kTransformTypeAdstDct,  kTransformTypeDctAdst,
189     kTransformTypeAdstAdst, kTransformTypeDctAdst,  kTransformTypeAdstDct,
190     kTransformTypeAdstAdst, kTransformTypeDctDct};
191 
192 // Defined in section 5.11.47 of the spec. This array does not contain an entry
193 // for kTransformSetDctOnly, so the first dimension needs to be
194 // |kNumTransformSets| - 1.
195 constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
196     {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
197       kTransformTypeIdentityDct, kTransformTypeDctIdentity,
198       kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
199      {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
200       kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
201      {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
202       kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
203       kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
204       kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
205       kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
206       kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
207       kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
208       kTransformTypeAdstFlipadst},
209      {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
210       kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
211       kTransformTypeAdstDct, kTransformTypeDctFlipadst,
212       kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
213       kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
214       kTransformTypeAdstFlipadst},
215      {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
216 
217 // Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
218 constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
219     kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
220     kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
221     kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
222     kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
223     kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
224     kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
225     kTransformSize32x32};
226 
227 // This is the same as Max_Tx_Size_Rect array in the spec but with *x64 and 64*x
228 // transforms replaced with *x32 and 32x* respectively.
229 constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
230     kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
231     kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
232     kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
233     kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
234     kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
235     kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
236     kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
237     kTransformSize32x32};
238 
239 // ith entry of this array is computed as:
240 // DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
241 //           TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
242 //           1)
243 constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
244     0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
245 
246 constexpr int8_t kSgrProjDefaultMultiplier[2] = {-32, 31};
247 
248 constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
249 
250 // Maps compound prediction modes into single modes. For e.g.
251 // kPredictionModeNearestNewMv will map to kPredictionModeNearestMv for index 0
252 // and kPredictionModeNewMv for index 1. It is used to simplify the logic in
253 // AssignMv (and avoid duplicate code). This is section 5.11.30. in the spec.
254 constexpr PredictionMode
255     kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
256         {kPredictionModeNearestMv, kPredictionModeNearestMv},
257         {kPredictionModeNearMv, kPredictionModeNearMv},
258         {kPredictionModeNearestMv, kPredictionModeNewMv},
259         {kPredictionModeNewMv, kPredictionModeNearestMv},
260         {kPredictionModeNearMv, kPredictionModeNewMv},
261         {kPredictionModeNewMv, kPredictionModeNearMv},
262         {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
263         {kPredictionModeNewMv, kPredictionModeNewMv},
264 };
GetSinglePredictionMode(int index,PredictionMode y_mode)265 PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
266   if (y_mode < kPredictionModeNearestNearestMv) {
267     return y_mode;
268   }
269   const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
270   assert(lookup_index >= 0);
271   return kCompoundToSinglePredictionMode[lookup_index][index];
272 }
273 
274 // log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
275 // dqDenom is always a power of two and hence right shift can be used instead of
276 // division.
277 constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
278     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
279 
280 // Returns the minimum of |length| or |max|-|start|. This is used to clamp array
281 // indices when accessing arrays whose bound is equal to |max|.
GetNumElements(int length,int start,int max)282 int GetNumElements(int length, int start, int max) {
283   return std::min(length, max - start);
284 }
285 
286 template <typename T>
SetBlockValues(int rows,int columns,T value,T * dst,ptrdiff_t stride)287 void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
288   // Specialize all columns cases (values in kTransformWidth4x4[]) for better
289   // performance.
290   switch (columns) {
291     case 1:
292       MemSetBlock<T>(rows, 1, value, dst, stride);
293       break;
294     case 2:
295       MemSetBlock<T>(rows, 2, value, dst, stride);
296       break;
297     case 4:
298       MemSetBlock<T>(rows, 4, value, dst, stride);
299       break;
300     case 8:
301       MemSetBlock<T>(rows, 8, value, dst, stride);
302       break;
303     default:
304       assert(columns == 16);
305       MemSetBlock<T>(rows, 16, value, dst, stride);
306       break;
307   }
308 }
309 
SetTransformType(const Tile::Block & block,int x4,int y4,int w4,int h4,TransformType tx_type,TransformType transform_types[32][32])310 void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
311                       TransformType tx_type,
312                       TransformType transform_types[32][32]) {
313   const int y_offset = y4 - block.row4x4;
314   const int x_offset = x4 - block.column4x4;
315   TransformType* const dst = &transform_types[y_offset][x_offset];
316   SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
317 }
318 
StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,const MotionVector & mv_to_store,ptrdiff_t stride,int rows,int columns,ReferenceFrameType * reference_frame_row_start,MotionVector * mv)319 void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
320                          const MotionVector& mv_to_store, ptrdiff_t stride,
321                          int rows, int columns,
322                          ReferenceFrameType* reference_frame_row_start,
323                          MotionVector* mv) {
324   static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
325   do {
326     // Don't switch the following two memory setting functions.
327     // Some ARM CPUs are quite sensitive to the order.
328     memset(reference_frame_row_start, reference_frame_to_store, columns);
329     std::fill(mv, mv + columns, mv_to_store);
330     reference_frame_row_start += stride;
331     mv += stride;
332   } while (--rows != 0);
333 }
334 
335 // Inverse transform process assumes that the quantized coefficients are stored
336 // as a virtual 2d array of size |tx_width| x tx_height. If transform width is
337 // 64, then this assumption is broken because the scan order used for populating
338 // the coefficients for such transforms is the same as the one used for
339 // corresponding transform with width 32 (e.g. the scan order used for 64x16 is
340 // the same as the one used for 32x16). So we must restore the coefficients to
341 // their correct positions and clean the positions they occupied.
342 template <typename ResidualType>
MoveCoefficientsForTxWidth64(int clamped_tx_height,int tx_width,ResidualType * residual)343 void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
344                                   ResidualType* residual) {
345   if (tx_width != 64) return;
346   const int rows = clamped_tx_height - 2;
347   auto* src = residual + 32 * rows;
348   residual += 64 * rows;
349   // Process 2 rows in each loop in reverse order to avoid overwrite.
350   int x = rows >> 1;
351   do {
352     // The 2 rows can be processed in order.
353     memcpy(residual, src, 32 * sizeof(src[0]));
354     memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
355     memset(src + 32, 0, 32 * sizeof(src[0]));
356     src -= 64;
357     residual -= 128;
358   } while (--x);
359   // Process the second row. The first row is already correct.
360   memcpy(residual + 64, src + 32, 32 * sizeof(src[0]));
361   memset(src + 32, 0, 32 * sizeof(src[0]));
362 }
363 
GetClampParameters(const Tile::Block & block,int min[2],int max[2])364 void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
365   // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
366   // and 5.11.54).
367   constexpr int kMvBorder4x4 = 4;
368   const int row_border = kMvBorder4x4 + block.height4x4;
369   const int column_border = kMvBorder4x4 + block.width4x4;
370   const int macroblocks_to_top_edge = -block.row4x4;
371   const int macroblocks_to_bottom_edge =
372       block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
373   const int macroblocks_to_left_edge = -block.column4x4;
374   const int macroblocks_to_right_edge =
375       block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
376   min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
377   min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
378   max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
379   max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
380 }
381 
382 // Section 8.3.2 in the spec, under coeff_base_eob.
GetCoeffBaseContextEob(TransformSize tx_size,int index)383 int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
384   if (index == 0) return 0;
385   const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
386   const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
387   const int tx_height = kTransformHeight[adjusted_tx_size];
388   if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
389   if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
390   return 3;
391 }
392 
393 // Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
394 // on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
395 // the end of block case.
GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2,int pos,TransformClass tx_class)396 int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
397                                 TransformClass tx_class) {
398   if (pos == 0) return 0;
399   const int tx_width = 1 << adjusted_tx_width_log2;
400   const int row = pos >> adjusted_tx_width_log2;
401   const int column = pos & (tx_width - 1);
402   // This return statement is equivalent to:
403   // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
404   //         (tx_class == kTransformClassHorizontal && column == 0) ||
405   //         (tx_class == kTransformClassVertical && row == 0))
406   //            ? 7
407   //            : 14;
408   return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
409                  static_cast<int>((row | column) < 2)) |
410                 (tx_class & static_cast<int>(column == 0)) |
411                 ((tx_class >> 1) & static_cast<int>(row == 0)));
412 }
413 
414 }  // namespace
415 
Tile(int tile_number,const uint8_t * const data,size_t size,const ObuSequenceHeader & sequence_header,const ObuFrameHeader & frame_header,RefCountedBuffer * const current_frame,const DecoderState & state,FrameScratchBuffer * const frame_scratch_buffer,const WedgeMaskArray & wedge_masks,const QuantizerMatrix & quantizer_matrix,SymbolDecoderContext * const saved_symbol_decoder_context,const SegmentationMap * prev_segment_ids,PostFilter * const post_filter,const dsp::Dsp * const dsp,ThreadPool * const thread_pool,BlockingCounterWithStatus * const pending_tiles,bool frame_parallel,bool use_intra_prediction_buffer)416 Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
417            const ObuSequenceHeader& sequence_header,
418            const ObuFrameHeader& frame_header,
419            RefCountedBuffer* const current_frame, const DecoderState& state,
420            FrameScratchBuffer* const frame_scratch_buffer,
421            const WedgeMaskArray& wedge_masks,
422            const QuantizerMatrix& quantizer_matrix,
423            SymbolDecoderContext* const saved_symbol_decoder_context,
424            const SegmentationMap* prev_segment_ids,
425            PostFilter* const post_filter, const dsp::Dsp* const dsp,
426            ThreadPool* const thread_pool,
427            BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
428            bool use_intra_prediction_buffer)
429     : number_(tile_number),
430       row_(number_ / frame_header.tile_info.tile_columns),
431       column_(number_ % frame_header.tile_info.tile_columns),
432       data_(data),
433       size_(size),
434       read_deltas_(false),
435       subsampling_x_{0, sequence_header.color_config.subsampling_x,
436                      sequence_header.color_config.subsampling_x},
437       subsampling_y_{0, sequence_header.color_config.subsampling_y,
438                      sequence_header.color_config.subsampling_y},
439       current_quantizer_index_(frame_header.quantizer.base_index),
440       sequence_header_(sequence_header),
441       frame_header_(frame_header),
442       reference_frame_sign_bias_(state.reference_frame_sign_bias),
443       reference_frames_(state.reference_frame),
444       motion_field_(frame_scratch_buffer->motion_field),
445       reference_order_hint_(state.reference_order_hint),
446       wedge_masks_(wedge_masks),
447       quantizer_matrix_(quantizer_matrix),
448       reader_(data_, size_, frame_header_.enable_cdf_update),
449       symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
450       saved_symbol_decoder_context_(saved_symbol_decoder_context),
451       prev_segment_ids_(prev_segment_ids),
452       dsp_(*dsp),
453       post_filter_(*post_filter),
454       block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
455       quantizer_(sequence_header_.color_config.bitdepth,
456                  &frame_header_.quantizer),
457       residual_size_((sequence_header_.color_config.bitdepth == 8)
458                          ? sizeof(int16_t)
459                          : sizeof(int32_t)),
460       intra_block_copy_lag_(
461           frame_header_.allow_intrabc
462               ? (sequence_header_.use_128x128_superblock ? 3 : 5)
463               : 1),
464       current_frame_(*current_frame),
465       cdef_index_(frame_scratch_buffer->cdef_index),
466       cdef_skip_(frame_scratch_buffer->cdef_skip),
467       inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
468       thread_pool_(thread_pool),
469       residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
470       tile_scratch_buffer_pool_(
471           &frame_scratch_buffer->tile_scratch_buffer_pool),
472       pending_tiles_(pending_tiles),
473       frame_parallel_(frame_parallel),
474       use_intra_prediction_buffer_(use_intra_prediction_buffer),
475       intra_prediction_buffer_(
476           use_intra_prediction_buffer_
477               ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
478               : nullptr) {
479   row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
480   row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
481   column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
482   column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
483   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
484   const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
485   superblock_rows_ =
486       (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
487   superblock_columns_ =
488       (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
489       block_width4x4_log2;
490   // If |split_parse_and_decode_| is true, we do the necessary setup for
491   // splitting the parsing and the decoding steps. This is done in the following
492   // two cases:
493   //  1) If there is multi-threading within a tile (this is done if
494   //     |thread_pool_| is not nullptr and if there are at least as many
495   //     superblock columns as |intra_block_copy_lag_|).
496   //  2) If |frame_parallel| is true.
497   split_parse_and_decode_ = (thread_pool_ != nullptr &&
498                              superblock_columns_ > intra_block_copy_lag_) ||
499                             frame_parallel;
500   if (frame_parallel_) {
501     reference_frame_progress_cache_.fill(INT_MIN);
502   }
503   memset(delta_lf_, 0, sizeof(delta_lf_));
504   delta_lf_all_zero_ = true;
505   const YuvBuffer& buffer = post_filter_.frame_buffer();
506   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
507     // Verify that the borders are big enough for Reconstruct(). max_tx_length
508     // is the maximum value of tx_width and tx_height for the plane.
509     const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
510     // Reconstruct() may overwrite on the right. Since the right border of a
511     // row is followed in memory by the left border of the next row, the
512     // number of extra pixels to the right of a row is at least the sum of the
513     // left and right borders.
514     //
515     // Note: This assertion actually checks the sum of the left and right
516     // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
517     // and vertically shifted version of |buffer|. Since the sum of the left and
518     // right borders is not changed by the shift, we can just check the sum of
519     // the left and right borders of |buffer|.
520     assert(buffer.left_border(plane) + buffer.right_border(plane) >=
521            max_tx_length - 1);
522     // Reconstruct() may overwrite on the bottom. We need an extra border row
523     // on the bottom because we need the left border of that row.
524     //
525     // Note: This assertion checks the bottom border of
526     // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
527     // shift that the PostFilter constructor applied to |buffer| and reduce the
528     // bottom border by that amount.
529 #ifndef NDEBUG
530     const int vertical_shift = static_cast<int>(
531         (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
532         buffer.stride(plane));
533     const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
534     assert(bottom_border >= max_tx_length);
535 #endif
536     // In AV1, a transform block of height H starts at a y coordinate that is
537     // a multiple of H. If a transform block at the bottom of the frame has
538     // height H, then Reconstruct() will write up to the row with index
539     // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
540     // rows Reconstruct() may write to is
541     // Align(buffer.height(plane), max_tx_length).
542     buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
543                          buffer.stride(plane),
544                          post_filter_.GetUnfilteredBuffer(plane));
545   }
546 }
547 
Init()548 bool Tile::Init() {
549   assert(coefficient_levels_.size() == dc_categories_.size());
550   for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
551     const int contexts_per_plane = (i == kEntropyContextLeft)
552                                        ? frame_header_.rows4x4
553                                        : frame_header_.columns4x4;
554     if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
555       LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
556       return false;
557     }
558     if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
559       LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
560       return false;
561     }
562   }
563   if (split_parse_and_decode_) {
564     assert(residual_buffer_pool_ != nullptr);
565     if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
566                                          /*zero_initialize=*/false)) {
567       LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
568       return false;
569     }
570   } else {
571     // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
572     // checks when parsing quantized coefficients.
573     residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
574         32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
575     if (residual_buffer_ == nullptr) {
576       LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
577       return false;
578     }
579     prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
580     if (prediction_parameters_ == nullptr) {
581       LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
582       return false;
583     }
584   }
585   if (frame_header_.use_ref_frame_mvs) {
586     assert(sequence_header_.enable_order_hint);
587     SetupMotionField(frame_header_, current_frame_, reference_frames_,
588                      row4x4_start_, row4x4_end_, column4x4_start_,
589                      column4x4_end_, &motion_field_);
590   }
591   ResetLoopRestorationParams();
592   if (!top_context_.Resize(superblock_columns_)) {
593     LIBGAV1_DLOG(ERROR, "Allocation of top_context_ failed.");
594     return false;
595   }
596   return true;
597 }
598 
599 template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
ProcessSuperBlockRow(int row4x4,TileScratchBuffer * const scratch_buffer)600 bool Tile::ProcessSuperBlockRow(int row4x4,
601                                 TileScratchBuffer* const scratch_buffer) {
602   if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
603   assert(scratch_buffer != nullptr);
604   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
605   for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
606        column4x4 += block_width4x4) {
607     if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
608                            processing_mode)) {
609       LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
610                    row4x4, column4x4);
611       return false;
612     }
613   }
614   if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
615     SaveSymbolDecoderContext();
616   }
617   if (processing_mode == kProcessingModeDecodeOnly ||
618       processing_mode == kProcessingModeParseAndDecode) {
619     PopulateIntraPredictionBuffer(row4x4);
620   }
621   return true;
622 }
623 
624 // Used in frame parallel mode. The symbol decoder context need not be saved in
625 // this case since it was done when parsing was complete.
626 template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
627     int row4x4, TileScratchBuffer* scratch_buffer);
628 // Used in non frame parallel mode.
629 template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
630     int row4x4, TileScratchBuffer* scratch_buffer);
631 
SaveSymbolDecoderContext()632 void Tile::SaveSymbolDecoderContext() {
633   if (frame_header_.enable_frame_end_update_cdf &&
634       number_ == frame_header_.tile_info.context_update_id) {
635     *saved_symbol_decoder_context_ = symbol_decoder_context_;
636   }
637 }
638 
ParseAndDecode()639 bool Tile::ParseAndDecode() {
640   if (split_parse_and_decode_) {
641     if (!ThreadedParseAndDecode()) return false;
642     SaveSymbolDecoderContext();
643     return true;
644   }
645   std::unique_ptr<TileScratchBuffer> scratch_buffer =
646       tile_scratch_buffer_pool_->Get();
647   if (scratch_buffer == nullptr) {
648     pending_tiles_->Decrement(false);
649     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
650     return false;
651   }
652   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
653   for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
654        row4x4 += block_width4x4) {
655     if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
656             row4x4, scratch_buffer.get())) {
657       pending_tiles_->Decrement(false);
658       return false;
659     }
660   }
661   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
662   pending_tiles_->Decrement(true);
663   return true;
664 }
665 
Parse()666 bool Tile::Parse() {
667   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
668   std::unique_ptr<TileScratchBuffer> scratch_buffer =
669       tile_scratch_buffer_pool_->Get();
670   if (scratch_buffer == nullptr) {
671     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
672     return false;
673   }
674   for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
675        row4x4 += block_width4x4) {
676     if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
677             row4x4, scratch_buffer.get())) {
678       return false;
679     }
680   }
681   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
682   SaveSymbolDecoderContext();
683   return true;
684 }
685 
Decode(std::mutex * const mutex,int * const superblock_row_progress,std::condition_variable * const superblock_row_progress_condvar)686 bool Tile::Decode(
687     std::mutex* const mutex, int* const superblock_row_progress,
688     std::condition_variable* const superblock_row_progress_condvar) {
689   const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
690   const int block_width4x4_log2 =
691       sequence_header_.use_128x128_superblock ? 5 : 4;
692   std::unique_ptr<TileScratchBuffer> scratch_buffer =
693       tile_scratch_buffer_pool_->Get();
694   if (scratch_buffer == nullptr) {
695     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
696     return false;
697   }
698   for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
699        row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
700     if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
701             row4x4, scratch_buffer.get())) {
702       return false;
703     }
704     if (post_filter_.DoDeblock()) {
705       // Apply vertical deblock filtering for all the columns in this tile
706       // except for the first 64 columns.
707       post_filter_.ApplyDeblockFilter(
708           kLoopFilterTypeVertical, row4x4,
709           column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
710           block_width4x4);
711       // If this is the first superblock row of the tile, then we cannot apply
712       // horizontal deblocking here since we don't know if the top row is
713       // available. So it will be done by the calling thread in that case.
714       if (row4x4 != row4x4_start_) {
715         // Apply horizontal deblock filtering for all the columns in this tile
716         // except for the first and the last 64 columns.
717         // Note about the last tile of each row: For the last tile,
718         // column4x4_end may not be a multiple of 16. In that case it is still
719         // okay to simply subtract 16 since ApplyDeblockFilter() will only do
720         // the filters in increments of 64 columns (or 32 columns for chroma
721         // with subsampling).
722         post_filter_.ApplyDeblockFilter(
723             kLoopFilterTypeHorizontal, row4x4,
724             column4x4_start_ + kNum4x4InLoopFilterUnit,
725             column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
726       }
727     }
728     bool notify;
729     {
730       std::unique_lock<std::mutex> lock(*mutex);
731       notify = ++superblock_row_progress[index] ==
732                frame_header_.tile_info.tile_columns;
733     }
734     if (notify) {
735       // We are done decoding this superblock row. Notify the post filtering
736       // thread.
737       superblock_row_progress_condvar[index].notify_one();
738     }
739   }
740   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
741   return true;
742 }
743 
ThreadedParseAndDecode()744 bool Tile::ThreadedParseAndDecode() {
745   {
746     std::lock_guard<std::mutex> lock(threading_.mutex);
747     if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
748       pending_tiles_->Decrement(false);
749       LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
750       return false;
751     }
752     // Account for the parsing job.
753     ++threading_.pending_jobs;
754   }
755 
756   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
757 
758   // Begin parsing.
759   std::unique_ptr<TileScratchBuffer> scratch_buffer =
760       tile_scratch_buffer_pool_->Get();
761   if (scratch_buffer == nullptr) {
762     pending_tiles_->Decrement(false);
763     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
764     return false;
765   }
766   for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
767        row4x4 += block_width4x4, ++row_index) {
768     for (int column4x4 = column4x4_start_, column_index = 0;
769          column4x4 < column4x4_end_;
770          column4x4 += block_width4x4, ++column_index) {
771       if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
772                              kProcessingModeParseOnly)) {
773         std::lock_guard<std::mutex> lock(threading_.mutex);
774         threading_.abort = true;
775         break;
776       }
777       std::unique_lock<std::mutex> lock(threading_.mutex);
778       if (threading_.abort) break;
779       threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
780       // Schedule the decoding of this superblock if it is allowed.
781       if (CanDecode(row_index, column_index)) {
782         ++threading_.pending_jobs;
783         threading_.sb_state[row_index][column_index] =
784             kSuperBlockStateScheduled;
785         lock.unlock();
786         thread_pool_->Schedule(
787             [this, row_index, column_index, block_width4x4]() {
788               DecodeSuperBlock(row_index, column_index, block_width4x4);
789             });
790       }
791     }
792     std::lock_guard<std::mutex> lock(threading_.mutex);
793     if (threading_.abort) break;
794   }
795   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
796 
797   // We are done parsing. We can return here since the calling thread will make
798   // sure that it waits for all the superblocks to be decoded.
799   //
800   // Finish using |threading_| before |pending_tiles_->Decrement()| because the
801   // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
802   // is called.
803   threading_.mutex.lock();
804   const bool no_pending_jobs = (--threading_.pending_jobs == 0);
805   const bool job_succeeded = !threading_.abort;
806   threading_.mutex.unlock();
807   if (no_pending_jobs) {
808     // We are done parsing and decoding this tile.
809     pending_tiles_->Decrement(job_succeeded);
810   }
811   return job_succeeded;
812 }
813 
CanDecode(int row_index,int column_index) const814 bool Tile::CanDecode(int row_index, int column_index) const {
815   assert(row_index >= 0);
816   assert(column_index >= 0);
817   // If |threading_.sb_state[row_index][column_index]| is not equal to
818   // kSuperBlockStateParsed, then return false. This is ok because if
819   // |threading_.sb_state[row_index][column_index]| is equal to:
820   //   kSuperBlockStateNone - then the superblock is not yet parsed.
821   //   kSuperBlockStateScheduled - then the superblock is already scheduled for
822   //                               decode.
823   //   kSuperBlockStateDecoded - then the superblock has already been decoded.
824   if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
825       threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
826     return false;
827   }
828   // First superblock has no dependencies.
829   if (row_index == 0 && column_index == 0) {
830     return true;
831   }
832   // Superblocks in the first row only depend on the superblock to the left of
833   // it.
834   if (row_index == 0) {
835     return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
836   }
837   // All other superblocks depend on superblock to the left of it (if one
838   // exists) and superblock to the top right with a lag of
839   // |intra_block_copy_lag_| (if one exists).
840   const int top_right_column_index =
841       std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
842   return threading_.sb_state[row_index - 1][top_right_column_index] ==
843              kSuperBlockStateDecoded &&
844          (column_index == 0 ||
845           threading_.sb_state[row_index][column_index - 1] ==
846               kSuperBlockStateDecoded);
847 }
848 
DecodeSuperBlock(int row_index,int column_index,int block_width4x4)849 void Tile::DecodeSuperBlock(int row_index, int column_index,
850                             int block_width4x4) {
851   const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
852   const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
853   std::unique_ptr<TileScratchBuffer> scratch_buffer =
854       tile_scratch_buffer_pool_->Get();
855   bool ok = scratch_buffer != nullptr;
856   if (ok) {
857     ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
858                            kProcessingModeDecodeOnly);
859     tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
860   }
861   std::unique_lock<std::mutex> lock(threading_.mutex);
862   if (ok) {
863     threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
864     // Candidate rows and columns that we could potentially begin the decoding
865     // (if it is allowed to do so). The candidates are:
866     //   1) The superblock to the bottom-left of the current superblock with a
867     //   lag of |intra_block_copy_lag_| (or the beginning of the next superblock
868     //   row in case there are less than |intra_block_copy_lag_| superblock
869     //   columns in the Tile).
870     //   2) The superblock to the right of the current superblock.
871     const int candidate_row_indices[] = {row_index + 1, row_index};
872     const int candidate_column_indices[] = {
873         std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
874     for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
875          ++i) {
876       const int candidate_row_index = candidate_row_indices[i];
877       const int candidate_column_index = candidate_column_indices[i];
878       if (!CanDecode(candidate_row_index, candidate_column_index)) {
879         continue;
880       }
881       ++threading_.pending_jobs;
882       threading_.sb_state[candidate_row_index][candidate_column_index] =
883           kSuperBlockStateScheduled;
884       lock.unlock();
885       thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
886                               block_width4x4]() {
887         DecodeSuperBlock(candidate_row_index, candidate_column_index,
888                          block_width4x4);
889       });
890       lock.lock();
891     }
892   } else {
893     threading_.abort = true;
894   }
895   // Finish using |threading_| before |pending_tiles_->Decrement()| because the
896   // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
897   // is called.
898   const bool no_pending_jobs = (--threading_.pending_jobs == 0);
899   const bool job_succeeded = !threading_.abort;
900   lock.unlock();
901   if (no_pending_jobs) {
902     // We are done parsing and decoding this tile.
903     pending_tiles_->Decrement(job_succeeded);
904   }
905 }
906 
PopulateIntraPredictionBuffer(int row4x4)907 void Tile::PopulateIntraPredictionBuffer(int row4x4) {
908   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
909   if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
910     return;
911   }
912   const size_t pixel_size =
913       (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
914                                                    : sizeof(uint16_t));
915   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
916     const int row_to_copy =
917         (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
918     const size_t pixels_to_copy =
919         (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
920          subsampling_x_[plane]) *
921         pixel_size;
922     const size_t column_start =
923         MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
924     void* start;
925 #if LIBGAV1_MAX_BITDEPTH >= 10
926     if (sequence_header_.color_config.bitdepth > 8) {
927       Array2DView<uint16_t> buffer(
928           buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
929           reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
930       start = &buffer[row_to_copy][column_start];
931     } else  // NOLINT
932 #endif
933     {
934       start = &buffer_[plane][row_to_copy][column_start];
935     }
936     memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
937            start, pixels_to_copy);
938   }
939 }
940 
GetTransformAllZeroContext(const Block & block,Plane plane,TransformSize tx_size,int x4,int y4,int w4,int h4)941 int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
942                                      TransformSize tx_size, int x4, int y4,
943                                      int w4, int h4) {
944   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
945   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
946 
947   const int tx_width = kTransformWidth[tx_size];
948   const int tx_height = kTransformHeight[tx_size];
949   const BlockSize plane_size = block.residual_size[plane];
950   const int block_width = kBlockWidthPixels[plane_size];
951   const int block_height = kBlockHeightPixels[plane_size];
952 
953   int top = 0;
954   int left = 0;
955   const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
956   const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
957   if (plane == kPlaneY) {
958     if (block_width == tx_width && block_height == tx_height) return 0;
959     const uint8_t* coefficient_levels =
960         &coefficient_levels_[kEntropyContextTop][plane][x4];
961     for (int i = 0; i < num_top_elements; ++i) {
962       top = std::max(top, static_cast<int>(coefficient_levels[i]));
963     }
964     coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
965     for (int i = 0; i < num_left_elements; ++i) {
966       left = std::max(left, static_cast<int>(coefficient_levels[i]));
967     }
968     assert(top <= 4);
969     assert(left <= 4);
970     // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
971     // for top and left.
972     return kAllZeroContextsByTopLeft[top][left];
973   }
974   const uint8_t* coefficient_levels =
975       &coefficient_levels_[kEntropyContextTop][plane][x4];
976   const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
977   for (int i = 0; i < num_top_elements; ++i) {
978     top |= coefficient_levels[i];
979     top |= dc_categories[i];
980   }
981   coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
982   dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
983   for (int i = 0; i < num_left_elements; ++i) {
984     left |= coefficient_levels[i];
985     left |= dc_categories[i];
986   }
987   return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
988          3 * static_cast<int>(block_width * block_height >
989                               tx_width * tx_height);
990 }
991 
GetTransformSet(TransformSize tx_size,bool is_inter) const992 TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
993   const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
994   const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
995   if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
996   if (is_inter) {
997     if (frame_header_.reduced_tx_set ||
998         tx_size_square_max == kTransformSize32x32) {
999       return kTransformSetInter3;
1000     }
1001     if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
1002     return kTransformSetInter1;
1003   }
1004   if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
1005   if (frame_header_.reduced_tx_set ||
1006       tx_size_square_min == kTransformSize16x16) {
1007     return kTransformSetIntra2;
1008   }
1009   return kTransformSetIntra1;
1010 }
1011 
ComputeTransformType(const Block & block,Plane plane,TransformSize tx_size,int block_x,int block_y)1012 TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
1013                                          TransformSize tx_size, int block_x,
1014                                          int block_y) {
1015   const BlockParameters& bp = *block.bp;
1016   const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
1017   if (frame_header_.segmentation
1018           .lossless[bp.prediction_parameters->segment_id] ||
1019       tx_size_square_max == kTransformSize64x64) {
1020     return kTransformTypeDctDct;
1021   }
1022   if (plane == kPlaneY) {
1023     return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
1024   }
1025   const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
1026   TransformType tx_type;
1027   if (bp.is_inter) {
1028     const int x4 =
1029         std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
1030     const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
1031     tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
1032   } else {
1033     tx_type = kModeToTransformType[bp.prediction_parameters->uv_mode];
1034   }
1035   return kTransformTypeInSetMask[tx_set].Contains(tx_type)
1036              ? tx_type
1037              : kTransformTypeDctDct;
1038 }
1039 
ReadTransformType(const Block & block,int x4,int y4,TransformSize tx_size)1040 void Tile::ReadTransformType(const Block& block, int x4, int y4,
1041                              TransformSize tx_size) {
1042   BlockParameters& bp = *block.bp;
1043   const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
1044 
1045   TransformType tx_type = kTransformTypeDctDct;
1046   if (tx_set != kTransformSetDctOnly &&
1047       frame_header_.segmentation.qindex[bp.prediction_parameters->segment_id] >
1048           0) {
1049     const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
1050     const int cdf_tx_size_index =
1051         TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
1052     uint16_t* cdf;
1053     if (bp.is_inter) {
1054       cdf = symbol_decoder_context_
1055                 .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
1056       switch (tx_set) {
1057         case kTransformSetInter1:
1058           tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
1059           break;
1060         case kTransformSetInter2:
1061           tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
1062           break;
1063         default:
1064           assert(tx_set == kTransformSetInter3);
1065           tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
1066           break;
1067       }
1068     } else {
1069       const PredictionMode intra_direction =
1070           block.bp->prediction_parameters->use_filter_intra
1071               ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
1072                                                      ->filter_intra_mode]
1073               : bp.y_mode;
1074       cdf =
1075           symbol_decoder_context_
1076               .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
1077       assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
1078       tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
1079                                                ? reader_.ReadSymbol<7>(cdf)
1080                                                : reader_.ReadSymbol<5>(cdf));
1081     }
1082 
1083     // This array does not contain an entry for kTransformSetDctOnly, so the
1084     // first dimension needs to be offset by 1.
1085     tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
1086   }
1087   SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
1088                    kTransformHeight4x4[tx_size], tx_type, transform_types_);
1089 }
1090 
1091 // Section 8.3.2 in the spec, under coeff_base and coeff_br.
1092 // Bottom boundary checks are avoided by the padded rows.
1093 // For a coefficient near the right boundary, the two right neighbors and the
1094 // one bottom-right neighbor may be out of boundary. We don't check the right
1095 // boundary for them, because the out of boundary neighbors project to positions
1096 // above the diagonal line which goes through the current coefficient and these
1097 // positions are still all 0s according to the diagonal scan order.
1098 template <typename ResidualType>
ReadCoeffBase2D(const uint16_t * scan,TransformSize tx_size,int adjusted_tx_width_log2,int eob,uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount+1],uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount+1],ResidualType * const quantized_buffer,uint8_t * const level_buffer)1099 void Tile::ReadCoeffBase2D(
1100     const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
1101     int eob,
1102     uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
1103     uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
1104                                  [kCoeffBaseRangeSymbolCount + 1],
1105     ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
1106   const int tx_width = 1 << adjusted_tx_width_log2;
1107   for (int i = eob - 2; i >= 1; --i) {
1108     const uint16_t pos = scan[i];
1109     const int row = pos >> adjusted_tx_width_log2;
1110     const int column = pos & (tx_width - 1);
1111     auto* const quantized = &quantized_buffer[pos];
1112     auto* const levels = &level_buffer[pos];
1113     const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
1114                              levels[tx_width + 1] + levels[2] +
1115                              levels[MultiplyBy2(tx_width)];
1116     const int context =
1117         ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
1118         kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
1119     int level =
1120         reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
1121     levels[0] = level;
1122     if (level > kNumQuantizerBaseLevels) {
1123       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
1124       // + 1, because we clip the overall output to 6 and the unclipped
1125       // quantized values will always result in an output greater than 6.
1126       int context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
1127                                           quantized[tx_width] +       // {1, 0}
1128                                           quantized[tx_width + 1]));  // {1, 1}
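      // The next line adds 7 when both |row| and |column| are at most 1 (the
      // top-left 2x2 corner of the block) and 14 otherwise.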
1129       context += 14 >> static_cast<int>((row | column) < 2);
1130       level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
1131     }
1132     quantized[0] = level;
1133   }
1134   // Read position 0.
1135   {
1136     auto* const quantized = &quantized_buffer[0];
1137     int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
1138     level_buffer[0] = level;
1139     if (level > kNumQuantizerBaseLevels) {
1140       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
1141       // + 1, because we clip the overall output to 6 and the unclipped
1142       // quantized values will always result in an output greater than 6.
1143       const int context =
1144           std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
1145                                 quantized[tx_width] +       // {1, 0}
1146                                 quantized[tx_width + 1]));  // {1, 1}
1147       level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
1148     }
1149     quantized[0] = level;
1150   }
1151 }
1152 
1153 // Section 8.3.2 in the spec, under coeff_base and coeff_br.
1154 // Bottom boundary checks are avoided by the padded rows.
1155 // For a coefficient near the right boundary, the four right neighbors may be
1156 // out of boundary. We don't do the boundary check for the first three right
1157 // neighbors, because even for the transform blocks with smallest width 4, the
1158 // first three out of boundary neighbors project to positions left of the
1159 // current coefficient and these positions are still all 0s according to the
1160 // column scan order. However, when the transform block width is 4 and the
1161 // current coefficient is on the right boundary, its fourth right neighbor
1162 // projects to the position directly below it in the same column, which could
1163 // be nonzero, so that neighbor must be skipped. To keep it simple, we always
1164 // do the boundary check for the fourth right neighbor of every coefficient.
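// In this function the neighbor template is {0, 1}, {1, 0}, {0, 2}, {0, 3} and
// {0, 4} (offsets given as {row, column}); only the {0, 4} neighbor gets the
// explicit right-boundary check described above.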
1165 template <typename ResidualType>
1166 void Tile::ReadCoeffBaseHorizontal(
1167     const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
1168     int eob,
1169     uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
1170     uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
1171                                  [kCoeffBaseRangeSymbolCount + 1],
1172     ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
1173   const int tx_width = 1 << adjusted_tx_width_log2;
1174   int i = eob - 2;
1175   do {
1176     const uint16_t pos = scan[i];
1177     const int column = pos & (tx_width - 1);
1178     auto* const quantized = &quantized_buffer[pos];
1179     auto* const levels = &level_buffer[pos];
1180     const int neighbor_sum =
1181         1 + (levels[1] +                                  // {0, 1}
1182              levels[tx_width] +                           // {1, 0}
1183              levels[2] +                                  // {0, 2}
1184              levels[3] +                                  // {0, 3}
1185              ((column + 4 < tx_width) ? levels[4] : 0));  // {0, 4}
1186     const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
1187                         kCoeffBasePositionContextOffset[column];
1188     int level =
1189         reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
1190     levels[0] = level;
1191     if (level > kNumQuantizerBaseLevels) {
1192       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
1193       // + 1, because we clip the overall output to 6 and the unclipped
1194       // quantized values will always result in an output greater than 6.
1195       int context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
1196                                           quantized[tx_width] +  // {1, 0}
1197                                           quantized[2]));        // {0, 2}
1198       if (pos != 0) {
1199         context += 14 >> static_cast<int>(column == 0);
1200       }
1201       level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
1202     }
1203     quantized[0] = level;
1204   } while (--i >= 0);
1205 }
1206 
1207 // Section 8.3.2 in the spec, under coeff_base and coeff_br.
1208 // Bottom boundary checks are avoided by the padded rows.
1209 // Right boundary check is performed explicitly.
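// In this function the neighbor template is {0, 1}, {1, 0}, {2, 0}, {3, 0} and
// {4, 0} (offsets given as {row, column}); only the {0, 1} neighbor can fall
// outside the block on the right, so it is the only one that is checked.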
1210 template <typename ResidualType>
1211 void Tile::ReadCoeffBaseVertical(
1212     const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
1213     int eob,
1214     uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
1215     uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
1216                                  [kCoeffBaseRangeSymbolCount + 1],
1217     ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
1218   const int tx_width = 1 << adjusted_tx_width_log2;
1219   int i = eob - 2;
1220   do {
1221     const uint16_t pos = scan[i];
1222     const int row = pos >> adjusted_tx_width_log2;
1223     const int column = pos & (tx_width - 1);
1224     auto* const quantized = &quantized_buffer[pos];
1225     auto* const levels = &level_buffer[pos];
1226     const int neighbor_sum =
1227         1 + (((column + 1 < tx_width) ? levels[1] : 0) +  // {0, 1}
1228              levels[tx_width] +                           // {1, 0}
1229              levels[MultiplyBy2(tx_width)] +              // {2, 0}
1230              levels[tx_width * 3] +                       // {3, 0}
1231              levels[MultiplyBy4(tx_width)]);              // {4, 0}
1232     const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
1233                         kCoeffBasePositionContextOffset[row];
1234     int level =
1235         reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
1236     levels[0] = level;
1237     if (level > kNumQuantizerBaseLevels) {
1238       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
1239       // + 1, because we clip the overall output to 6 and the unclipped
1240       // quantized values will always result in an output greater than 6.
1241       const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
1242       int context =
1243           std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
1244                                 quantized[tx_width] +                // {1, 0}
1245                                 quantized[MultiplyBy2(tx_width)]));  // {2, 0}
1246       if (pos != 0) {
1247         context += 14 >> static_cast<int>(row == 0);
1248       }
1249       level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
1250     }
1251     quantized[0] = level;
1252   } while (--i >= 0);
1253 }
1254 
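// Computes the context used to read the DC sign: the per-4x4 dc categories of
// the top and left neighbors are summed, and the result maps to 0 (as many
// positive as negative DC signs), 1 (more negative) or 2 (more positive).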
1255 int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
1256   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
1257   const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
1258   // Keep dc_sign 8 bits wide so that std::accumulate() avoids sign extension.
1259   int8_t dc_sign = std::accumulate(
1260       dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
1261   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
1262   dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
1263   dc_sign = std::accumulate(
1264       dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
1265   // This return statement is equivalent to:
1266   //   if (dc_sign < 0) return 1;
1267   //   if (dc_sign > 0) return 2;
1268   //   return 0;
1269   // And it is better than:
1270   //   return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
1271   return static_cast<int>(dc_sign < 0) +
1272          MultiplyBy2(static_cast<int>(dc_sign > 0));
1273 }
1274 
1275 void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
1276                               uint8_t coefficient_level, int8_t dc_category) {
1277   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
1278   const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
1279   memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
1280          num_top_elements);
1281   memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
1282          num_top_elements);
1283   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
1284   const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
1285   memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
1286          coefficient_level, num_left_elements);
1287   memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
1288          num_left_elements);
1289 }
1290 
1291 template <typename ResidualType, bool is_dc_coefficient>
1292 bool Tile::ReadSignAndApplyDequantization(
1293     const uint16_t* const scan, int i, int q_value,
1294     const uint8_t* const quantizer_matrix, int shift, int max_value,
1295     uint16_t* const dc_sign_cdf, int8_t* const dc_category,
1296     int* const coefficient_level, ResidualType* residual_buffer) {
1297   const int pos = is_dc_coefficient ? 0 : scan[i];
1298   // If residual_buffer[pos] is zero, then the rest of the function has no
1299   // effect.
1300   int level = residual_buffer[pos];
1301   if (level == 0) return true;
1302   const int sign = is_dc_coefficient
1303                        ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
1304                        : reader_.ReadBit();
1305   if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
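    // The portion of |level| above this threshold is coded with exponential
    // golomb. For example: reading the bits 0, 0, 1 gives length == 3, the two
    // suffix bits 1, 0 then give x == 0b110 == 6, and x - 1 == 5 is added to
    // |level|.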
1306     int length = 0;
1307     bool golomb_length_bit = false;
1308     do {
1309       golomb_length_bit = reader_.ReadBit() != 0;
1310       ++length;
1311       if (length > 20) {
1312         LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
1313         return false;
1314       }
1315     } while (!golomb_length_bit);
1316     int x = 1;
1317     for (int i = length - 2; i >= 0; --i) {
1318       x = (x << 1) | reader_.ReadBit();
1319     }
1320     level += x - 1;
1321   }
1322   if (is_dc_coefficient) {
1323     *dc_category = (sign != 0) ? -1 : 1;
1324   }
1325   level &= 0xfffff;
1326   *coefficient_level += level;
1327   // Apply dequantization. Step 1 of section 7.12.3 in the spec.
1328   int q = q_value;
1329   if (quantizer_matrix != nullptr) {
1330     q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
1331   }
1332   // The intermediate multiplication can exceed 32 bits, so it has to be
1333   // performed by promoting one of the values to int64_t.
1334   int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
1335   dequantized_value >>= shift;
1336   // At this point:
1337   //   * |dequantized_value| is always non-negative.
1338   //   * |sign| can be either 0 or 1.
1339   //   * min_value = -(max_value + 1).
1340   // We need to apply the following:
1341   // dequantized_value = sign ? -dequantized_value : dequantized_value;
1342   // dequantized_value = Clip3(dequantized_value, min_value, max_value);
1343   //
1344   // Note that -x == ~(x - 1).
1345   //
1346   // The above two lines can be done with a std::min and an xor as follows:
1347   dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
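  // For example: with sign == 1 and dequantized_value == 5, the line above
  // computes std::min(4, max_value) ^ -1 == 4 ^ -1 == -5. If
  // dequantized_value - sign exceeded max_value, it would produce
  // max_value ^ -1 == -(max_value + 1) == min_value.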
1348   residual_buffer[pos] = dequantized_value;
1349   return true;
1350 }
1351 
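// Reads the coeff_br symbols for one coefficient and returns their sum. Each
// symbol contributes at most kCoeffBaseRangeSymbolCount - 1 and reading stops
// as soon as a smaller symbol is decoded, so the sum never exceeds
// kQuantizerCoefficientBaseRange.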
1352 int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
1353   int level = 0;
1354   for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
1355     const int coeff_base_range =
1356         reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
1357     level += coeff_base_range;
1358     if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
1359   }
1360   return level;
1361 }
1362 
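// Parses the quantized coefficients of one transform block, dequantizes them
// into |*block.residual| and updates the entropy contexts. Returns the
// end-of-block position (eob), 0 if the block is all zero, or -1 on failure.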
1363 template <typename ResidualType>
1364 int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
1365                                     int start_x, int start_y,
1366                                     TransformSize tx_size,
1367                                     TransformType* const tx_type) {
1368   const int x4 = DivideBy4(start_x);
1369   const int y4 = DivideBy4(start_y);
1370   const int w4 = kTransformWidth4x4[tx_size];
1371   const int h4 = kTransformHeight4x4[tx_size];
1372   const int tx_size_context = kTransformSizeContext[tx_size];
1373   int context =
1374       GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
1375   const bool all_zero = reader_.ReadSymbol(
1376       symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
1377   if (all_zero) {
1378     if (plane == kPlaneY) {
1379       SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
1380                        transform_types_);
1381     }
1382     SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
1383     // This is not used in this case, so it can be set to any value.
1384     *tx_type = kNumTransformTypes;
1385     return 0;
1386   }
1387   const int tx_width = kTransformWidth[tx_size];
1388   const int tx_height = kTransformHeight[tx_size];
1389   const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
1390   const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
1391   const int tx_padding =
1392       (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
1393   auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
1394   // Clear padding to avoid bottom boundary checks when parsing quantized
1395   // coefficients.
1396   memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
1397   uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
1398   memset(
1399       level_buffer, 0,
1400       kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
1401           tx_padding);
1402   const int clamped_tx_height = std::min(tx_height, 32);
1403   if (plane == kPlaneY) {
1404     ReadTransformType(block, x4, y4, tx_size);
1405   }
1406   BlockParameters& bp = *block.bp;
1407   *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
1408   const int eob_multi_size = kEobMultiSizeLookup[tx_size];
1409   const PlaneType plane_type = GetPlaneType(plane);
1410   const TransformClass tx_class = GetTransformClass(*tx_type);
1411   context = static_cast<int>(tx_class != kTransformClass2D);
1412   int eob_pt = 1;
1413   switch (eob_multi_size) {
1414     case 0:
1415       eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
1416           symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
1417       break;
1418     case 1:
1419       eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
1420           symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
1421       break;
1422     case 2:
1423       eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
1424           symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
1425       break;
1426     case 3:
1427       eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
1428           symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
1429       break;
1430     case 4:
1431       eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
1432           symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
1433       break;
1434     case 5:
1435       eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
1436           symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
1437       break;
1438     case 6:
1439     default:
1440       eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
1441           symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
1442       break;
1443   }
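  // |eob_pt| selects the range of the end-of-block position: eob equals eob_pt
  // when eob_pt < 2; otherwise it starts at (1 << (eob_pt - 2)) + 1 and the
  // extra bits read below refine it. For example, eob_pt == 5 gives a starting
  // value of 9 which the extra bits can raise to at most 16.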
1444   int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
1445   if (eob_pt >= 3) {
1446     context = eob_pt - 3;
1447     const bool eob_extra = reader_.ReadSymbol(
1448         symbol_decoder_context_
1449             .eob_extra_cdf[tx_size_context][plane_type][context]);
1450     if (eob_extra) eob += 1 << (eob_pt - 3);
1451     for (int i = 1; i < eob_pt - 2; ++i) {
1452       assert(eob_pt - i >= 3);
1453       assert(eob_pt <= kEobPt1024SymbolCount);
1454       if (reader_.ReadBit() != 0) {
1455         eob += 1 << (eob_pt - i - 3);
1456       }
1457     }
1458   }
1459   const uint16_t* scan = kScan[tx_class][tx_size];
1460   const int clamped_tx_size_context = std::min(tx_size_context, 3);
1461   auto coeff_base_range_cdf =
1462       symbol_decoder_context_
1463           .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
1464   // Read the last coefficient.
1465   {
1466     context = GetCoeffBaseContextEob(tx_size, eob - 1);
1467     const uint16_t pos = scan[eob - 1];
1468     int level =
1469         1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
1470                 symbol_decoder_context_
1471                     .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
1472     level_buffer[pos] = level;
1473     if (level > kNumQuantizerBaseLevels) {
1474       level +=
1475           ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
1476               adjusted_tx_width_log2, pos, tx_class)]);
1477     }
1478     residual[pos] = level;
1479   }
1480   if (eob > 1) {
1481     // Read all the other coefficients.
1482     // Lookup used to call the right variant of ReadCoeffBase*() based on the
1483     // transform class.
1484     static constexpr void (Tile::*kGetCoeffBaseFunc[])(
1485         const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
1486         int eob,
1487         uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
1488         uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
1489                                      [kCoeffBaseRangeSymbolCount + 1],
1490         ResidualType* quantized_buffer,
1491         uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
1492                                   &Tile::ReadCoeffBaseHorizontal<ResidualType>,
1493                                   &Tile::ReadCoeffBaseVertical<ResidualType>};
1494     (this->*kGetCoeffBaseFunc[tx_class])(
1495         scan, tx_size, adjusted_tx_width_log2, eob,
1496         symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
1497         coeff_base_range_cdf, residual, level_buffer);
1498   }
1499   const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
1500   const int current_quantizer_index =
1501       GetQIndex(frame_header_.segmentation,
1502                 bp.prediction_parameters->segment_id, current_quantizer_index_);
1503   const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
1504   const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
1505   const int shift = kQuantizationShift[tx_size];
1506   const uint8_t* const quantizer_matrix =
1507       (frame_header_.quantizer.use_matrix &&
1508        *tx_type < kTransformTypeIdentityIdentity &&
1509        !frame_header_.segmentation
1510             .lossless[bp.prediction_parameters->segment_id] &&
1511        frame_header_.quantizer.matrix_level[plane] < 15)
1512           ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
1513                              [plane_type][adjusted_tx_size]
1514                                  .get()
1515           : nullptr;
1516   int coefficient_level = 0;
1517   int8_t dc_category = 0;
1518   uint16_t* const dc_sign_cdf =
1519       (residual[0] != 0)
1520           ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
1521                 x4, y4, w4, h4, plane)]
1522           : nullptr;
1523   assert(scan[0] == 0);
1524   if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
1525           scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
1526           &dc_category, &coefficient_level, residual)) {
1527     return -1;
1528   }
1529   if (eob > 1) {
1530     int i = 1;
1531     do {
1532       if (!ReadSignAndApplyDequantization<ResidualType,
1533                                           /*is_dc_coefficient=*/false>(
1534               scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
1535               nullptr, &coefficient_level, residual)) {
1536         return -1;
1537       }
1538     } while (++i < eob);
1539     MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
1540   }
1541   SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
1542                      dc_category);
1543   if (split_parse_and_decode_) {
1544     *block.residual += tx_width * tx_height * residual_size_;
1545   }
1546   return eob;
1547 }
1548 
1549 // CALL_BITDEPTH_FUNCTION is a macro that calls the appropriate template
1550 // |function| depending on the value of |sequence_header_.color_config.bitdepth|
1551 // with the variadic arguments.
1552 #if LIBGAV1_MAX_BITDEPTH >= 10
1553 #define CALL_BITDEPTH_FUNCTION(function, ...)         \
1554   do {                                                \
1555     if (sequence_header_.color_config.bitdepth > 8) { \
1556       function<uint16_t>(__VA_ARGS__);                \
1557     } else {                                          \
1558       function<uint8_t>(__VA_ARGS__);                 \
1559     }                                                 \
1560   } while (false)
1561 #else
1562 #define CALL_BITDEPTH_FUNCTION(function, ...) \
1563   do {                                        \
1564     function<uint8_t>(__VA_ARGS__);           \
1565   } while (false)
1566 #endif
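// For example, the call CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane,
// start_x, start_y, x, y, tx_size) below expands to
// PalettePrediction<uint16_t>(...) when the bitdepth is greater than 8 and to
// PalettePrediction<uint8_t>(...) otherwise.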
1567 
1568 bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
1569                           int base_y, TransformSize tx_size, int x, int y,
1570                           ProcessingMode mode) {
1571   BlockParameters& bp = *block.bp;
1572   const int subsampling_x = subsampling_x_[plane];
1573   const int subsampling_y = subsampling_y_[plane];
1574   const int start_x = base_x + MultiplyBy4(x);
1575   const int start_y = base_y + MultiplyBy4(y);
1576   const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
1577   const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
1578   if (start_x >= max_x || start_y >= max_y) return true;
1579   const int row = DivideBy4(start_y << subsampling_y);
1580   const int column = DivideBy4(start_x << subsampling_x);
1581   const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
1582   const int sub_block_row4x4 = row & mask;
1583   const int sub_block_column4x4 = column & mask;
1584   const int step_x = kTransformWidth4x4[tx_size];
1585   const int step_y = kTransformHeight4x4[tx_size];
1586   const bool do_decode = mode == kProcessingModeDecodeOnly ||
1587                          mode == kProcessingModeParseAndDecode;
1588   if (do_decode && !bp.is_inter) {
1589     if (bp.prediction_parameters->palette_mode_info.size[GetPlaneType(plane)] >
1590         0) {
1591       CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
1592                              x, y, tx_size);
1593     } else {
1594       const PredictionMode mode =
1595           (plane == kPlaneY) ? bp.y_mode
1596                              : (bp.prediction_parameters->uv_mode ==
1597                                         kPredictionModeChromaFromLuma
1598                                     ? kPredictionModeDc
1599                                     : bp.prediction_parameters->uv_mode);
1600       const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
1601       const int tr_column4x4 =
1602           (sub_block_column4x4 >> subsampling_x) + step_x + 1;
1603       const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
1604       const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
1605       const bool has_left = x > 0 || block.left_available[plane];
1606       const bool has_top = y > 0 || block.top_available[plane];
1607 
1608       CALL_BITDEPTH_FUNCTION(
1609           IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
1610           block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
1611           block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
1612           mode, tx_size);
1613       if (plane != kPlaneY &&
1614           bp.prediction_parameters->uv_mode == kPredictionModeChromaFromLuma) {
1615         CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
1616                                start_y, tx_size);
1617       }
1618     }
1619     if (plane == kPlaneY) {
1620       block.bp->prediction_parameters->max_luma_width =
1621           start_x + MultiplyBy4(step_x);
1622       block.bp->prediction_parameters->max_luma_height =
1623           start_y + MultiplyBy4(step_y);
1624       block.scratch_buffer->cfl_luma_buffer_valid = false;
1625     }
1626   }
1627   if (!bp.skip) {
1628     const int sb_row_index = SuperBlockRowIndex(block.row4x4);
1629     const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
1630     if (mode == kProcessingModeDecodeOnly) {
1631       Queue<TransformParameters>& tx_params =
1632           *residual_buffer_threaded_[sb_row_index][sb_column_index]
1633                ->transform_parameters();
1634       ReconstructBlock(block, plane, start_x, start_y, tx_size,
1635                        tx_params.Front().type,
1636                        tx_params.Front().non_zero_coeff_count);
1637       tx_params.Pop();
1638     } else {
1639       TransformType tx_type;
1640       int non_zero_coeff_count;
1641 #if LIBGAV1_MAX_BITDEPTH >= 10
1642       if (sequence_header_.color_config.bitdepth > 8) {
1643         non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
1644             block, plane, start_x, start_y, tx_size, &tx_type);
1645       } else  // NOLINT
1646 #endif
1647       {
1648         non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
1649             block, plane, start_x, start_y, tx_size, &tx_type);
1650       }
1651       if (non_zero_coeff_count < 0) return false;
1652       if (mode == kProcessingModeParseAndDecode) {
1653         ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
1654                          non_zero_coeff_count);
1655       } else {
1656         assert(mode == kProcessingModeParseOnly);
1657         residual_buffer_threaded_[sb_row_index][sb_column_index]
1658             ->transform_parameters()
1659             ->Push(TransformParameters(tx_type, non_zero_coeff_count));
1660       }
1661     }
1662   }
1663   if (do_decode) {
1664     bool* block_decoded =
1665         &block.scratch_buffer
1666              ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
1667                             [(sub_block_column4x4 >> subsampling_x) + 1];
1668     SetBlockValues<bool>(step_y, step_x, true, block_decoded,
1669                          TileScratchBuffer::kBlockDecodedStride);
1670   }
1671   return true;
1672 }
1673 
1674 bool Tile::TransformTree(const Block& block, int start_x, int start_y,
1675                          BlockSize plane_size, ProcessingMode mode) {
1676   assert(plane_size <= kBlock64x64);
1677   // The branching factor is 4 and the maximum depth is 4, so the maximum stack
1678   // size required is (4 - 1) * 4 + 1 = 13.
1679   Stack<TransformTreeNode, 13> stack;
1680   // It is okay to cast BlockSize to TransformSize here since the two enums are
1681   // equivalent for all BlockSize values <= kBlock64x64.
1682   stack.Push(TransformTreeNode(start_x, start_y,
1683                                static_cast<TransformSize>(plane_size)));
1684 
1685   do {
1686     TransformTreeNode node = stack.Pop();
1687     const int row = DivideBy4(node.y);
1688     const int column = DivideBy4(node.x);
1689     if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
1690       continue;
1691     }
1692     const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
1693     const int width = kTransformWidth[node.tx_size];
1694     const int height = kTransformHeight[node.tx_size];
1695     if (width <= kTransformWidth[inter_tx_size] &&
1696         height <= kTransformHeight[inter_tx_size]) {
1697       if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
1698                           mode)) {
1699         return false;
1700       }
1701       continue;
1702     }
1703     // The split transform size lookup gives the right transform size that we
1704     // should push onto the stack.
1705     //   if (width > height) => transform size whose width is half.
1706     //   if (width < height) => transform size whose height is half.
1707     //   if (width == height) => transform size whose width and height are half.
1708     const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
1709     const int half_width = DivideBy2(width);
1710     if (width > height) {
1711       stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
1712       stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
1713       continue;
1714     }
1715     const int half_height = DivideBy2(height);
1716     if (width < height) {
1717       stack.Push(
1718           TransformTreeNode(node.x, node.y + half_height, split_tx_size));
1719       stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
1720       continue;
1721     }
1722     stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
1723                                  split_tx_size));
1724     stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
1725     stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
1726     stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
1727   } while (!stack.Empty());
1728   return true;
1729 }
1730 
1731 void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
1732                             int start_y, TransformSize tx_size,
1733                             TransformType tx_type, int non_zero_coeff_count) {
1734   // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
1735   assert(non_zero_coeff_count >= 0);
1736   if (non_zero_coeff_count == 0) return;
1737 #if LIBGAV1_MAX_BITDEPTH >= 10
1738   if (sequence_header_.color_config.bitdepth > 8) {
1739     Array2DView<uint16_t> buffer(
1740         buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
1741         reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
1742     Reconstruct(dsp_, tx_type, tx_size,
1743                 frame_header_.segmentation
1744                     .lossless[block.bp->prediction_parameters->segment_id],
1745                 reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
1746                 &buffer, non_zero_coeff_count);
1747   } else  // NOLINT
1748 #endif
1749   {
1750     Reconstruct(dsp_, tx_type, tx_size,
1751                 frame_header_.segmentation
1752                     .lossless[block.bp->prediction_parameters->segment_id],
1753                 reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
1754                 &buffer_[plane], non_zero_coeff_count);
1755   }
1756   if (split_parse_and_decode_) {
1757     *block.residual +=
1758         kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
1759   }
1760 }
1761 
1762 bool Tile::Residual(const Block& block, ProcessingMode mode) {
1763   const int width_chunks = std::max(1, block.width >> 6);
1764   const int height_chunks = std::max(1, block.height >> 6);
1765   const BlockSize size_chunk4x4 =
1766       (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
1767   const BlockParameters& bp = *block.bp;
1768   for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
1769     for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
1770       const int num_planes = block.HasChroma() ? PlaneCount() : 1;
1771       int plane = kPlaneY;
1772       do {
1773         const int subsampling_x = subsampling_x_[plane];
1774         const int subsampling_y = subsampling_y_[plane];
1775         // For the Y plane, when lossless is true the size stored in
1776         // |inter_transform_sizes_| is always kTransformSize4x4. So we can simply
1777         // use it here as the Y plane's transform size (part of Section 5.11.37).
1778         const TransformSize tx_size =
1779             (plane == kPlaneY)
1780                 ? inter_transform_sizes_[block.row4x4][block.column4x4]
1781                 : bp.uv_transform_size;
1782         const BlockSize plane_size =
1783             kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
1784         assert(plane_size != kBlockInvalid);
1785         if (bp.is_inter &&
1786             !frame_header_.segmentation
1787                  .lossless[bp.prediction_parameters->segment_id] &&
1788             plane == kPlaneY) {
1789           const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
1790           const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
1791           const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
1792           const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
1793           if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
1794             return false;
1795           }
1796         } else {
1797           const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
1798           const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
1799           const int step_x = kTransformWidth4x4[tx_size];
1800           const int step_y = kTransformHeight4x4[tx_size];
1801           const int num4x4_wide = kNum4x4BlocksWide[plane_size];
1802           const int num4x4_high = kNum4x4BlocksHigh[plane_size];
1803           for (int y = 0; y < num4x4_high; y += step_y) {
1804             for (int x = 0; x < num4x4_wide; x += step_x) {
1805               if (!TransformBlock(
1806                       block, static_cast<Plane>(plane), base_x, base_y, tx_size,
1807                       x + (MultiplyBy16(chunk_x) >> subsampling_x),
1808                       y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
1809                 return false;
1810               }
1811             }
1812           }
1813         }
1814       } while (++plane < num_planes);
1815     }
1816   }
1817   return true;
1818 }
1819 
1820 // The purpose of this function is to limit the maximum size of motion vectors
1821 // and, if use_intra_block_copy is true, to further constrain the motion vector
1822 // so that the data is fetched from parts of the tile that have already been
1823 // decoded and are not too close to the current block (in order to make a
1824 // pipelined decoder implementation feasible).
1825 bool Tile::IsMvValid(const Block& block, bool is_compound) const {
1826   const BlockParameters& bp = *block.bp;
1827   for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
1828     for (int mv_component : bp.mv.mv[i].mv) {
1829       if (std::abs(mv_component) >= (1 << 14)) {
1830         return false;
1831       }
1832     }
1833   }
1834   if (!block.bp->prediction_parameters->use_intra_block_copy) {
1835     return true;
1836   }
1837   if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
1838     return false;
1839   }
1840   const int delta_row = bp.mv.mv[0].mv[0] >> 3;
1841   const int delta_column = bp.mv.mv[0].mv[1] >> 3;
1842   int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
1843   int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
1844   const int src_bottom_edge = src_top_edge + block.height;
1845   const int src_right_edge = src_left_edge + block.width;
1846   if (block.HasChroma()) {
1847     if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
1848       src_left_edge -= 4;
1849     }
1850     if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
1851       src_top_edge -= 4;
1852     }
1853   }
1854   if (src_top_edge < MultiplyBy4(row4x4_start_) ||
1855       src_left_edge < MultiplyBy4(column4x4_start_) ||
1856       src_bottom_edge > MultiplyBy4(row4x4_end_) ||
1857       src_right_edge > MultiplyBy4(column4x4_end_)) {
1858     return false;
1859   }
1860   // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
1861   const int sb_height_log2 =
1862       6 + static_cast<int>(sequence_header_.use_128x128_superblock);
1863   const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
1864   const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
1865   const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
1866   const int src_64x64_block_column = (src_right_edge - 1) >> 6;
1867   const int total_64x64_blocks_per_row =
1868       ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
1869   const int active_64x64_block =
1870       active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
1871   const int src_64x64_block =
1872       src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
1873   if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
1874     return false;
1875   }
1876 
1877   // Wavefront constraint: use only top left area of frame for reference.
1878   if (src_sb_row > active_sb_row) return false;
1879   const int gradient =
1880       1 + kIntraBlockCopyDelay64x64Blocks +
1881       static_cast<int>(sequence_header_.use_128x128_superblock);
1882   const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
1883   return src_64x64_block_column < active_64x64_block_column -
1884                                       kIntraBlockCopyDelay64x64Blocks +
1885                                       wavefront_offset;
1886 }
1887 
1888 bool Tile::AssignInterMv(const Block& block, bool is_compound) {
1889   int min[2];
1890   int max[2];
1891   GetClampParameters(block, min, max);
1892   BlockParameters& bp = *block.bp;
1893   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
1894   bp.mv.mv64 = 0;
1895   if (is_compound) {
1896     for (int i = 0; i < 2; ++i) {
1897       const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
1898       MotionVector predicted_mv;
1899       if (mode == kPredictionModeGlobalMv) {
1900         predicted_mv = prediction_parameters.global_mv[i];
1901       } else {
1902         const int ref_mv_index = (mode == kPredictionModeNearestMv ||
1903                                   (mode == kPredictionModeNewMv &&
1904                                    prediction_parameters.ref_mv_count <= 1))
1905                                      ? 0
1906                                      : prediction_parameters.ref_mv_index;
1907         predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
1908         if (ref_mv_index < prediction_parameters.ref_mv_count) {
1909           predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
1910           predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
1911         }
1912       }
1913       if (mode == kPredictionModeNewMv) {
1914         ReadMotionVector(block, i);
1915         bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
1916         bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
1917       } else {
1918         bp.mv.mv[i] = predicted_mv;
1919       }
1920     }
1921   } else {
1922     const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
1923     MotionVector predicted_mv;
1924     if (mode == kPredictionModeGlobalMv) {
1925       predicted_mv = prediction_parameters.global_mv[0];
1926     } else {
1927       const int ref_mv_index = (mode == kPredictionModeNearestMv ||
1928                                 (mode == kPredictionModeNewMv &&
1929                                  prediction_parameters.ref_mv_count <= 1))
1930                                    ? 0
1931                                    : prediction_parameters.ref_mv_index;
1932       predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
1933       if (ref_mv_index < prediction_parameters.ref_mv_count) {
1934         predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
1935         predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
1936       }
1937     }
1938     if (mode == kPredictionModeNewMv) {
1939       ReadMotionVector(block, 0);
1940       bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
1941       bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
1942     } else {
1943       bp.mv.mv[0] = predicted_mv;
1944     }
1945   }
1946   return IsMvValid(block, is_compound);
1947 }
1948 
1949 bool Tile::AssignIntraMv(const Block& block) {
1950   // TODO(linfengz): Check if the clamping process is necessary.
1951   int min[2];
1952   int max[2];
1953   GetClampParameters(block, min, max);
1954   BlockParameters& bp = *block.bp;
1955   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
1956   const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
1957   bp.mv.mv64 = 0;
1958   ReadMotionVector(block, 0);
1959   if (ref_mv_0.mv32 == 0) {
1960     const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
1961     if (ref_mv_1.mv32 == 0) {
1962       const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
1963       if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
1964         bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
1965         bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
1966       } else {
1967         bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
1968       }
1969     } else {
1970       bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
1971       bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[1], max[1]);
1972     }
1973   } else {
1974     bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
1975     bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
1976   }
1977   return IsMvValid(block, /*is_compound=*/false);
1978 }
1979 
1980 void Tile::ResetEntropyContext(const Block& block) {
1981   const int num_planes = block.HasChroma() ? PlaneCount() : 1;
1982   int plane = kPlaneY;
1983   do {
1984     const int subsampling_x = subsampling_x_[plane];
1985     const int start_x = block.column4x4 >> subsampling_x;
1986     const int end_x =
1987         std::min((block.column4x4 + block.width4x4) >> subsampling_x,
1988                  frame_header_.columns4x4);
1989     memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
1990            end_x - start_x);
1991     memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
1992            end_x - start_x);
1993     const int subsampling_y = subsampling_y_[plane];
1994     const int start_y = block.row4x4 >> subsampling_y;
1995     const int end_y =
1996         std::min((block.row4x4 + block.height4x4) >> subsampling_y,
1997                  frame_header_.rows4x4);
1998     memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
1999            end_y - start_y);
2000     memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
2001            end_y - start_y);
2002   } while (++plane < num_planes);
2003 }
2004 
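// Computes the prediction for an inter block: the inter prediction for each
// plane and, for inter-intra blocks (reference_frame[1] == kReferenceFrameIntra),
// the intra portion as well. Intra-only blocks return true immediately since
// their prediction is computed in TransformBlock().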
2005 bool Tile::ComputePrediction(const Block& block) {
2006   const BlockParameters& bp = *block.bp;
2007   if (!bp.is_inter) return true;
2008   const int mask =
2009       (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
2010       1;
2011   const int sub_block_row4x4 = block.row4x4 & mask;
2012   const int sub_block_column4x4 = block.column4x4 & mask;
2013   const int plane_count = block.HasChroma() ? PlaneCount() : 1;
2014   // is_local_valid is true if this block applies local warping. The state is
2015   // determined in the Y plane and carried over for use in the U/V planes.
2016   // However, the U/V planes will not apply warping when the block size is
2017   // smaller than 8x8, even if this variable is true.
2018   bool is_local_valid = false;
2019   // Local warping parameters, similar usage as is_local_valid.
2020   GlobalMotion local_warp_params;
2021   int plane = kPlaneY;
2022   do {
2023     const int8_t subsampling_x = subsampling_x_[plane];
2024     const int8_t subsampling_y = subsampling_y_[plane];
2025     const BlockSize plane_size = block.residual_size[plane];
2026     const int block_width4x4 = kNum4x4BlocksWide[plane_size];
2027     const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
2028     const int block_width = MultiplyBy4(block_width4x4);
2029     const int block_height = MultiplyBy4(block_height4x4);
2030     const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
2031     const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
2032     if (bp.reference_frame[1] == kReferenceFrameIntra) {
2033       const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
2034       const int tr_column4x4 =
2035           (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
2036       const int bl_row4x4 =
2037           (sub_block_row4x4 >> subsampling_y) + block_height4x4;
2038       const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
2039       const TransformSize tx_size =
2040           k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
2041                                  [k4x4HeightLog2[plane_size]];
2042       const bool has_left = block.left_available[plane];
2043       const bool has_top = block.top_available[plane];
2044       CALL_BITDEPTH_FUNCTION(
2045           IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
2046           has_left, has_top,
2047           block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
2048           block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
2049           kInterIntraToIntraMode[block.bp->prediction_parameters
2050                                      ->inter_intra_mode],
2051           tx_size);
2052     }
2053     int candidate_row = block.row4x4;
2054     int candidate_column = block.column4x4;
2055     bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
2056     if (!some_use_intra && plane != 0) {
2057       candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
2058       candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
2059       if (candidate_row != block.row4x4) {
2060         // Top block.
2061         const BlockParameters& bp_top =
2062             *block_parameters_holder_.Find(candidate_row, block.column4x4);
2063         some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
2064         if (!some_use_intra && candidate_column != block.column4x4) {
2065           // Top-left block.
2066           const BlockParameters& bp_top_left =
2067               *block_parameters_holder_.Find(candidate_row, candidate_column);
2068           some_use_intra =
2069               bp_top_left.reference_frame[0] == kReferenceFrameIntra;
2070         }
2071       }
2072       if (!some_use_intra && candidate_column != block.column4x4) {
2073         // Left block.
2074         const BlockParameters& bp_left =
2075             *block_parameters_holder_.Find(block.row4x4, candidate_column);
2076         some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
2077       }
2078     }
2079     int prediction_width;
2080     int prediction_height;
2081     if (some_use_intra) {
2082       candidate_row = block.row4x4;
2083       candidate_column = block.column4x4;
2084       prediction_width = block_width;
2085       prediction_height = block_height;
2086     } else {
2087       prediction_width = block.width >> subsampling_x;
2088       prediction_height = block.height >> subsampling_y;
2089     }
2090     int r = 0;
2091     int y = 0;
2092     do {
2093       int c = 0;
2094       int x = 0;
2095       do {
2096         if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
2097                              base_y + y, prediction_width, prediction_height,
2098                              candidate_row + r, candidate_column + c,
2099                              &is_local_valid, &local_warp_params)) {
2100           return false;
2101         }
2102         ++c;
2103         x += prediction_width;
2104       } while (x < block_width);
2105       ++r;
2106       y += prediction_height;
2107     } while (y < block_height);
2108   } while (++plane < plane_count);
2109   return true;
2110 }
2111 
2112 #undef CALL_BITDEPTH_FUNCTION
2113 
2114 void Tile::PopulateDeblockFilterLevel(const Block& block) {
2115   if (!post_filter_.DoDeblock()) return;
2116   BlockParameters& bp = *block.bp;
2117   const int mode_id =
2118       static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
2119   for (int i = 0; i < kFrameLfCount; ++i) {
2120     if (delta_lf_all_zero_) {
2121       bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
2122           bp.prediction_parameters->segment_id, i, bp.reference_frame[0],
2123           mode_id);
2124     } else {
2125       bp.deblock_filter_level[i] =
2126           deblock_filter_levels_[bp.prediction_parameters->segment_id][i]
2127                                 [bp.reference_frame[0]][mode_id];
2128     }
2129   }
2130 }
2131 
2132 void Tile::PopulateCdefSkip(const Block& block) {
2133   if (!post_filter_.DoCdef() || block.bp->skip ||
2134       (frame_header_.cdef.bits > 0 &&
2135        cdef_index_[DivideBy16(block.row4x4)][DivideBy16(block.column4x4)] ==
2136            -1)) {
2137     return;
2138   }
2139   // The rest of this function is an efficient version of the following code:
2140   // for (int y = block.row4x4; y < block.row4x4 + block.height4x4; y++) {
2141   //   for (int x = block.column4x4; x < block.column4x4 + block.width4x4;
2142   //        x++) {
2143   //     const uint8_t mask = uint8_t{1} << ((x >> 1) & 0x7);
2144   //     cdef_skip_[y >> 1][x >> 4] |= mask;
2145   //   }
2146   // }
2147 
2148   // For all block widths other than 32, the mask will fit in uint8_t. For
2149   // block width == 32, the mask is always 0xFFFF.
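  // For example: a 16x16 block (block.width4x4 == 4) starting at
  // block.column4x4 == 12 gives bw4 == 2 and mask == (0xFF >> 6) << 6 == 0xC0,
  // which sets bits 6 and 7, the bits covering columns4x4 12 through 15 of
  // this byte.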
2150   const int bw4 =
2151       std::max(DivideBy2(block.width4x4) + (block.column4x4 & 1), 1);
2152   const uint8_t mask = (block.width4x4 == 32)
2153                            ? 0xFF
2154                            : (uint8_t{0xFF} >> (8 - bw4))
2155                                  << (DivideBy2(block.column4x4) & 0x7);
2156   uint8_t* cdef_skip = &cdef_skip_[block.row4x4 >> 1][block.column4x4 >> 4];
2157   const int stride = cdef_skip_.columns();
2158   int row = 0;
2159   do {
2160     *cdef_skip |= mask;
2161     if (block.width4x4 == 32) {
2162       *(cdef_skip + 1) = 0xFF;
2163     }
2164     cdef_skip += stride;
2165     row += 2;
2166   } while (row < block.height4x4);
2167 }
2168 
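// Parses the mode info, palette tokens, transform sizes and residuals of one
// block and, when parsing and decoding are not split, also computes its
// prediction and reconstruction. Returns false on failure.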
2169 bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
2170                         TileScratchBuffer* const scratch_buffer,
2171                         ResidualPtr* residual) {
2172   // Do not process the block if the starting point is beyond the visible frame.
2173   // This is equivalent to the has_row/has_column check in the
2174   // decode_partition() section of the spec when partition equals
2175   // kPartitionHorizontal or kPartitionVertical.
2176   if (row4x4 >= frame_header_.rows4x4 ||
2177       column4x4 >= frame_header_.columns4x4) {
2178     return true;
2179   }
2180 
2181   if (split_parse_and_decode_) {
2182     // Push block ordering info to the queue. DecodeBlock() will use this queue
2183     // to decode the blocks in the correct order.
2184     const int sb_row_index = SuperBlockRowIndex(row4x4);
2185     const int sb_column_index = SuperBlockColumnIndex(column4x4);
2186     residual_buffer_threaded_[sb_row_index][sb_column_index]
2187         ->partition_tree_order()
2188         ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
2189   }
2190 
2191   BlockParameters* bp_ptr =
2192       block_parameters_holder_.Get(row4x4, column4x4, block_size);
2193   if (bp_ptr == nullptr) {
2194     LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
2195     return false;
2196   }
2197   BlockParameters& bp = *bp_ptr;
2198   Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
2199   bp.size = block_size;
2200   bp.prediction_parameters =
2201       split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
2202                                     new (std::nothrow) PredictionParameters())
2203                               : std::move(prediction_parameters_);
2204   if (bp.prediction_parameters == nullptr) return false;
2205   if (!DecodeModeInfo(block)) return false;
2206   PopulateDeblockFilterLevel(block);
2207   if (!ReadPaletteTokens(block)) return false;
2208   DecodeTransformSize(block);
2209   // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
2210   bp.uv_transform_size =
2211       frame_header_.segmentation.lossless[bp.prediction_parameters->segment_id]
2212           ? kTransformSize4x4
2213           : kUVTransformSize[block.residual_size[kPlaneU]];
2214   if (bp.skip) ResetEntropyContext(block);
2215   PopulateCdefSkip(block);
2216   if (split_parse_and_decode_) {
2217     if (!Residual(block, kProcessingModeParseOnly)) return false;
2218   } else {
2219     if (!ComputePrediction(block) ||
2220         !Residual(block, kProcessingModeParseAndDecode)) {
2221       return false;
2222     }
2223   }
2224   // If frame_header_.segmentation.enabled is false,
2225   // bp.prediction_parameters->segment_id is 0 for all blocks. We don't need to
2226   // save bp.prediction_parameters->segment_id in the current frame because
2227   // the current frame's segmentation map will be cleared to all 0s.
2228   //
2229   // If frame_header_.segmentation.enabled is true and
2230   // frame_header_.segmentation.update_map is false, we will copy the previous
2231   // frame's segmentation map to the current frame. So we don't need to call
2232   // save bp.prediction_parameters->segment_id in the current frame.
  if (frame_header_.segmentation.enabled &&
      frame_header_.segmentation.update_map) {
    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
                                 static_cast<int>(block.width4x4));
    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
                                 static_cast<int>(block.height4x4));
    current_frame_.segmentation_map()->FillBlock(
        row4x4, column4x4, x_limit, y_limit,
        bp.prediction_parameters->segment_id);
  }
  StoreMotionFieldMvsIntoCurrentFrame(block);
  if (!split_parse_and_decode_) {
    prediction_parameters_ = std::move(bp.prediction_parameters);
  }
  return true;
}

bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
                       TileScratchBuffer* const scratch_buffer,
                       ResidualPtr* residual) {
  if (row4x4 >= frame_header_.rows4x4 ||
      column4x4 >= frame_header_.columns4x4) {
    return true;
  }
  Block block(this, block_size, row4x4, column4x4, scratch_buffer, residual);
  if (!ComputePrediction(block) ||
      !Residual(block, kProcessingModeDecodeOnly)) {
    return false;
  }
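  // The block has been decoded; its per-block PredictionParameters (allocated
  // in ProcessBlock() for the split parse/decode path) are no longer needed,
  // so release them here.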
  block.bp->prediction_parameters.reset(nullptr);
  return true;
}

bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
                            TileScratchBuffer* const scratch_buffer,
                            ResidualPtr* residual) {
  Stack<PartitionTreeNode, kDfsStackSize> stack;

  // Set up the first iteration.
  stack.Push(
      PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));

  // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
  // Otherwise, the children are pushed into the stack for future processing.
  do {
    PartitionTreeNode node = stack.Pop();
    int row4x4 = node.row4x4;
    int column4x4 = node.column4x4;
    BlockSize block_size = node.block_size;

    if (row4x4 >= frame_header_.rows4x4 ||
        column4x4 >= frame_header_.columns4x4) {
      continue;
    }
    const int block_width4x4 = kNum4x4BlocksWide[block_size];
    assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
    const int half_block4x4 = block_width4x4 >> 1;
    const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
    const bool has_columns =
        (column4x4 + half_block4x4) < frame_header_.columns4x4;
    Partition partition;
    if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
                       &partition)) {
      LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
                   row4x4, column4x4);
      return false;
    }
    const BlockSize sub_size = kSubSize[partition][block_size];
    // Section 6.10.4: It is a requirement of bitstream conformance that
    // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
    // every time subSize is computed.
    if (sub_size == kBlockInvalid ||
        kPlaneResidualSize[sub_size]
                          [sequence_header_.color_config.subsampling_x]
                          [sequence_header_.color_config.subsampling_y] ==
            kBlockInvalid) {
      LIBGAV1_DLOG(
          ERROR,
          "Invalid sub-block/plane size for row: %d column: %d partition: "
          "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
          row4x4, column4x4, partition, block_size, sub_size,
          sequence_header_.color_config.subsampling_x,
          sequence_header_.color_config.subsampling_y);
      return false;
    }

    const int quarter_block4x4 = half_block4x4 >> 1;
    const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
    assert(partition == kPartitionNone || sub_size != kBlockInvalid);
    switch (partition) {
      case kPartitionNone:
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual)) {
          return false;
        }
        break;
      case kPartitionSplit:
        // The children must be added in reverse order since a stack is being
        // used.
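        // Pushing bottom-right, bottom-left, top-right and top-left means the
        // nodes are popped (and processed) in top-left, top-right, bottom-left
        // and bottom-right order, matching the recursion order of the spec's
        // decode_partition().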
        stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
                                     column4x4 + half_block4x4, sub_size));
        stack.Push(
            PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
        stack.Push(
            PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
        stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
        break;
      case kPartitionHorizontal:
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVertical:
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontalWithTopSplit:
        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontalWithBottomSplit:
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
                          split_size, scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVerticalWithLeftSplit:
        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVerticalWithRightSplit:
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
                          split_size, scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontal4:
        for (int i = 0; i < 4; ++i) {
          if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
                            scratch_buffer, residual)) {
            return false;
          }
        }
        break;
      case kPartitionVertical4:
        for (int i = 0; i < 4; ++i) {
          if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
                            scratch_buffer, residual)) {
            return false;
          }
        }
        break;
    }
  } while (!stack.Empty());
  return true;
}

void Tile::ResetLoopRestorationParams() {
  for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
    for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
      reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
          kSgrProjDefaultMultiplier[i];
      for (int j = 0; j < kNumWienerCoefficients; ++j) {
        reference_unit_info_[plane].wiener_info.filter[i][j] =
            kWienerDefaultFilter[j];
      }
    }
  }
}

void Tile::ResetCdef(const int row4x4, const int column4x4) {
  if (frame_header_.cdef.bits == 0) return;
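  // The CDEF index is signaled at a 64x64 granularity (16 4x4 units), hence
  // DivideBy16(). A value of -1 marks the index as not yet read.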
  const int row = DivideBy16(row4x4);
  const int column = DivideBy16(column4x4);
  cdef_index_[row][column] = -1;
  if (sequence_header_.use_128x128_superblock) {
    const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
    const int border_row = DivideBy16(row4x4 + cdef_size4x4);
    const int border_column = DivideBy16(column4x4 + cdef_size4x4);
    cdef_index_[row][border_column] = -1;
    cdef_index_[border_row][column] = -1;
    cdef_index_[border_row][border_column] = -1;
  }
}

void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
                             int row4x4, int column4x4) {
  // Set everything to false.
  memset(scratch_buffer->block_decoded, 0,
         sizeof(scratch_buffer->block_decoded));
  // Set specific edge cases to true.
  const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
    const int subsampling_x = subsampling_x_[plane];
    const int subsampling_y = subsampling_y_[plane];
    const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
    const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
    // The memset is equivalent to the following lines in the spec:
    // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
    //   if ( y < 0 && x < sbWidth4 ) {
    //     BlockDecoded[plane][y][x] = 1
    //   }
    // }
    const int num_elements =
        std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
    memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
    // The for loop is equivalent to the following lines in the spec:
    // for ( y = -1; y <= ( sbSize4 >> subY ); y++ ) {
    //   if ( x < 0 && y < sbHeight4 ) {
    //     BlockDecoded[plane][y][x] = 1
    //   }
    // }
    // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
    for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
         ++y) {
      scratch_buffer->block_decoded[plane][y + 1][0] = true;
    }
  }
}

bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
                             TileScratchBuffer* const scratch_buffer,
                             ProcessingMode mode) {
  const bool parsing =
      mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
  const bool decoding = mode == kProcessingModeDecodeOnly ||
                        mode == kProcessingModeParseAndDecode;
  if (parsing) {
    read_deltas_ = frame_header_.delta_q.present;
    ResetCdef(row4x4, column4x4);
  }
  if (decoding) {
    ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
  }
  const BlockSize block_size = SuperBlockSize();
  if (parsing) {
    ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
  }
  if (parsing && decoding) {
    uint8_t* residual_buffer = residual_buffer_.get();
    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
                          &residual_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
                   column4x4);
      return false;
    }
    return true;
  }
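  // Split parse/decode: the parsing pass acquires a residual buffer from the
  // pool and fills it; the decoding pass consumes that buffer and returns it
  // to the pool.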
  const int sb_row_index = SuperBlockRowIndex(row4x4);
  const int sb_column_index = SuperBlockColumnIndex(column4x4);
  if (parsing) {
    residual_buffer_threaded_[sb_row_index][sb_column_index] =
        residual_buffer_pool_->Get();
    if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
      LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
      return false;
    }
    uint8_t* residual_buffer =
        residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
                          &residual_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
                   column4x4);
      return false;
    }
  } else {
    if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
                   row4x4, column4x4);
      return false;
    }
    residual_buffer_pool_->Release(
        std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
  }
  return true;
}

bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
                            TileScratchBuffer* const scratch_buffer) {
  uint8_t* residual_buffer =
      residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
  Queue<PartitionTreeNode>& partition_tree_order =
      *residual_buffer_threaded_[sb_row_index][sb_column_index]
           ->partition_tree_order();
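  // Replay the blocks in the order recorded by ProcessBlock() during the
  // parsing pass.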
  while (!partition_tree_order.Empty()) {
    PartitionTreeNode block = partition_tree_order.Front();
    if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
                     scratch_buffer, &residual_buffer)) {
      LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
                   block.row4x4, block.column4x4);
      return false;
    }
    partition_tree_order.Pop();
  }
  return true;
}

void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
                                           BlockSize block_size) {
  if (frame_header_.allow_intrabc) return;
  LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
  const bool is_superres_scaled =
      frame_header_.width != frame_header_.upscaled_width;
  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
    LoopRestorationUnitInfo unit_info;
    if (restoration_info->PopulateUnitInfoForSuperBlock(
            static_cast<Plane>(plane), block_size, is_superres_scaled,
            frame_header_.superres_scale_denominator, row4x4, column4x4,
            &unit_info)) {
      for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
           ++unit_row) {
        for (int unit_column = unit_info.column_start;
             unit_column < unit_info.column_end; ++unit_column) {
          const int unit_id = unit_row * restoration_info->num_horizontal_units(
                                             static_cast<Plane>(plane)) +
                              unit_column;
          restoration_info->ReadUnitCoefficients(
              &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
              unit_id, &reference_unit_info_);
        }
      }
    }
  }
}

void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
  if (frame_header_.refresh_frame_flags == 0 ||
      IsIntraFrame(frame_header_.frame_type)) {
    return;
  }
  // Iterate over odd rows/columns beginning at the first odd row/column for
  // the block. It is done this way because motion field mvs are only needed at
  // an 8x8 granularity.
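  // For example, block.row4x4 == 4 gives row_start4x4 = 5 (4 | 1), and
  // DivideBy2(5) == 2 is the 8x8 row that covers 4x4 rows 4 and 5.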
  const int row_start4x4 = block.row4x4 | 1;
  const int row_limit4x4 =
      std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
  if (row_start4x4 >= row_limit4x4) return;
  const int column_start4x4 = block.column4x4 | 1;
  const int column_limit4x4 =
      std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
  if (column_start4x4 >= column_limit4x4) return;

  // The largest reference MV component that can be saved.
  constexpr int kRefMvsLimit = (1 << 12) - 1;
  const BlockParameters& bp = *block.bp;
  ReferenceInfo* reference_info = current_frame_.reference_info();
  for (int i = 1; i >= 0; --i) {
    const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
    // Must make a local copy so that StoreMotionFieldMvs() knows there is no
    // overlap between load and store.
    const MotionVector mv_to_store = bp.mv.mv[i];
    const int mv_row = std::abs(mv_to_store.mv[0]);
    const int mv_column = std::abs(mv_to_store.mv[1]);
    if (reference_frame_to_store > kReferenceFrameIntra &&
        // kRefMvsLimit equals 0x0FFF, so we can first bitwise OR the two
        // absolute values and then compare with kRefMvsLimit to save a branch.
        // The next line is equivalent to:
        // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
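        // (This holds because kRefMvsLimit is of the form 2^n - 1: a component
        // larger than the limit sets a bit above bit 11, and that bit survives
        // the bitwise OR.)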
        (mv_row | mv_column) <= kRefMvsLimit &&
        reference_info->relative_distance_from[reference_frame_to_store] < 0) {
      const int row_start8x8 = DivideBy2(row_start4x4);
      const int row_limit8x8 = DivideBy2(row_limit4x4);
      const int column_start8x8 = DivideBy2(column_start4x4);
      const int column_limit8x8 = DivideBy2(column_limit4x4);
      const int rows = row_limit8x8 - row_start8x8;
      const int columns = column_limit8x8 - column_start8x8;
      const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
      ReferenceFrameType* const reference_frame_row_start =
          &reference_info
               ->motion_field_reference_frame[row_start8x8][column_start8x8];
      MotionVector* const mv =
          &reference_info->motion_field_mv[row_start8x8][column_start8x8];

      // Specialize columns cases 1, 2, 4, 8 and 16. This allows memset() to be
      // inlined and simplifies std::fill() for these cases.
      if (columns <= 1) {
        // Don't change the above condition to (columns == 1).
        // Condition (columns <= 1) may help the compiler simplify the inlining
        // of the general case of StoreMotionFieldMvs() by eliminating the
        // (columns == 0) case.
        assert(columns == 1);
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            1, reference_frame_row_start, mv);
      } else if (columns == 2) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            2, reference_frame_row_start, mv);
      } else if (columns == 4) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            4, reference_frame_row_start, mv);
      } else if (columns == 8) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            8, reference_frame_row_start, mv);
      } else if (columns == 16) {
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            16, reference_frame_row_start, mv);
      } else if (columns < 16) {
        // This always-true condition (columns < 16) may help the compiler
        // simplify the inlining of the following function.
        // This general case is rare and usually only happens for blocks that
        // contain the right boundary of the frame.
        StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
                            columns, reference_frame_row_start, mv);
      } else {
        assert(false);
      }
      return;
    }
  }
}

}  // namespace libgav1