1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 #include "src/post_filter.h"
15 #include "src/utils/blocking_counter.h"
16 #include "src/utils/compiler_attributes.h"
17 #include "src/utils/constants.h"
18 
19 namespace libgav1 {
20 namespace {
21 
// Width/height of a 64x64 unit expressed in 4x4 block units. Used as the
// row/column step when walking the frame one 64x64 unit at a time.
constexpr int kStep64x64 = 16;  // =64/4.
// Value stored in the per-8x8-block luma direction array to mark a block that
// is not filtered (bit 3). The chroma pass tests this bit to decide whether to
// skip the corresponding chroma block.
constexpr int kCdefSkip = 8;

// Chroma direction lookup, indexed as
// [subsampling_x][subsampling_y][luma direction].
constexpr uint8_t kCdefUvDirection[2][2][8] = {
    {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
    {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};

// Row offsets, relative to the first plane row of a 64x64 unit, of the rows
// that SetupCdefBorder() backs up into |cdef_border_|. Indexed as
// [subsampling_y][i]: the first two and the last two rows of a 64-row span
// (32-row span when the plane is vertically subsampled).
constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
30 
31 template <typename Pixel>
CopyRowForCdef(const Pixel * src,int block_width,int unit_width,bool is_frame_left,bool is_frame_right,uint16_t * const dst,const Pixel * left_border=nullptr)32 void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
33                     bool is_frame_left, bool is_frame_right,
34                     uint16_t* const dst, const Pixel* left_border = nullptr) {
35   if (sizeof(src[0]) == sizeof(dst[0])) {
36     if (is_frame_left) {
37       Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
38     } else if (left_border == nullptr) {
39       memcpy(dst - kCdefBorder, src - kCdefBorder,
40              kCdefBorder * sizeof(dst[0]));
41     } else {
42       memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
43     }
44     memcpy(dst, src, block_width * sizeof(dst[0]));
45     if (is_frame_right) {
46       Memset(dst + block_width, kCdefLargeValue,
47              unit_width + kCdefBorder - block_width);
48     } else {
49       memcpy(dst + block_width, src + block_width,
50              (unit_width + kCdefBorder - block_width) * sizeof(dst[0]));
51     }
52     return;
53   }
54   if (is_frame_left) {
55     for (int x = -kCdefBorder; x < 0; ++x) {
56       dst[x] = static_cast<uint16_t>(kCdefLargeValue);
57     }
58   } else if (left_border == nullptr) {
59     for (int x = -kCdefBorder; x < 0; ++x) {
60       dst[x] = src[x];
61     }
62   } else {
63     for (int x = -kCdefBorder; x < 0; ++x) {
64       dst[x] = left_border[x + kCdefBorder];
65     }
66   }
67   for (int x = 0; x < block_width; ++x) {
68     dst[x] = src[x];
69   }
70   for (int x = block_width; x < unit_width + kCdefBorder; ++x) {
71     dst[x] = is_frame_right ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
72   }
73 }
74 
// For |height| rows, copy |width| pixels of size |pixel_size| from |src| to
// |dst|. |src_stride| and |dst_stride| are the distances in bytes between the
// starts of consecutive rows of |src| and |dst| respectively.
//
// Nothing is copied when |width| or |height| is not positive. (A count-down
// loop that runs "at least once" would otherwise walk far past the buffers for
// |height| <= 0, and a negative |width| would wrap to a huge byte count.)
void CopyPixels(const uint8_t* src, int src_stride, uint8_t* dst,
                int dst_stride, int width, int height, size_t pixel_size) {
  if (width <= 0 || height <= 0) return;
  // Loop-invariant number of bytes in one row.
  const size_t row_bytes = static_cast<size_t>(width) * pixel_size;
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, row_bytes);
    src += src_stride;
    dst += dst_stride;
  }
}
86 
87 }  // namespace
88 
SetupCdefBorder(int row4x4)89 void PostFilter::SetupCdefBorder(int row4x4) {
90   assert(row4x4 >= 0);
91   assert(DoCdef());
92   int plane = kPlaneY;
93   do {
94     const ptrdiff_t src_stride = frame_buffer_.stride(plane);
95     const ptrdiff_t dst_stride = cdef_border_.stride(plane);
96     const int row_offset = DivideBy4(row4x4);
97     const int num_pixels = SubsampledValue(
98         MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
99     const int row_width = num_pixels << pixel_size_log2_;
100     const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
101                                              subsampling_y_[plane]);
102     for (int i = 0; i < 4; ++i) {
103       const int row = kCdefBorderRows[subsampling_y_[plane]][i];
104       const int absolute_row =
105           (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
106       if (absolute_row >= plane_height) break;
107       const uint8_t* src =
108           GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
109           row * src_stride;
110       uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
111       memcpy(dst, src, row_width);
112     }
113   } while (++plane < planes_);
114 }
115 
// Populates the 16-bit cdef working buffer |cdef_source| (row stride
// |cdef_stride|, in uint16_t elements) for the unit of
// |block_width4x4| x |block_height4x4| 4x4 blocks whose top-left corner is at
// (|row4x4|, |column4x4|), including the surrounding border pixels.
//
// If |y_plane| is true only the luma plane is prepared. Otherwise both chroma
// planes are prepared; the V plane is stored
// kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders elements after the U
// plane in |cdef_source|.
//
// Border pixels that lie outside the frame are set to kCdefLargeValue. When
// |thread_pool_| is non-null, rows that belong to the top/bottom borders are
// read from |cdef_border_| instead of the source buffer and, if
// |use_border_columns| is true, the left border columns are read from
// |border_columns| (kCdefBorder pixels per row, rows stored back to back).
template <typename Pixel>
void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
                                  int row4x4, int column4x4,
                                  uint16_t* cdef_source, ptrdiff_t cdef_stride,
                                  const bool y_plane,
                                  const uint8_t border_columns[kMaxPlanes][256],
                                  bool use_border_columns) {
  assert(y_plane || planes_ == kMaxPlanes);
  const int max_planes = y_plane ? 1 : kMaxPlanes;
  const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
  const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
  const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
  const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
  const int plane_width = SubsampledValue(frame_header_.width, subsampling_x);
  const int plane_height = SubsampledValue(frame_header_.height, subsampling_y);
  const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
  const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
  // unit_width, unit_height are the same as block_width, block_height unless
  // it reaches the frame boundary, where block_width < 64 or
  // block_height < 64. unit_width, unit_height guarantee we build blocks on
  // a multiple of 8.
  const int unit_width = Align(block_width, 8 >> subsampling_x);
  const int unit_height = Align(block_height, 8 >> subsampling_y);
  const bool is_frame_left = column4x4 == 0;
  const bool is_frame_right = start_x + block_width >= plane_width;
  const bool is_frame_top = row4x4 == 0;
  const bool is_frame_bottom = start_y + block_height >= plane_height;
  // Unless at the top of the frame, |src_buffer| starts kCdefBorder rows above
  // the block so that the top border rows can be read from it.
  const int y_offset = is_frame_top ? 0 : kCdefBorder;
  const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);

  for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
    uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
                                           kCdefUnitSizeWithBorders *
                                           kCdefUnitSizeWithBorders;
    const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
    const Pixel* src_buffer =
        reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
        (start_y - y_offset) * src_stride + start_x;
    const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
    // NOTE(review): when |thread_pool_| is null, |cdef_border| is null. It is
    // still advanced in the top-border loop below, but it is never
    // dereferenced in that mode.
    const Pixel* cdef_border =
        (thread_pool_ == nullptr)
            ? nullptr
            : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
                  cdef_border_row_offset * cdef_border_stride + start_x;

    // All the copying code will use negative indices for populating the left
    // border. So the starting point is set to kCdefBorder.
    cdef_src += kCdefBorder;

    // Copy the top 2 rows as follows;
    // If is_frame_top is true, both the rows are set to kCdefLargeValue.
    // Otherwise:
    //   If multi-threaded filtering is off, the rows are copied from
    //   |src_buffer|.
    //   Otherwise, the rows are copied from |cdef_border|.
    if (is_frame_top) {
      for (int y = 0; y < kCdefBorder; ++y) {
        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
               unit_width + 2 * kCdefBorder);
        cdef_src += cdef_stride;
      }
    } else {
      const Pixel* top_border =
          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
      const int top_border_stride =
          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
      for (int y = 0; y < kCdefBorder; ++y) {
        CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
                       is_frame_right, cdef_src);
        top_border += top_border_stride;
        cdef_src += cdef_stride;
        // We need to increment |src_buffer| and |cdef_border| in this loop to
        // set them up for the subsequent loops below.
        src_buffer += src_stride;
        cdef_border += cdef_border_stride;
      }
    }

    // Copy the body as follows;
    // If multi-threaded filtering is off or if is_frame_bottom is true, all the
    // rows are copied from |src_buffer|.
    // Otherwise, the first |block_height|-kCdefBorder rows are copied from
    // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
    int y = block_height;
    const int y_threshold =
        (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
    const Pixel* left_border =
        (thread_pool_ == nullptr || !use_border_columns)
            ? nullptr
            : reinterpret_cast<const Pixel*>(border_columns[plane]);
    do {
      CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
                     is_frame_right, cdef_src, left_border);
      cdef_src += cdef_stride;
      src_buffer += src_stride;
      // The saved left border holds kCdefBorder pixels per row.
      if (left_border != nullptr) left_border += kCdefBorder;
    } while (--y != y_threshold);

    if (y > 0) {
      assert(y == kCdefBorder);
      // |cdef_border| now points to the top 2 rows of the current block. For
      // the next loop, we need it to point to the bottom 2 rows of the
      // current block. So increment it by 2 rows.
      cdef_border += MultiplyBy2(cdef_border_stride);
      for (int i = 0; i < kCdefBorder; ++i) {
        CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
                       is_frame_right, cdef_src);
        cdef_src += cdef_stride;
        cdef_border += cdef_border_stride;
      }
    }

    // Copy the bottom 2 rows as follows;
    // If is_frame_bottom is true, both the rows are set to kCdefLargeValue.
    // Otherwise:
    //   If multi-threaded filtering is off, the rows are copied from
    //   |src_buffer|.
    //   Otherwise, the rows are copied from |cdef_border|.
    // In both cases the loop also covers the |unit_height| - |block_height|
    // padding rows, if any.
    y = 0;
    if (is_frame_bottom) {
      do {
        Memset(cdef_src - kCdefBorder, kCdefLargeValue,
               unit_width + 2 * kCdefBorder);
        cdef_src += cdef_stride;
      } while (++y < kCdefBorder + unit_height - block_height);
    } else {
      const Pixel* bottom_border =
          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
      const int bottom_border_stride =
          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
      do {
        CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
                       is_frame_right, cdef_src);
        bottom_border += bottom_border_stride;
        cdef_src += cdef_stride;
      } while (++y < kCdefBorder + unit_height - block_height);
    }
  }
}
255 
// Applies cdef to one unit of |block_width4x4| x |block_height4x4| 4x4 blocks
// whose top-left corner is at (|row4x4_start|, |column4x4_start|).
//
// |cdef_block| is the 16-bit working buffer that PrepareCdefBlock() fills; it
// is used first for the luma plane and then for both chroma planes.
// |index| selects the entry of the strength arrays in |frame_header_.cdef|; an
// |index| of -1 means the unit is not filtered.
// |border_columns| and |use_border_columns| carry the backed-up right-most
// kCdefBorder source columns from one call to the next (see the comment on
// |border_columns_src_index| below). |use_border_columns[i][0]| refers to the
// luma plane and |use_border_columns[i][1]| to the chroma planes.
//
// When |thread_pool_| is null, blocks that are not filtered are copied
// unchanged from the source buffer to the cdef buffer; when it is non-null
// those copies are not made.
template <typename Pixel>
void PostFilter::ApplyCdefForOneUnit(uint16_t* cdef_block, const int index,
                                     const int block_width4x4,
                                     const int block_height4x4,
                                     const int row4x4_start,
                                     const int column4x4_start,
                                     uint8_t border_columns[2][kMaxPlanes][256],
                                     bool use_border_columns[2][2]) {
  // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
  static constexpr int kStep = 8;
  static constexpr int kStep4x4 = 2;

  // Per-plane pointers to the start of the current row of 8x8 blocks (in the
  // cdef output buffer, the source buffer and |cdef_block|), together with the
  // amounts by which they advance to reach the next row / next column of
  // blocks.
  int cdef_buffer_row_base_stride[kMaxPlanes];
  uint8_t* cdef_buffer_row_base[kMaxPlanes];
  int src_buffer_row_base_stride[kMaxPlanes];
  const uint8_t* src_buffer_row_base[kMaxPlanes];
  const uint16_t* cdef_src_row_base[kMaxPlanes];
  int cdef_src_row_base_stride[kMaxPlanes];
  int column_step[kMaxPlanes];
  assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
  int plane = kPlaneY;
  do {
    cdef_buffer_row_base[plane] =
        GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
    cdef_buffer_row_base_stride[plane] =
        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
    src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
                                                 row4x4_start, column4x4_start);
    src_buffer_row_base_stride[plane] =
        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
    // Skip the top border rows and the left border columns of |cdef_block|.
    cdef_src_row_base[plane] =
        cdef_block +
        static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
            kCdefUnitSizeWithBorders +
        kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
    cdef_src_row_base_stride[plane] =
        kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
    // |column_step| is in bytes.
    column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
  } while (++plane < planes_);

  // |border_columns| contains two buffers. In each call to this function, we
  // will use one of them as the "destination" for the current call. And the
  // other one as the "source" for the current call (which would have been the
  // "destination" of the previous call). We will use the src_index to populate
  // the borders which were backed up in the previous call. We will use the
  // dst_index to populate the borders to be used in the next call.
  const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
  const int border_columns_dst_index = border_columns_src_index ^ 1;

  // An |index| of -1 means no filtering for this unit. If multi-threaded
  // filtering is off, the source pixels are copied to the cdef buffer as is.
  if (index == -1) {
    if (thread_pool_ == nullptr) {
      int plane = kPlaneY;
      do {
        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
                   sizeof(Pixel));
      } while (++plane < planes_);
    }
    use_border_columns[border_columns_dst_index][0] = false;
    use_border_columns[border_columns_dst_index][1] = false;
    return;
  }

  const bool is_frame_right =
      MultiplyBy4(column4x4_start + block_width4x4) >= frame_header_.width;
  if (!is_frame_right && thread_pool_ != nullptr) {
    // Backup the last 2 columns for use in the next iteration.
    use_border_columns[border_columns_dst_index][0] = true;
    const uint8_t* src_line =
        GetSourceBuffer(kPlaneY, row4x4_start,
                        column4x4_start + block_width4x4) -
        kCdefBorder * sizeof(Pixel);
    CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
               border_columns[border_columns_dst_index][kPlaneY],
               kCdefBorder * sizeof(Pixel), kCdefBorder,
               MultiplyBy4(block_height4x4), sizeof(Pixel));
  }

  PrepareCdefBlock<Pixel>(
      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
      cdef_block, kCdefUnitSizeWithBorders, true,
      (border_columns != nullptr) ? border_columns[border_columns_src_index]
                                  : nullptr,
      use_border_columns[border_columns_src_index][0]);

  // Stored direction used during the u/v pass.  If bit 3 is set, then block is
  // a skip.
  uint8_t direction_y[8 * 8];
  int y_index = 0;

  const uint8_t y_primary_strength =
      frame_header_.cdef.y_primary_strength[index];
  const uint8_t y_secondary_strength =
      frame_header_.cdef.y_secondary_strength[index];
  // y_strength_index is 0 for both primary and secondary strengths being
  // non-zero, 1 for primary only, 2 for secondary only. This will be updated
  // with y_primary_strength after variance is applied.
  int y_strength_index = static_cast<int>(y_secondary_strength == 0);

  const bool compute_direction_and_variance =
      (y_primary_strength | frame_header_.cdef.uv_primary_strength[index]) != 0;
  // Each entry of |cdef_skip_| is a bit mask with one bit per 8x8 luma block;
  // a cleared bit means that block is not filtered (see |skip_shift| below).
  const uint8_t* skip_row =
      &cdef_skip_[row4x4_start >> 1][column4x4_start >> 4];
  const int skip_stride = cdef_skip_.columns();
  int row4x4 = row4x4_start;
  do {
    uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
    const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
    const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
    int column4x4 = column4x4_start;

    if (*skip_row == 0) {
      // Every 8x8 block in this row of the unit is skipped.
      for (int i = 0; i < DivideBy2(block_width4x4); ++i, ++y_index) {
        direction_y[y_index] = kCdefSkip;
      }
      if (thread_pool_ == nullptr) {
        // NOTE(review): this copies 64 pixels per row regardless of
        // |block_width4x4|.
        CopyPixels(src_buffer_base, frame_buffer_.stride(kPlaneY),
                   cdef_buffer_base, frame_buffer_.stride(kPlaneY), 64, kStep,
                   sizeof(Pixel));
      }
    } else {
      do {
        const int block_width = kStep;
        const int block_height = kStep;
        const int cdef_stride = frame_buffer_.stride(kPlaneY);
        uint8_t* const cdef_buffer = cdef_buffer_base;
        const uint16_t* const cdef_src = cdef_src_base;
        const int src_stride = frame_buffer_.stride(kPlaneY);
        const uint8_t* const src_buffer = src_buffer_base;

        const uint8_t skip_shift = (column4x4 >> 1) & 0x7;
        const bool skip = ((*skip_row >> skip_shift) & 1) == 0;
        if (skip) {  // No cdef filtering.
          direction_y[y_index] = kCdefSkip;
          if (thread_pool_ == nullptr) {
            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
                       block_width, block_height, sizeof(Pixel));
          }
        } else {
          // Zero out residual skip flag.
          direction_y[y_index] = 0;

          int variance = 0;
          if (compute_direction_and_variance) {
            if (thread_pool_ == nullptr ||
                row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
              dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
                                  &variance);
            } else if (sizeof(Pixel) == 2) {
              // |cdef_src| already holds 16-bit pixels, so it can be used as
              // the input directly; the stride passed here is in bytes.
              dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
                                  &direction_y[y_index], &variance);
            } else {
              // If we are in the last row4x4 for this unit, then the last two
              // input rows have to come from |cdef_border_|. Since we already
              // have |cdef_src| populated correctly, use that as the input
              // for the direction process.
              uint8_t direction_src[8][8];
              const uint16_t* cdef_src_line = cdef_src;
              for (auto& direction_src_line : direction_src) {
                for (int i = 0; i < 8; ++i) {
                  direction_src_line[i] = cdef_src_line[i];
                }
                cdef_src_line += kCdefUnitSizeWithBorders;
              }
              dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
                                  &variance);
            }
          }
          const int direction =
              (y_primary_strength == 0) ? 0 : direction_y[y_index];
          const int variance_strength =
              ((variance >> 6) != 0) ? std::min(FloorLog2(variance >> 6), 12)
                                     : 0;
          // Scale the luma primary strength by the block variance.
          const uint8_t primary_strength =
              (variance != 0)
                  ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
                  : 0;
          if ((primary_strength | y_secondary_strength) == 0) {
            // Both strengths are zero: the block is left unfiltered.
            if (thread_pool_ == nullptr) {
              CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
                         block_width, block_height, sizeof(Pixel));
            }
          } else {
            const int strength_index =
                y_strength_index |
                (static_cast<int>(primary_strength == 0) << 1);
            dsp_.cdef_filters[1][strength_index](
                cdef_src, kCdefUnitSizeWithBorders, block_height,
                primary_strength, y_secondary_strength,
                frame_header_.cdef.damping, direction, cdef_buffer,
                cdef_stride);
          }
        }
        // |column_step| is in bytes; |cdef_src_base| is a uint16_t pointer and
        // advances by the same number of pixels.
        cdef_buffer_base += column_step[kPlaneY];
        src_buffer_base += column_step[kPlaneY];
        cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);

        column4x4 += kStep4x4;
        y_index++;
      } while (column4x4 < column4x4_start + block_width4x4);
    }

    cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
    src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
    cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
    skip_row += skip_stride;
    row4x4 += kStep4x4;
  } while (row4x4 < row4x4_start + block_height4x4);

  if (planes_ == kMaxPlanesMonochrome) {
    return;
  }

  const uint8_t uv_primary_strength =
      frame_header_.cdef.uv_primary_strength[index];
  const uint8_t uv_secondary_strength =
      frame_header_.cdef.uv_secondary_strength[index];

  // Both chroma strengths are zero: no chroma filtering for this unit.
  if ((uv_primary_strength | uv_secondary_strength) == 0) {
    if (thread_pool_ == nullptr) {
      for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
                   sizeof(Pixel));
      }
    }
    use_border_columns[border_columns_dst_index][1] = false;
    return;
  }

  if (!is_frame_right && thread_pool_ != nullptr) {
    use_border_columns[border_columns_dst_index][1] = true;
    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
      // Backup the last 2 columns for use in the next iteration.
      const uint8_t* src_line =
          GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
                          column4x4_start + block_width4x4) -
          kCdefBorder * sizeof(Pixel);
      CopyPixels(src_line, frame_buffer_.stride(plane),
                 border_columns[border_columns_dst_index][plane],
                 kCdefBorder * sizeof(Pixel), kCdefBorder,
                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
                 sizeof(Pixel));
    }
  }

  PrepareCdefBlock<Pixel>(
      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
      cdef_block, kCdefUnitSizeWithBorders, false,
      (border_columns != nullptr) ? border_columns[border_columns_src_index]
                                  : nullptr,
      use_border_columns[border_columns_src_index][1]);

  // uv_strength_index is 0 for both primary and secondary strengths being
  // non-zero, 1 for primary only, 2 for secondary only.
  const int uv_strength_index =
      (static_cast<int>(uv_primary_strength == 0) << 1) |
      static_cast<int>(uv_secondary_strength == 0);
  for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
    const int8_t subsampling_x = subsampling_x_[plane];
    const int8_t subsampling_y = subsampling_y_[plane];
    const int block_width = kStep >> subsampling_x;
    const int block_height = kStep >> subsampling_y;
    int row4x4 = row4x4_start;

    // Walk |direction_y| in the same order in which the luma pass filled it.
    y_index = 0;
    do {
      uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
      const uint8_t* src_buffer_base = src_buffer_row_base[plane];
      const uint16_t* cdef_src_base = cdef_src_row_base[plane];
      int column4x4 = column4x4_start;
      do {
        const int cdef_stride = frame_buffer_.stride(plane);
        uint8_t* const cdef_buffer = cdef_buffer_base;
        const int src_stride = frame_buffer_.stride(plane);
        const uint8_t* const src_buffer = src_buffer_base;
        const uint16_t* const cdef_src = cdef_src_base;
        const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
        int dual_cdef = 0;

        if (skip) {  // No cdef filtering.
          if (thread_pool_ == nullptr) {
            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
                       block_width, block_height, sizeof(Pixel));
          }
        } else {
          // Make sure block pair is not out of bounds.
          if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
            // Enable dual processing if subsampling_x is 1.
            dual_cdef = subsampling_x;
          }

          int direction = (uv_primary_strength == 0)
                              ? 0
                              : kCdefUvDirection[subsampling_x][subsampling_y]
                                                [direction_y[y_index]];

          if (dual_cdef != 0) {
            if (uv_primary_strength &&
                direction_y[y_index] != direction_y[y_index + 1]) {
              // Disable dual processing if the second block of the pair does
              // not have the same direction.
              dual_cdef = 0;
            }

            // Disable dual processing if the second block of the pair is a
            // skip.
            if (direction_y[y_index + 1] == kCdefSkip) {
              dual_cdef = 0;
            }
          }

          // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
          const int width_index = dual_cdef | (subsampling_x ^ 1);
          dsp_.cdef_filters[width_index][uv_strength_index](
              cdef_src, kCdefUnitSizeWithBorders, block_height,
              uv_primary_strength, uv_secondary_strength,
              frame_header_.cdef.damping - 1, direction, cdef_buffer,
              cdef_stride);
        }
        // When dual_cdef is set, the above cdef_filter() will process 2 blocks,
        // so adjust the pointers and indexes for 2 blocks.
        cdef_buffer_base += column_step[plane] << dual_cdef;
        src_buffer_base += column_step[plane] << dual_cdef;
        cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
        column4x4 += kStep4x4 << dual_cdef;
        y_index += 1 << dual_cdef;
      } while (column4x4 < column4x4_start + block_width4x4);

      cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
      src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
      cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
      row4x4 += kStep4x4;
    } while (row4x4 < row4x4_start + block_height4x4);
  }
}
596 
ApplyCdefForOneSuperBlockRowHelper(uint16_t * cdef_block,uint8_t border_columns[2][kMaxPlanes][256],int row4x4,int block_height4x4)597 void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
598     uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
599     int row4x4, int block_height4x4) {
600   bool use_border_columns[2][2] = {};
601   const bool non_zero_index = frame_header_.cdef.bits > 0;
602   const int8_t* cdef_index =
603       non_zero_index ? cdef_index_[DivideBy16(row4x4)] : nullptr;
604   int column4x4 = 0;
605   do {
606     const int index = non_zero_index ? *cdef_index++ : 0;
607     const int block_width4x4 =
608         std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
609 
610 #if LIBGAV1_MAX_BITDEPTH >= 10
611     if (bitdepth_ >= 10) {
612       ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
613                                     block_height4x4, row4x4, column4x4,
614                                     border_columns, use_border_columns);
615     } else  // NOLINT
616 #endif      // LIBGAV1_MAX_BITDEPTH >= 10
617     {
618       ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
619                                    block_height4x4, row4x4, column4x4,
620                                    border_columns, use_border_columns);
621     }
622     column4x4 += kStep64x64;
623   } while (column4x4 < frame_header_.columns4x4);
624 }
625 
ApplyCdefForOneSuperBlockRow(int row4x4_start,int sb4x4,bool is_last_row)626 void PostFilter::ApplyCdefForOneSuperBlockRow(int row4x4_start, int sb4x4,
627                                               bool is_last_row) {
628   assert(row4x4_start >= 0);
629   assert(DoCdef());
630   int row4x4 = row4x4_start;
631   const int row4x4_limit = row4x4_start + sb4x4;
632   do {
633     if (row4x4 >= frame_header_.rows4x4) return;
634 
635     // Apply cdef for the last 8 rows of the previous superblock row.
636     // One exception: If the superblock size is 128x128 and is_last_row is true,
637     // then we simply apply cdef for the entire superblock row without any lag.
638     // In that case, apply cdef for the previous superblock row only during the
639     // first iteration (row4x4 == row4x4_start).
640     if (row4x4 > 0 && (!is_last_row || row4x4 == row4x4_start)) {
641       assert(row4x4 >= 16);
642       ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
643     }
644 
645     // Apply cdef for the current superblock row. If this is the last superblock
646     // row we apply cdef for all the rows, otherwise we leave out the last 8
647     // rows.
648     const int block_height4x4 =
649         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
650     const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
651     if (height4x4 > 0) {
652       ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
653                                          height4x4);
654     }
655     row4x4 += kStep64x64;
656   } while (row4x4 < row4x4_limit);
657 }
658 
ApplyCdefWorker(std::atomic<int> * row4x4_atomic)659 void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
660   int row4x4;
661   uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
662   // Each border_column buffer has to store 64 rows and 2 columns for each
663   // plane. For 10bit, that is 64*2*2 = 256 bytes.
664   alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
665   while ((row4x4 = row4x4_atomic->fetch_add(
666               kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
667     const int block_height4x4 =
668         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
669     ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
670                                        block_height4x4);
671   }
672 }
673 
674 }  // namespace libgav1
675