1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  *
11  */
12 
13 #include <math.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 #include "config/aom_scale_rtcd.h"
18 
19 #include "aom_mem/aom_mem.h"
20 #include "av1/common/onyxc_int.h"
21 #include "av1/common/resize.h"
22 #include "av1/common/restoration.h"
23 #include "aom_dsp/aom_dsp_common.h"
24 #include "aom_mem/aom_mem.h"
25 
26 #include "aom_ports/mem.h"
27 
28 // The 's' values are calculated based on original 'r' and 'e' values in the
29 // spec using GenSgrprojVtable().
30 // Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
// Entry layout: { { r[0], r[1] }, { s[0], s[1] } }. The first pair holds the
// two radii and the second pair the matching 's' values (presumed from the
// r = 0 / s = -1 pairing visible in the entries below -- confirm against the
// sgr_params_type declaration).
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
  // Entries whose first radius is 0 (first value of the second pair is -1).
  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
  // Entries whose second radius is 0 (second value of the second pair is -1).
  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
};
41 
av1_whole_frame_rect(const AV1_COMMON * cm,int is_uv)42 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
43   AV1PixelRect rect;
44 
45   int ss_x = is_uv && cm->seq_params.subsampling_x;
46   int ss_y = is_uv && cm->seq_params.subsampling_y;
47 
48   rect.top = 0;
49   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
50   rect.left = 0;
51   rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
52   return rect;
53 }
54 
// Returns how many restoration units fit along one dimension of a tile. Pass
// the tile width to count horizontal units, or the tile height to count
// vertical units.
//
// The quotient tile_size / unit_size is rounded to nearest rather than up:
// the last (right-most or bottom-most) unit is allowed to stretch to as much
// as 150% of the nominal size, so a remainder of less than half a unit is
// absorbed by it instead of creating a new unit. The result is clamped to a
// minimum of 1 so that a tile smaller than half a unit still gets one unit.
//
// unit_size is used as a divisor and therefore must be non-zero.
int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
  const int half_unit = unit_size >> 1;
  const int rounded_units = (tile_size + half_unit) / unit_size;
  return AOMMAX(rounded_units, 1);
}
65 
// (Re)allocates rsi->unit_info so that it can hold one RestorationUnitInfo per
// restoration unit of a plane, and records the unit counts in rsi.
//
// cm:    common state; used for the frame dimensions and for error reporting
//        through CHECK_MEM_ERROR if the allocation fails.
// rsi:   restoration info for the plane. restoration_unit_size is read;
//        horz_units_per_tile, vert_units_per_tile, units_per_tile and
//        unit_info are written. Any previous unit_info buffer is freed.
// is_uv: non-zero for a chroma plane.
void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
                                  int is_uv) {
  // The whole frame is treated as a single tile here, so the "largest tile"
  // is simply the full-plane rectangle.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_uv);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  // Units per dimension: round-to-nearest division with a minimum of one (see
  // av1_lr_count_units_in_tile()).
  const int unit_size = rsi->restoration_unit_size;
  const int units_across = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int units_down = av1_lr_count_units_in_tile(unit_size, frame_h);

  rsi->units_per_tile = units_across * units_down;
  rsi->horz_units_per_tile = units_across;
  rsi->vert_units_per_tile = units_down;

  // A single tile covers the frame, so the total is one tile's worth of units.
  const int tile_count = 1;
  const int total_units = tile_count * rsi->units_per_tile;

  // Drop any earlier allocation before requesting a 16-byte aligned buffer.
  aom_free(rsi->unit_info);
  CHECK_MEM_ERROR(cm, rsi->unit_info,
                  (RestorationUnitInfo *)aom_memalign(
                      16, sizeof(*rsi->unit_info) * total_units));
}
100 
// Releases the per-unit buffer owned by rst_info and leaves the pointer NULL
// so that a later free or re-allocation is safe.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  void *const old_units = rst_info->unit_info;
  rst_info->unit_info = NULL;
  aom_free(old_units);
}
105 
// NOTE: everything between this #if 0 and the matching #endif is compiled
// out. It is kept as a record of a table generator for the self-guided
// filter parameters.
#if 0
// Pair of values for each sgrproj parameter:
// Index 0 corresponds to r[0], e[0]
// Index 1 corresponds to r[1], e[1]
int sgrproj_mtable[SGRPROJ_PARAMS][2];

// Fills sgrproj_mtable[i][j] for every parameter set i and radius index j.
// A radius of 0 yields -1; otherwise, with n = (2r + 1)^2, the entry is
// (1 << SGRPROJ_MTABLE_BITS) divided by n * n * e, rounded to nearest.
static void GenSgrprojVtable() {
  for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
    const sgr_params_type *const params = &sgr_params[i];
    for (int j = 0; j < 2; ++j) {
      const int e = params->e[j];
      const int r = params->r[j];
      if (r == 0) {                 // filter is disabled
        sgrproj_mtable[i][j] = -1;  // mark invalid
      } else {                      // filter is enabled
        // n is the number of pixels in the (2r + 1) x (2r + 1) window.
        const int n = (2 * r + 1) * (2 * r + 1);
        const int n2e = n * n * e;
        assert(n2e != 0);
        // Adding n2e / 2 before dividing rounds the quotient to nearest.
        sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
      }
    }
  }
}
#endif
130 
// Precalculation hook for loop restoration. Its only statement, the call to
// GenSgrprojVtable(), is compiled out with #if 0, so the function currently
// does nothing.
//
// The parameter list is spelled (void): in C an empty list "()" leaves the
// parameters unspecified instead of declaring a prototype, which disables
// argument checking and triggers -Wstrict-prototypes.
void av1_loop_restoration_precal(void) {
#if 0
  GenSgrprojVtable();
#endif
}
136 
// Pads an 8-bit image in place by replicating its edge pixels.
//
// data:        pointer to the top-left pixel of the image proper; the caller
//              must own border_horz columns / border_vert rows around it.
// width/height: image dimensions in pixels.
// stride:      distance between the starts of consecutive rows, in pixels.
// border_horz: number of columns to fill on the left and on the right.
// border_vert: number of rows to fill above and below.
static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
                               int border_horz, int border_vert) {
  // Step 1: extend every image row sideways with its first / last pixel.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Step 2: replicate the already-padded first and last rows outwards, so
  // the corners are filled as well.
  uint8_t *const padded_origin = data - border_horz;
  const int padded_width = width + 2 * border_horz;

  for (int above = 1; above <= border_vert; ++above) {
    memcpy(padded_origin - above * stride, padded_origin, padded_width);
  }
  for (int below = 0; below < border_vert; ++below) {
    memcpy(padded_origin + (height + below) * stride,
           padded_origin + (height - 1) * stride, padded_width);
  }
}
155 
// Pads a 16-bit image in place by replicating its edge pixels. Identical in
// effect to extend_frame_lowbd(), but operating on uint16_t samples.
//
// data:        pointer to the top-left pixel of the image proper; the caller
//              must own border_horz columns / border_vert rows around it.
// width/height: image dimensions in pixels.
// stride:      distance between the starts of consecutive rows, in pixels.
// border_horz: number of columns to fill on the left and on the right.
// border_vert: number of rows to fill above and below.
static void extend_frame_highbd(uint16_t *data, int width, int height,
                                int stride, int border_horz, int border_vert) {
  // Step 1: extend every image row sideways with its first / last pixel.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int k = 1; k <= border_horz; ++k) line[-k] = line[0];
    for (int k = 0; k < border_horz; ++k) line[width + k] = line[width - 1];
  }

  // Step 2: replicate the already-padded first and last rows outwards, so
  // the corners are filled as well.
  uint16_t *const padded_origin = data - border_horz;
  const size_t padded_bytes = (width + 2 * border_horz) * sizeof(uint16_t);

  for (int above = 1; above <= border_vert; ++above) {
    memcpy(padded_origin - above * stride, padded_origin, padded_bytes);
  }
  for (int below = 0; below < border_vert; ++below) {
    memcpy(padded_origin + (height + below) * stride,
           padded_origin + (height - 1) * stride, padded_bytes);
  }
}
175 
// Pads an image in place by edge replication, dispatching on bit depth.
// When highbd is non-zero, data is first converted with CONVERT_TO_SHORTPTR()
// and handled as 16-bit samples; otherwise it is handled as 8-bit samples.
// The remaining arguments are forwarded unchanged to the per-depth helper.
void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert, int highbd) {
  if (!highbd) {
    extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
    return;
  }
  extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
                      border_horz, border_vert);
}
184 
// Copies a width x height rectangle of 8-bit pixels from src to dst, one row
// at a time. Strides are in pixels; the rows must not overlap (memcpy).
static void copy_tile_lowbd(int width, int height, const uint8_t *src,
                            int src_stride, uint8_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint8_t *const src_row = src + row * src_stride;
    uint8_t *const dst_row = dst + row * dst_stride;
    memcpy(dst_row, src_row, width);
  }
}
190 
// Copies a width x height rectangle of 16-bit pixels from src to dst, one row
// at a time. Strides are in pixels; the rows must not overlap (memcpy).
static void copy_tile_highbd(int width, int height, const uint16_t *src,
                             int src_stride, uint16_t *dst, int dst_stride) {
  for (int row = 0; row < height; ++row) {
    const uint16_t *const src_row = src + row * src_stride;
    uint16_t *const dst_row = dst + row * dst_stride;
    memcpy(dst_row, src_row, width * sizeof(*dst));
  }
}
196 
// Copies a width x height pixel rectangle from src to dst, dispatching on bit
// depth. When highbd is non-zero both pointers are converted with
// CONVERT_TO_SHORTPTR() and the data is copied as 16-bit samples; otherwise it
// is copied as 8-bit samples. Strides are in pixels.
static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride, int highbd) {
  if (!highbd) {
    copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
    return;
  }
  copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
                   CONVERT_TO_SHORTPTR(dst), dst_stride);
}
205 
// Yields a uint8_t * for frame pointer d: when hbd is non-zero, d is passed
// through CONVERT_TO_SHORTPTR() and the result cast back to uint8_t *;
// otherwise d is used unchanged. In this file the result is what gets handed
// to memcpy() (presumably the directly addressable sample storage -- confirm
// against the CONVERT_TO_SHORTPTR definition).
#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
207 
// With striped loop restoration, the filtering for each 64-pixel stripe gets
// most of its input from the output of CDEF (stored in data8), but we need to
// fill out a border of 3 pixels above/below the stripe according to the
// following rules:
213 //
214 // * At a frame boundary, we copy the outermost row of CDEF pixels three times.
215 //   This extension is done by a call to extend_frame() at the start of the loop
216 //   restoration process, so the value of copy_above/copy_below doesn't strictly
217 //   matter.
218 //   However, by setting *copy_above = *copy_below = 1 whenever loop filtering
219 //   across tiles is disabled, we can allow
220 //   {setup,restore}_processing_stripe_boundary to assume that the top/bottom
221 //   data has always been copied, simplifying the behaviour at the left and
222 //   right edges of tiles.
223 //
224 // * If we're at a tile boundary and loop filtering across tiles is enabled,
225 //   then there is a logical stripe which is 64 pixels high, but which is split
226 //   into an 8px high and a 56px high stripe so that the processing (and
227 //   coefficient set usage) can be aligned to tiles.
228 //   In this case, we use the 3 rows of CDEF output across the boundary for
229 //   context; this corresponds to leaving the frame buffer as-is.
230 //
231 // * If we're at a tile boundary and loop filtering across tiles is disabled,
232 //   then we take the outermost row of CDEF pixels *within the current tile*
233 //   and copy it three times. Thus we behave exactly as if the tile were a full
234 //   frame.
235 //
236 // * Otherwise, we're at a stripe boundary within a tile. In that case, we
237 //   take 2 rows of deblocked pixels and extend them to 3 rows of context.
238 //
239 // The distinction between the latter two cases is handled by the
240 // av1_loop_restoration_save_boundary_lines() function, so here we just need
241 // to decide if we're overwriting the above/below boundary pixels or not.
get_stripe_boundary_info(const RestorationTileLimits * limits,const AV1PixelRect * tile_rect,int ss_y,int * copy_above,int * copy_below)242 static void get_stripe_boundary_info(const RestorationTileLimits *limits,
243                                      const AV1PixelRect *tile_rect, int ss_y,
244                                      int *copy_above, int *copy_below) {
245   *copy_above = 1;
246   *copy_below = 1;
247 
248   const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
249   const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
250 
251   const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
252   const int this_stripe_height =
253       full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
254   const int last_stripe_in_tile =
255       (limits->v_start + this_stripe_height >= tile_rect->bottom);
256 
257   if (first_stripe_in_tile) *copy_above = 0;
258   if (last_stripe_in_tile) *copy_below = 0;
259 }
260 
261 // Overwrite the border pixels around a processing stripe so that the conditions
262 // listed above get_stripe_boundary_info() are preserved.
263 // We save the pixels which get overwritten into a temporary buffer, so that
264 // they can be restored by restore_processing_stripe_boundary() after we've
265 // processed the stripe.
266 //
267 // limits gives the rectangular limits of the remaining stripes for the current
268 // restoration unit. rsb is the stored stripe boundaries (taken from either
269 // deblock or CDEF output as necessary).
270 //
271 // tile_rect is the limits of the current tile and tile_stripe0 is the index of
272 // the first stripe in this tile (needed to convert the tile-relative stripe
273 // index we get from limits into something we can look up in rsb).
// Saves the line_size bytes of the frame row addressed by dst8 into save and
// then overwrites that row with line_size bytes read from src. dst8 is mapped
// through REAL_PTR() first; src must already be directly addressable.
static void save_and_replace_line(void *save, uint8_t *dst8, const void *src,
                                  int line_size, int use_highbd) {
  uint8_t *const dst = REAL_PTR(use_highbd, dst8);
  memcpy(save, dst, line_size);
  memcpy(dst, src, line_size);
}

static void setup_processing_stripe_boundary(
    const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
    int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
  // Geometry of one boundary line: the unit's columns plus
  // RESTORATION_EXTRA_HORZ extra pixels on each side. line_size is the same
  // length in bytes (doubled for high bit depth).
  const int buf_stride = rsb->stripe_boundary_stride;
  const int buf_x0_off = limits->h_start;
  const int line_width =
      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
  const int line_size = line_width << use_highbd;

  // Leftmost frame column touched by a boundary line.
  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;

  // ---- Rows above the stripe ----
  if (copy_above) {
    // Frame position of the stripe's first row, at column data_x0.
    uint8_t *const top8 = data8 + data_x0 + limits->v_start * data_stride;

    if (opt) {
      // Optimised variant: only the outermost row (i = -RESTORATION_BORDER)
      // is saved and overwritten, and its new contents are taken from the
      // frame row directly below it, not from the stripe boundary buffer.
      uint8_t *const outer8 = top8 + (-RESTORATION_BORDER) * data_stride;
      uint8_t *const inner8 = top8 + (-RESTORATION_BORDER + 1) * data_stride;
      save_and_replace_line(rlbs->tmp_save_above[0], outer8,
                            REAL_PTR(use_highbd, inner8), line_size,
                            use_highbd);
    } else {
      // Expand the stored context lines of rsb->stripe_boundary_above into
      // RESTORATION_BORDER rows: the AOMMAX clamp repeats the topmost stored
      // line for the outermost rows.
      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
        const int ctx_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
        const int ctx_off = buf_x0_off + ctx_row * buf_stride;
        save_and_replace_line(
            rlbs->tmp_save_above[i + RESTORATION_BORDER],
            top8 + i * data_stride,
            rsb->stripe_boundary_above + (ctx_off << use_highbd), line_size,
            use_highbd);
      }
    }
  }

  // ---- Rows below the stripe ----
  if (copy_below) {
    // Frame position of the first row past the stripe, at column data_x0.
    const int stripe_end = limits->v_start + h;
    uint8_t *const bottom8 = data8 + data_x0 + stripe_end * data_stride;

    if (opt) {
      // Optimised variant: only row i = 2 is saved and overwritten, and its
      // new contents are taken from the frame row directly above it.
      uint8_t *const outer8 = bottom8 + 2 * data_stride;
      uint8_t *const inner8 = bottom8 + (2 - 1) * data_stride;
      save_and_replace_line(rlbs->tmp_save_below[2], outer8,
                            REAL_PTR(use_highbd, inner8), line_size,
                            use_highbd);
    } else {
      // Expand the stored context lines of rsb->stripe_boundary_below into
      // RESTORATION_BORDER rows: the AOMMIN clamp repeats the last stored
      // line for the outermost rows.
      for (int i = 0; i < RESTORATION_BORDER; ++i) {
        const int ctx_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
        const int ctx_off = buf_x0_off + ctx_row * buf_stride;
        save_and_replace_line(
            rlbs->tmp_save_below[i], bottom8 + i * data_stride,
            rsb->stripe_boundary_below + (ctx_off << use_highbd), line_size,
            use_highbd);
      }
    }
  }
}
363 
364 // This function restores the boundary lines modified by
365 // setup_processing_stripe_boundary.
366 //
367 // Note: We need to be careful when handling the corners of the processing
368 // unit, because (eg.) the top-left corner is considered to be part of
369 // both the left and top borders. This means that, depending on the
370 // loop_filter_across_tiles_enabled flag, the corner pixels might get
371 // overwritten twice, once as part of the "top" border and once as part
372 // of the "left" border (or similar for other corners).
373 //
374 // Everything works out fine as long as we make sure to reverse the order
375 // when restoring, ie. we need to restore the left/right borders followed
376 // by the top/bottom borders.
restore_processing_stripe_boundary(const RestorationTileLimits * limits,const RestorationLineBuffers * rlbs,int use_highbd,int h,uint8_t * data8,int data_stride,int copy_above,int copy_below,int opt)377 static void restore_processing_stripe_boundary(
378     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
379     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
380     int copy_below, int opt) {
381   const int line_width =
382       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
383   const int line_size = line_width << use_highbd;
384 
385   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
386 
387   if (!opt) {
388     if (copy_above) {
389       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
390       for (int i = -RESTORATION_BORDER; i < 0; ++i) {
391         uint8_t *dst8 = data8_tl + i * data_stride;
392         memcpy(REAL_PTR(use_highbd, dst8),
393                rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
394       }
395     }
396 
397     if (copy_below) {
398       const int stripe_bottom = limits->v_start + h;
399       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
400 
401       for (int i = 0; i < RESTORATION_BORDER; ++i) {
402         if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
403 
404         uint8_t *dst8 = data8_bl + i * data_stride;
405         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
406       }
407     }
408   } else {
409     if (copy_above) {
410       uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
411 
412       // Only restore i=-RESTORATION_BORDER line.
413       uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
414       memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
415     }
416 
417     if (copy_below) {
418       const int stripe_bottom = limits->v_start + h;
419       uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
420 
421       // Only restore i=2 line.
422       if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
423         uint8_t *dst8 = data8_bl + 2 * data_stride;
424         memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
425       }
426     }
427   }
428 }
429 
wiener_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)430 static void wiener_filter_stripe(const RestorationUnitInfo *rui,
431                                  int stripe_width, int stripe_height,
432                                  int procunit_width, const uint8_t *src,
433                                  int src_stride, uint8_t *dst, int dst_stride,
434                                  int32_t *tmpbuf, int bit_depth) {
435   (void)tmpbuf;
436   (void)bit_depth;
437   assert(bit_depth == 8);
438   const ConvolveParams conv_params = get_conv_params_wiener(8);
439 
440   for (int j = 0; j < stripe_width; j += procunit_width) {
441     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
442     const uint8_t *src_p = src + j;
443     uint8_t *dst_p = dst + j;
444     av1_wiener_convolve_add_src(
445         src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
446         rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
447   }
448 }
449 
/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
   over the input. The window is of size (2r + 1)x(2r + 1), and we
   specialize to r = 1 (boxsum1) and r = 2 (boxsum2). There is no generic
   fallback: boxsum() asserts on any other value of r.

   Each loop follows the same format: We keep a window's worth of input
   in individual variables and select data out of that as appropriate.
*/
boxsum1(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)457 static void boxsum1(int32_t *src, int width, int height, int src_stride,
458                     int sqr, int32_t *dst, int dst_stride) {
459   int i, j, a, b, c;
460   assert(width > 2 * SGRPROJ_BORDER_HORZ);
461   assert(height > 2 * SGRPROJ_BORDER_VERT);
462 
463   // Vertical sum over 3-pixel regions, from src into dst.
464   if (!sqr) {
465     for (j = 0; j < width; ++j) {
466       a = src[j];
467       b = src[src_stride + j];
468       c = src[2 * src_stride + j];
469 
470       dst[j] = a + b;
471       for (i = 1; i < height - 2; ++i) {
472         // Loop invariant: At the start of each iteration,
473         // a = src[(i - 1) * src_stride + j]
474         // b = src[(i    ) * src_stride + j]
475         // c = src[(i + 1) * src_stride + j]
476         dst[i * dst_stride + j] = a + b + c;
477         a = b;
478         b = c;
479         c = src[(i + 2) * src_stride + j];
480       }
481       dst[i * dst_stride + j] = a + b + c;
482       dst[(i + 1) * dst_stride + j] = b + c;
483     }
484   } else {
485     for (j = 0; j < width; ++j) {
486       a = src[j] * src[j];
487       b = src[src_stride + j] * src[src_stride + j];
488       c = src[2 * src_stride + j] * src[2 * src_stride + j];
489 
490       dst[j] = a + b;
491       for (i = 1; i < height - 2; ++i) {
492         dst[i * dst_stride + j] = a + b + c;
493         a = b;
494         b = c;
495         c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
496       }
497       dst[i * dst_stride + j] = a + b + c;
498       dst[(i + 1) * dst_stride + j] = b + c;
499     }
500   }
501 
502   // Horizontal sum over 3-pixel regions of dst
503   for (i = 0; i < height; ++i) {
504     a = dst[i * dst_stride];
505     b = dst[i * dst_stride + 1];
506     c = dst[i * dst_stride + 2];
507 
508     dst[i * dst_stride] = a + b;
509     for (j = 1; j < width - 2; ++j) {
510       // Loop invariant: At the start of each iteration,
511       // a = src[i * src_stride + (j - 1)]
512       // b = src[i * src_stride + (j    )]
513       // c = src[i * src_stride + (j + 1)]
514       dst[i * dst_stride + j] = a + b + c;
515       a = b;
516       b = c;
517       c = dst[i * dst_stride + (j + 2)];
518     }
519     dst[i * dst_stride + j] = a + b + c;
520     dst[i * dst_stride + (j + 1)] = b + c;
521   }
522 }
523 
boxsum2(int32_t * src,int width,int height,int src_stride,int sqr,int32_t * dst,int dst_stride)524 static void boxsum2(int32_t *src, int width, int height, int src_stride,
525                     int sqr, int32_t *dst, int dst_stride) {
526   int i, j, a, b, c, d, e;
527   assert(width > 2 * SGRPROJ_BORDER_HORZ);
528   assert(height > 2 * SGRPROJ_BORDER_VERT);
529 
530   // Vertical sum over 5-pixel regions, from src into dst.
531   if (!sqr) {
532     for (j = 0; j < width; ++j) {
533       a = src[j];
534       b = src[src_stride + j];
535       c = src[2 * src_stride + j];
536       d = src[3 * src_stride + j];
537       e = src[4 * src_stride + j];
538 
539       dst[j] = a + b + c;
540       dst[dst_stride + j] = a + b + c + d;
541       for (i = 2; i < height - 3; ++i) {
542         // Loop invariant: At the start of each iteration,
543         // a = src[(i - 2) * src_stride + j]
544         // b = src[(i - 1) * src_stride + j]
545         // c = src[(i    ) * src_stride + j]
546         // d = src[(i + 1) * src_stride + j]
547         // e = src[(i + 2) * src_stride + j]
548         dst[i * dst_stride + j] = a + b + c + d + e;
549         a = b;
550         b = c;
551         c = d;
552         d = e;
553         e = src[(i + 3) * src_stride + j];
554       }
555       dst[i * dst_stride + j] = a + b + c + d + e;
556       dst[(i + 1) * dst_stride + j] = b + c + d + e;
557       dst[(i + 2) * dst_stride + j] = c + d + e;
558     }
559   } else {
560     for (j = 0; j < width; ++j) {
561       a = src[j] * src[j];
562       b = src[src_stride + j] * src[src_stride + j];
563       c = src[2 * src_stride + j] * src[2 * src_stride + j];
564       d = src[3 * src_stride + j] * src[3 * src_stride + j];
565       e = src[4 * src_stride + j] * src[4 * src_stride + j];
566 
567       dst[j] = a + b + c;
568       dst[dst_stride + j] = a + b + c + d;
569       for (i = 2; i < height - 3; ++i) {
570         dst[i * dst_stride + j] = a + b + c + d + e;
571         a = b;
572         b = c;
573         c = d;
574         d = e;
575         e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
576       }
577       dst[i * dst_stride + j] = a + b + c + d + e;
578       dst[(i + 1) * dst_stride + j] = b + c + d + e;
579       dst[(i + 2) * dst_stride + j] = c + d + e;
580     }
581   }
582 
583   // Horizontal sum over 5-pixel regions of dst
584   for (i = 0; i < height; ++i) {
585     a = dst[i * dst_stride];
586     b = dst[i * dst_stride + 1];
587     c = dst[i * dst_stride + 2];
588     d = dst[i * dst_stride + 3];
589     e = dst[i * dst_stride + 4];
590 
591     dst[i * dst_stride] = a + b + c;
592     dst[i * dst_stride + 1] = a + b + c + d;
593     for (j = 2; j < width - 3; ++j) {
594       // Loop invariant: At the start of each iteration,
595       // a = src[i * src_stride + (j - 2)]
596       // b = src[i * src_stride + (j - 1)]
597       // c = src[i * src_stride + (j    )]
598       // d = src[i * src_stride + (j + 1)]
599       // e = src[i * src_stride + (j + 2)]
600       dst[i * dst_stride + j] = a + b + c + d + e;
601       a = b;
602       b = c;
603       c = d;
604       d = e;
605       e = dst[i * dst_stride + (j + 3)];
606     }
607     dst[i * dst_stride + j] = a + b + c + d + e;
608     dst[i * dst_stride + (j + 1)] = b + c + d + e;
609     dst[i * dst_stride + (j + 2)] = c + d + e;
610   }
611 }
612 
// Dispatches to the box-sum implementation for radius r. Only r == 1 (3x3)
// and r == 2 (5x5) are supported; any other radius trips an assertion and,
// with assertions disabled, leaves dst untouched.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      assert(0 && "Invalid value of r in self-guided filter");
      break;
  }
}
622 
decode_xq(const int * xqd,int * xq,const sgr_params_type * params)623 void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
624   if (params->r[0] == 0) {
625     xq[0] = 0;
626     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
627   } else if (params->r[1] == 0) {
628     xq[0] = xqd[0];
629     xq[1] = 0;
630   } else {
631     xq[0] = xqd[0];
632     xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
633   }
634 }
635 
// 256-entry lookup table indexed by x in [0, 255]. The listed values match
// round(256 * x / (x + 1)) for x = 1..254; index 0 is forced to 1 (see the
// note below) and the final index 255 holds 256 rather than 255 (presumably
// the saturated value for clamped inputs -- confirm at the use site).
const int32_t x_by_xplus1[256] = {
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in selfguided_restoration_internal() for why
  1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
658 
// Reciprocal lookup table with 25 listed entries: entry i matches
// round(4096 / (i + 1)), i.e. 1 / (i + 1) scaled by 2^12. The index is
// presumably (element count - 1) for a box of up to MAX_NELEM elements --
// confirm at the use site.
const int32_t one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
};
663 
calculate_intermediate_result(int32_t * dgd,int width,int height,int dgd_stride,int bit_depth,int sgr_params_idx,int radius_idx,int pass,int32_t * A,int32_t * B)664 static void calculate_intermediate_result(int32_t *dgd, int width, int height,
665                                           int dgd_stride, int bit_depth,
666                                           int sgr_params_idx, int radius_idx,
667                                           int pass, int32_t *A, int32_t *B) {
668   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
669   const int r = params->r[radius_idx];
670   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
671   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
672   // Adjusting the stride of A and B here appears to avoid bad cache effects,
673   // leading to a significant speed improvement.
674   // We also align the stride to a multiple of 16 bytes, for consistency
675   // with the SIMD version of this function.
676   int buf_stride = ((width_ext + 3) & ~3) + 16;
677   const int step = pass == 0 ? 1 : 2;
678   int i, j;
679 
680   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
681   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
682          "Need SGRPROJ_BORDER_* >= r+1");
683 
684   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
685          width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
686   boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
687          width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
688   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
689   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
690   // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
691   // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
692   for (i = -1; i < height + 1; i += step) {
693     for (j = -1; j < width + 1; ++j) {
694       const int k = i * buf_stride + j;
695       const int n = (2 * r + 1) * (2 * r + 1);
696 
697       // a < 2^16 * n < 2^22 regardless of bit depth
698       uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
699       // b < 2^8 * n < 2^14 regardless of bit depth
700       uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
701 
702       // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
703       // and p itself satisfies p < 2^14 * n^2 < 2^26.
704       // This bound on p is due to:
705       // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
706       //
707       // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
708       // This is an artefact of rounding, and can only happen if all pixels
709       // are (almost) identical, so in this case we saturate to p=0.
710       uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
711 
712       const uint32_t s = params->s[radius_idx];
713 
714       // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
715       // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
716       // (this holds even after accounting for the rounding in s)
717       const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
718 
719       // Note: We have to be quite careful about the value of A[k].
720       // This is used as a blend factor between individual pixel values and the
721       // local mean. So it logically has a range of [0, 256], including both
722       // endpoints.
723       //
724       // This is a pain for hardware, as we'd like something which can be stored
725       // in exactly 8 bits.
726       // Further, in the calculation of B[k] below, if z == 0 and r == 2,
727       // then A[k] "should be" 0. But then we can end up setting B[k] to a value
728       // slightly above 2^(8 + bit depth), due to rounding in the value of
729       // one_by_x[25-1].
730       //
731       // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
732       // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
733       // overflow), without significantly affecting the final result: z == 0
734       // implies that the image is essentially "flat", so the local mean and
735       // individual pixel values are very similar.
736       //
737       // Note that saturating on the other side, ie. requring A[k] <= 255,
738       // would be a bad idea, as that corresponds to the case where the image
739       // is very variable, when we want to preserve the local pixel value as
740       // much as possible.
741       A[k] = x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
742 
743       // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
744       // one_by_x[n - 1] = round(2^12 / n)
745       // => the product here is < 2^(20 + bit_depth) <= 2^32,
746       // and B[k] is set to a value < 2^(8 + bit depth)
747       // This holds even with the rounding in one_by_x and in the overall
748       // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
749       B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
750                                              (uint32_t)B[k] *
751                                              (uint32_t)one_by_x[n - 1],
752                                          SGRPROJ_RECIP_BITS);
753     }
754   }
755 }
756 
selfguided_restoration_fast_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)757 static void selfguided_restoration_fast_internal(
758     int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
759     int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
760   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
761   const int r = params->r[radius_idx];
762   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
763   // Adjusting the stride of A and B here appears to avoid bad cache effects,
764   // leading to a significant speed improvement.
765   // We also align the stride to a multiple of 16 bytes, for consistency
766   // with the SIMD version of this function.
767   int buf_stride = ((width_ext + 3) & ~3) + 16;
768   int32_t A_[RESTORATION_PROC_UNIT_PELS];
769   int32_t B_[RESTORATION_PROC_UNIT_PELS];
770   int32_t *A = A_;
771   int32_t *B = B_;
772   int i, j;
773   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
774                                 sgr_params_idx, radius_idx, 1, A, B);
775   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
776   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
777 
778   // Use the A[] and B[] arrays to calculate the filtered image
779   (void)r;
780   assert(r == 2);
781   for (i = 0; i < height; ++i) {
782     if (!(i & 1)) {  // even row
783       for (j = 0; j < width; ++j) {
784         const int k = i * buf_stride + j;
785         const int l = i * dgd_stride + j;
786         const int m = i * dst_stride + j;
787         const int nb = 5;
788         const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
789                           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
790                            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
791                               5;
792         const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
793                           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
794                            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
795                               5;
796         const int32_t v = a * dgd[l] + b;
797         dst[m] =
798             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
799       }
800     } else {  // odd row
801       for (j = 0; j < width; ++j) {
802         const int k = i * buf_stride + j;
803         const int l = i * dgd_stride + j;
804         const int m = i * dst_stride + j;
805         const int nb = 4;
806         const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
807         const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
808         const int32_t v = a * dgd[l] + b;
809         dst[m] =
810             ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
811       }
812     }
813   }
814 }
815 
selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)816 static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
817                                             int dgd_stride, int32_t *dst,
818                                             int dst_stride, int bit_depth,
819                                             int sgr_params_idx,
820                                             int radius_idx) {
821   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
822   // Adjusting the stride of A and B here appears to avoid bad cache effects,
823   // leading to a significant speed improvement.
824   // We also align the stride to a multiple of 16 bytes, for consistency
825   // with the SIMD version of this function.
826   int buf_stride = ((width_ext + 3) & ~3) + 16;
827   int32_t A_[RESTORATION_PROC_UNIT_PELS];
828   int32_t B_[RESTORATION_PROC_UNIT_PELS];
829   int32_t *A = A_;
830   int32_t *B = B_;
831   int i, j;
832   calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
833                                 sgr_params_idx, radius_idx, 0, A, B);
834   A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
835   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
836 
837   // Use the A[] and B[] arrays to calculate the filtered image
838   for (i = 0; i < height; ++i) {
839     for (j = 0; j < width; ++j) {
840       const int k = i * buf_stride + j;
841       const int l = i * dgd_stride + j;
842       const int m = i * dst_stride + j;
843       const int nb = 5;
844       const int32_t a =
845           (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
846               4 +
847           (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
848            A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
849               3;
850       const int32_t b =
851           (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
852               4 +
853           (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
854            B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
855               3;
856       const int32_t v = a * dgd[l] + b;
857       dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
858     }
859   }
860 }
861 
av1_selfguided_restoration_c(const uint8_t * dgd8,int width,int height,int dgd_stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)862 int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
863                                  int dgd_stride, int32_t *flt0, int32_t *flt1,
864                                  int flt_stride, int sgr_params_idx,
865                                  int bit_depth, int highbd) {
866   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
867   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
868   int32_t *dgd32 =
869       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
870 
871   if (highbd) {
872     const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
873     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
874       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
875         dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
876       }
877     }
878   } else {
879     for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
880       for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
881         dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
882       }
883     }
884   }
885 
886   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
887   // If params->r == 0 we skip the corresponding filter. We only allow one of
888   // the radii to be 0, as having both equal to 0 would be equivalent to
889   // skipping SGR entirely.
890   assert(!(params->r[0] == 0 && params->r[1] == 0));
891 
892   if (params->r[0] > 0)
893     selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
894                                          flt0, flt_stride, bit_depth,
895                                          sgr_params_idx, 0);
896   if (params->r[1] > 0)
897     selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
898                                     flt_stride, bit_depth, sgr_params_idx, 1);
899   return 0;
900 }
901 
apply_selfguided_restoration_c(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)902 void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
903                                     int stride, int eps, const int *xqd,
904                                     uint8_t *dst8, int dst_stride,
905                                     int32_t *tmpbuf, int bit_depth,
906                                     int highbd) {
907   int32_t *flt0 = tmpbuf;
908   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
909   assert(width * height <= RESTORATION_UNITPELS_MAX);
910 
911   const int ret = av1_selfguided_restoration_c(
912       dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
913   (void)ret;
914   assert(!ret);
915   const sgr_params_type *const params = &sgr_params[eps];
916   int xq[2];
917   decode_xq(xqd, xq, params);
918   for (int i = 0; i < height; ++i) {
919     for (int j = 0; j < width; ++j) {
920       const int k = i * width + j;
921       uint8_t *dst8ij = dst8 + i * dst_stride + j;
922       const uint8_t *dat8ij = dat8 + i * stride + j;
923 
924       const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
925       const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
926       int32_t v = u << SGRPROJ_PRJ_BITS;
927       // If params->r == 0 then we skipped the filtering in
928       // av1_selfguided_restoration_c, i.e. flt[k] == u
929       if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
930       if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
931       const int16_t w =
932           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
933 
934       const uint16_t out = clip_pixel_highbd(w, bit_depth);
935       if (highbd)
936         *CONVERT_TO_SHORTPTR(dst8ij) = out;
937       else
938         *dst8ij = (uint8_t)out;
939     }
940   }
941 }
942 
sgrproj_filter_stripe(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int32_t * tmpbuf,int bit_depth)943 static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
944                                   int stripe_width, int stripe_height,
945                                   int procunit_width, const uint8_t *src,
946                                   int src_stride, uint8_t *dst, int dst_stride,
947                                   int32_t *tmpbuf, int bit_depth) {
948   (void)bit_depth;
949   assert(bit_depth == 8);
950 
951   for (int j = 0; j < stripe_width; j += procunit_width) {
952     int w = AOMMIN(procunit_width, stripe_width - j);
953     apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
954                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
955                                  dst + j, dst_stride, tmpbuf, bit_depth, 0);
956   }
957 }
958 
wiener_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)959 static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
960                                         int stripe_width, int stripe_height,
961                                         int procunit_width, const uint8_t *src8,
962                                         int src_stride, uint8_t *dst8,
963                                         int dst_stride, int32_t *tmpbuf,
964                                         int bit_depth) {
965   (void)tmpbuf;
966   const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
967 
968   for (int j = 0; j < stripe_width; j += procunit_width) {
969     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
970     const uint8_t *src8_p = src8 + j;
971     uint8_t *dst8_p = dst8 + j;
972     av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
973                                        rui->wiener_info.hfilter, 16,
974                                        rui->wiener_info.vfilter, 16, w,
975                                        stripe_height, &conv_params, bit_depth);
976   }
977 }
978 
sgrproj_filter_stripe_highbd(const RestorationUnitInfo * rui,int stripe_width,int stripe_height,int procunit_width,const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth)979 static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
980                                          int stripe_width, int stripe_height,
981                                          int procunit_width,
982                                          const uint8_t *src8, int src_stride,
983                                          uint8_t *dst8, int dst_stride,
984                                          int32_t *tmpbuf, int bit_depth) {
985   for (int j = 0; j < stripe_width; j += procunit_width) {
986     int w = AOMMIN(procunit_width, stripe_width - j);
987     apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
988                                  rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
989                                  dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
990   }
991 }
992 
// Common signature of the per-stripe filter routines: filter a stripe of
// stripe_width x stripe_height pixels from src to dst, working in chunks of
// procunit_width columns. tmpbuf is scratch space (the Wiener high-bit-depth
// variant ignores it).
typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
                                  int stripe_width, int stripe_height,
                                  int procunit_width, const uint8_t *src,
                                  int src_stride, uint8_t *dst, int dst_stride,
                                  int32_t *tmpbuf, int bit_depth);

#define NUM_STRIPE_FILTERS 4

// Dispatch table indexed by 2 * highbd + (restoration type is SGRPROJ), as
// computed in av1_loop_restoration_filter_unit():
//   0: Wiener, low bit depth      1: self-guided, low bit depth
//   2: Wiener, high bit depth     3: self-guided, high bit depth
static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
  sgrproj_filter_stripe_highbd
};
1005 
1006 // Filter one restoration unit
av1_loop_restoration_filter_unit(const RestorationTileLimits * limits,const RestorationUnitInfo * rui,const RestorationStripeBoundaries * rsb,RestorationLineBuffers * rlbs,const AV1PixelRect * tile_rect,int tile_stripe0,int ss_x,int ss_y,int highbd,int bit_depth,uint8_t * data8,int stride,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int optimized_lr)1007 void av1_loop_restoration_filter_unit(
1008     const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
1009     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
1010     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
1011     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
1012     int dst_stride, int32_t *tmpbuf, int optimized_lr) {
1013   RestorationType unit_rtype = rui->restoration_type;
1014 
1015   int unit_h = limits->v_end - limits->v_start;
1016   int unit_w = limits->h_end - limits->h_start;
1017   uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
1018   uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
1019 
1020   if (unit_rtype == RESTORE_NONE) {
1021     copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
1022     return;
1023   }
1024 
1025   const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
1026   assert(filter_idx < NUM_STRIPE_FILTERS);
1027   const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
1028 
1029   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
1030 
1031   // Convolve the whole tile one stripe at a time
1032   RestorationTileLimits remaining_stripes = *limits;
1033   int i = 0;
1034   while (i < unit_h) {
1035     int copy_above, copy_below;
1036     remaining_stripes.v_start = limits->v_start + i;
1037 
1038     get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
1039                              &copy_below);
1040 
1041     const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
1042     const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
1043 
1044     // Work out where this stripe's boundaries are within
1045     // rsb->stripe_boundary_{above,below}
1046     const int tile_stripe =
1047         (remaining_stripes.v_start - tile_rect->top + runit_offset) /
1048         full_stripe_height;
1049     const int frame_stripe = tile_stripe0 + tile_stripe;
1050     const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
1051 
1052     // Calculate this stripe's height, based on two rules:
1053     // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
1054     // * We can't extend past the end of the current restoration unit
1055     const int nominal_stripe_height =
1056         full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
1057     const int h = AOMMIN(nominal_stripe_height,
1058                          remaining_stripes.v_end - remaining_stripes.v_start);
1059 
1060     setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
1061                                      h, data8, stride, rlbs, copy_above,
1062                                      copy_below, optimized_lr);
1063 
1064     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
1065                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
1066 
1067     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
1068                                        data8, stride, copy_above, copy_below,
1069                                        optimized_lr);
1070 
1071     i += h;
1072   }
1073 }
1074 
// Records in the FilterFrameCtxt behind priv the index of the first stripe of
// tile row tile_row: 0 for the first row, otherwise the end stripe stored for
// the previous row in cm->rst_end_stripe. tile_col is unused.
static void filter_frame_on_tile(int tile_row, int tile_col, void *priv,
                                 AV1_COMMON *cm) {
  FilterFrameCtxt *const ctxt = (FilterFrameCtxt *)priv;
  (void)tile_col;

  if (tile_row == 0) {
    ctxt->tile_stripe0 = 0;
  } else {
    ctxt->tile_stripe0 = cm->rst_end_stripe[tile_row - 1];
  }
}
1081 
// Per-unit visitor used when filtering a frame: unpacks the FilterFrameCtxt
// behind priv and filters restoration unit rest_unit_idx with
// av1_loop_restoration_filter_unit().
static void filter_frame_on_unit(const RestorationTileLimits *limits,
                                 const AV1PixelRect *tile_rect,
                                 int rest_unit_idx, void *priv, int32_t *tmpbuf,
                                 RestorationLineBuffers *rlbs) {
  const FilterFrameCtxt *const plane_ctxt = (FilterFrameCtxt *)priv;
  const RestorationInfo *const rsi = plane_ctxt->rsi;
  const RestorationUnitInfo *const rui = &rsi->unit_info[rest_unit_idx];

  av1_loop_restoration_filter_unit(
      limits, rui, &rsi->boundaries, rlbs, tile_rect, plane_ctxt->tile_stripe0,
      plane_ctxt->ss_x, plane_ctxt->ss_y, plane_ctxt->highbd,
      plane_ctxt->bit_depth, plane_ctxt->data8, plane_ctxt->data_stride,
      plane_ctxt->dst8, plane_ctxt->dst_stride, tmpbuf, rsi->optimized_lr);
}
1095 
// Prepares lr_ctxt for filtering `frame`.
//
// Points lr_ctxt->dst at cm->rst_frame and (re)allocates it to the frame's
// cropped luma size, reporting AOM_CODEC_MEM_ERROR through
// aom_internal_error() on failure. Then, for every plane whose frame-level
// restoration type is not RESTORE_NONE, extends the plane's borders by
// RESTORATION_BORDER and fills in its FilterFrameCtxt. optimized_lr is stored
// in every plane's RestorationInfo, including planes that are skipped.
void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
                                            YV12_BUFFER_CONFIG *frame,
                                            AV1_COMMON *cm, int optimized_lr,
                                            int num_planes) {
  const SequenceHeader *const seq_params = &cm->seq_params;
  const int highbd = seq_params->use_highbitdepth;
  const int bit_depth = seq_params->bit_depth;

  lr_ctxt->dst = &cm->rst_frame;
  if (aom_realloc_frame_buffer(lr_ctxt->dst, frame->crop_widths[0],
                               frame->crop_heights[0],
                               seq_params->subsampling_x,
                               seq_params->subsampling_y, highbd,
                               AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
                               NULL, NULL) < 0) {
    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                       "Failed to allocate restoration dst buffer");
  }

  lr_ctxt->frame = frame;
  lr_ctxt->on_rest_unit = filter_frame_on_unit;

  for (int plane = 0; plane < num_planes; ++plane) {
    RestorationInfo *const rsi = &cm->rst_info[plane];
    rsi->optimized_lr = optimized_lr;

    // Planes without restoration need no further setup.
    if (rsi->frame_restoration_type == RESTORE_NONE) continue;

    // Both chroma planes share index 1 of the per-plane-type arrays.
    const int is_uv = plane > 0;

    extend_frame(frame->buffers[plane], frame->crop_widths[is_uv],
                 frame->crop_heights[is_uv], frame->strides[is_uv],
                 RESTORATION_BORDER, RESTORATION_BORDER, highbd);

    FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
    plane_ctxt->rsi = rsi;
    plane_ctxt->highbd = highbd;
    plane_ctxt->bit_depth = bit_depth;
    plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
    plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
    plane_ctxt->data8 = frame->buffers[plane];
    plane_ctxt->data_stride = frame->strides[is_uv];
    plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
    plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
    plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
    // Sets plane_ctxt->tile_stripe0 for the single LR tile.
    filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, plane_ctxt, cm);
  }
}
1147 
// Copies the restored planes from loop_rest_ctxt->dst into
// loop_rest_ctxt->frame. Only planes whose frame-level restoration type is not
// RESTORE_NONE are copied, each over the tile rectangle stored in its context.
void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
                                      AV1_COMMON *cm, int num_planes) {
  typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
                           YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
                           int vstart, int vend);
  // One partial-copy routine per plane, in Y, U, V order.
  static const copy_fun copy_funs[3] = {
    aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
  };

  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      const AV1PixelRect *const rect = &loop_rest_ctxt->ctxt[plane].tile_rect;
      copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, rect->left,
                       rect->right, rect->top, rect->bottom);
    }
  }
}
1164 
// Runs lr_ctxt->on_rest_unit over every restoration unit of every plane whose
// frame-level restoration type is not RESTORE_NONE, using cm->rst_tmpbuf and
// cm->rlbs as working storage.
static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
                                        int num_planes) {
  for (int plane = 0; plane < num_planes; ++plane) {
    if (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE) {
      FilterFrameCtxt *const plane_ctxt = &lr_ctxt->ctxt[plane];
      av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
                                     plane_ctxt, &plane_ctxt->tile_rect,
                                     cm->rst_tmpbuf, cm->rlbs);
    }
  }
}
1179 
// Applies loop restoration to `frame` in three steps: set up the context,
// filter every restoration unit into the context's destination buffer, then
// copy the restored planes back into the frame. lr_ctxt must point to an
// AV1LrStruct. Asserts that the frame is not entirely lossless.
void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
                                       AV1_COMMON *cm, int optimized_lr,
                                       void *lr_ctxt) {
  assert(!cm->all_lossless);

  AV1LrStruct *const lr = (AV1LrStruct *)lr_ctxt;
  const int num_planes = av1_num_planes(cm);

  av1_loop_restoration_filter_frame_init(lr, frame, cm, optimized_lr,
                                         num_planes);
  foreach_rest_unit_in_planes(lr, cm, num_planes);
  av1_loop_restoration_copy_planes(lr, cm, num_planes);
}
1195 
// Visits every restoration unit in one row of units, left to right.
//
// Units are unit_size columns wide, except that the last unit absorbs the
// remainder whenever fewer than 1.5 * unit_size columns are left. For each
// unit the horizontal bounds are written into `limits` (the vertical bounds
// are expected to be set by the caller), the read-sync callback is invoked,
// on_rest_unit is called, and finally the write-sync callback is invoked.
//
//   row_number, unit_idx0, hunits_per_tile - give the index passed to
//       on_rest_unit: unit_idx0 + row_number * hunits_per_tile + column.
//   vunits_per_tile - number of unit rows; decides whether the second
//       read-sync is issued.
//   plane, lr_sync  - forwarded to the sync callbacks.
//   priv, tmpbuf, rlbs - forwarded to on_rest_unit.
void av1_foreach_rest_unit_in_row(
    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
    struct AV1LrSyncData *const lr_sync) {
  const int tile_w = tile_rect->right - tile_rect->left;
  const int ext_size = unit_size * 3 / 2;
  const int row_idx0 = unit_idx0 + row_number * hunits_per_tile;
  const int has_row_below = (row_number + 1) < vunits_per_tile;

  int unit_col = 0;
  for (int x0 = 0; x0 < tile_w; ++unit_col) {
    // Take everything that is left when it is less than 1.5 units wide.
    const int remaining_w = tile_w - x0;
    int w = unit_size;
    if (remaining_w < ext_size) w = remaining_w;

    const int h_start = tile_rect->left + x0;
    limits->h_start = h_start;
    limits->h_end = h_start + w;
    assert(limits->h_end <= tile_rect->right);

    // No sync for even numbered rows.
    // For odd numbered rows, loop restoration of the current block requires
    // the LR of the top-right and bottom-right blocks to be completed.

    // top-right sync
    on_sync_read(lr_sync, row_number, unit_col, plane);
    if (has_row_below) {
      // bottom-right sync
      on_sync_read(lr_sync, row_number + 2, unit_col, plane);
    }

    on_rest_unit(limits, tile_rect, row_idx0 + unit_col, priv, tmpbuf, rlbs);

    on_sync_write(lr_sync, row_number, unit_col, hunits_per_tile, plane);

    x0 += w;
  }
}
1234 
// No-op read-sync callback: does nothing with any of its arguments.
// foreach_rest_unit_in_tile() passes it to av1_foreach_rest_unit_in_row()
// together with a NULL lr_sync.
void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
  // Every parameter is deliberately unused.
  (void)plane;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1241 
// No-op write-sync callback: does nothing with any of its arguments.
// foreach_rest_unit_in_tile() passes it to av1_foreach_rest_unit_in_row()
// together with a NULL lr_sync.
void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
                             const int sb_cols, int plane) {
  // Every parameter is deliberately unused.
  (void)plane;
  (void)sb_cols;
  (void)c;
  (void)r;
  (void)lr_sync;
}
1250 
// Visits every restoration unit of one tile, one row of units at a time, with
// the no-op sync callbacks.
//
// Rows are unit_size pixels tall, except that the last row absorbs the
// remainder whenever fewer than 1.5 * unit_size rows are left. Each row's
// vertical bounds are then shifted up by RESTORATION_UNIT_OFFSET >> ss_y so
// that they line up with the processing stripes; the top is clamped to the
// tile's top edge and the bottom of the last row is left on the tile's bottom
// edge.
static void foreach_rest_unit_in_tile(
    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
  const int tile_h = tile_rect->bottom - tile_rect->top;
  const int ext_size = unit_size * 3 / 2;
  const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;

  // Index of the tile's first unit in the per-plane unit array.
  const int unit_idx0 = (tile_col + tile_row * tile_cols) * units_per_tile;

  int unit_row = 0;
  for (int y0 = 0; y0 < tile_h; ++unit_row) {
    // Take everything that is left when it is less than 1.5 units tall.
    const int remaining_h = tile_h - y0;
    int h = unit_size;
    if (remaining_h < ext_size) h = remaining_h;

    const int unit_top = tile_rect->top + y0;
    const int unit_bottom = unit_top + h;
    assert(unit_bottom <= tile_rect->bottom);

    // Offset the row upwards to align with the restoration processing stripe.
    RestorationTileLimits limits;
    limits.v_start = AOMMAX(tile_rect->top, unit_top - voffset);
    limits.v_end =
        (unit_bottom < tile_rect->bottom) ? unit_bottom - voffset : unit_bottom;

    av1_foreach_rest_unit_in_row(
        &limits, tile_rect, on_rest_unit, unit_row, unit_size, unit_idx0,
        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);

    y0 += h;
  }
}
1285 
// Calls on_rest_unit for every restoration unit of `plane`. The plane is
// treated as the single tile at (LR_TILE_ROW, LR_TILE_COL) out of
// LR_TILE_COLS columns, bounded by `tile_rect`. Unit counts and unit size
// come from cm->rst_info[plane]. priv, tmpbuf and rlbs are forwarded
// untouched to the visitor machinery.
void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
                                    rest_unit_visitor_t on_rest_unit,
                                    void *priv, AV1PixelRect *tile_rect,
                                    int32_t *tmpbuf,
                                    RestorationLineBuffers *rlbs) {
  // Only planes after the first take the sequence's vertical subsampling.
  const int ss_y = (plane > 0 && cm->seq_params.subsampling_y) ? 1 : 0;

  const RestorationInfo *const plane_info = &cm->rst_info[plane];
  const int units_across = plane_info->horz_units_per_tile;
  const int units_down = plane_info->vert_units_per_tile;
  const int units_total = plane_info->units_per_tile;
  const int unit_size = plane_info->restoration_unit_size;

  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
                            units_across, units_down, units_total, unit_size,
                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
}
1301 
// Computes the half-open range of restoration units [*rcol0, *rcol1) x
// [*rrow0, *rrow1) whose top-left corner lies inside the superblock at
// (mi_row, mi_col) on `plane`.
//
// Returns 0 without touching the outputs when bsize is not the sequence's
// superblock size or when the plane's frame_restoration_type is RESTORE_NONE.
// Otherwise writes all four outputs and returns nonzero iff the range is
// non-empty in both directions.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1) {
  assert(rcol0 && rcol1 && rrow0 && rrow1);

  if (bsize != cm->seq_params.sb_size) return 0;
  if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;

  assert(!cm->all_lossless);

  const int is_chroma = plane > 0;
  const int ss_x = is_chroma && cm->seq_params.subsampling_x;
  const int ss_y = is_chroma && cm->seq_params.subsampling_y;

  // The whole frame is treated as one tile whose origin is mi (0, 0), so the
  // superblock's mi coordinates are already tile-relative.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_chroma);
  const int frame_w = frame_rect.right - frame_rect.left;
  const int frame_h = frame_rect.bottom - frame_rect.top;

  const int sb_row_begin = mi_row;
  const int sb_col_begin = mi_col;
  const int sb_row_end = sb_row_begin + mi_size_high[bsize];
  const int sb_col_end = sb_col_begin + mi_size_wide[bsize];

  const int unit_size = cm->rst_info[plane].restoration_unit_size;

  // Number of restoration units that actually exist in each direction; the
  // ranges below are clamped to these counts.
  const int unit_cols = av1_lr_count_units_in_tile(unit_size, frame_w);
  const int unit_rows = av1_lr_count_units_in_tile(unit_size, frame_h);

  // Convert mi offsets into restoration-unit indices as
  //   index = ceil(mi * scale / divisor).
  // Vertically this is mi * (MI_SIZE >> ss_y) / unit_size. Horizontally,
  // restoration units live in the superres-upscaled domain, so with D the
  // superres denominator and N the superres numerator the upscaled offset is
  //   u = D * MI_SIZE * m / N,
  // which puts an extra factor D in the scale and N in the divisor.
  int scale_x = MI_SIZE >> ss_x;
  int divisor_x = unit_size;
  if (av1_superres_scaled(cm)) {
    scale_x *= cm->superres_scale_denominator;
    divisor_x *= SCALE_NUMERATOR;
  }
  const int scale_y = MI_SIZE >> ss_y;
  const int divisor_y = unit_size;

  // Adding (divisor - 1) before dividing rounds the quotient up.
  const int round_x = divisor_x - 1;
  const int round_y = divisor_y - 1;

  // First unit column/row that does not start left of/above the superblock
  // (a superblock starting at unit column 10.1 owns units from column 11 on).
  *rcol0 = (sb_col_begin * scale_x + round_x) / divisor_x;
  *rrow0 = (sb_row_begin * scale_y + round_y) / divisor_y;

  // Same computation for the superblock to the right/below, which yields the
  // exclusive end of the range. At the right or bottom edge that unit may not
  // exist, hence the clamp.
  const int col_end = (sb_col_end * scale_x + round_x) / divisor_x;
  const int row_end = (sb_row_end * scale_y + round_y) / divisor_y;
  *rcol1 = AOMMIN(col_end, unit_cols);
  *rrow1 = AOMMIN(row_end, unit_rows);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1376 
// Pads `height` lines horizontally by replicating their edge samples.
//
// For each line, the `extend` samples immediately before the line are set to
// the line's first sample and the `extend` samples immediately after it are
// set to its last sample (index width - 1). `buf` points at the first sample
// of the first line and `stride` is the byte distance between lines. With
// use_highbitdepth set, samples are accessed as uint16_t; otherwise as bytes.
static void extend_lines(uint8_t *buf, int width, int height, int stride,
                         int extend, int use_highbitdepth) {
  uint8_t *line = buf;
  for (int y = 0; y < height; ++y, line += stride) {
    if (!use_highbitdepth) {
      memset(line - extend, line[0], extend);
      memset(line + width, line[width - 1], extend);
      continue;
    }
    // High bit depth: same padding, performed on 16-bit samples.
    uint16_t *line16 = (uint16_t *)line;
    aom_memset16(line16 - extend, line16[0], extend);
    aom_memset16(line16 + width, line16[width - 1], extend);
  }
}
1392 
// Saves up to RESTORATION_CTX_VERT lines of `plane`, starting at line `row` of
// `frame`, into the boundary buffer slot belonging to `stripe`.
//
//  - is_above selects boundaries->stripe_boundary_above (nonzero) or
//    boundaries->stripe_boundary_below (zero) as the destination.
//  - When superres is active the lines are upscaled on the way in via
//    av1_upscale_normative_rows(); otherwise they are copied verbatim.
//  - If only one source line is available, it is duplicated into the second
//    context line.
//  - Finally both context lines are padded left and right by
//    RESTORATION_EXTRA_HORZ samples with extend_lines().
static void save_deblock_boundary_lines(
    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
    int stripe, int use_highbd, int is_above,
    RestorationStripeBoundaries *boundaries) {
  const int is_uv = plane > 0;
  // NOTE(review): REAL_PTR presumably yields a directly addressable pointer to
  // the plane's samples for both bit depths -- confirm against its definition.
  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
  // Strides are shifted by use_highbd so that they can be applied to uint8_t
  // pointers (extend_lines() treats high-bit-depth samples as uint16_t).
  const int src_stride = frame->strides[is_uv] << use_highbd;
  const uint8_t *src_rows = src_buf + row * src_stride;

  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
                               : boundaries->stripe_boundary_below;
  // Skip the left padding area; extend_lines() fills it in at the end.
  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
  // Each stripe owns RESTORATION_CTX_VERT consecutive lines of the buffer.
  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;

  // There is a rare case in which a processing stripe can end 1px above the
  // crop border. In this case, we do want to use deblocked pixels from below
  // the stripe (hence why we ended up in this function), but instead of
  // fetching 2 "below" rows we need to fetch one and duplicate it.
  // This is equivalent to clamping the sample locations against the crop border
  const int lines_to_save =
      AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
  assert(lines_to_save == 1 || lines_to_save == 2);

  int upscaled_width;
  int line_bytes;
  if (av1_superres_scaled(cm)) {
    const int ss_x = is_uv && cm->seq_params.subsampling_x;
    // Width of this plane after superres upscaling, rounded up for subsampled
    // planes.
    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
    line_bytes = upscaled_width << use_highbd;
    // Unlike the memcpy path below, the upscaler is given the unshifted
    // strides and, for high bit depth, CONVERT_TO_BYTEPTR-wrapped pointers.
    // NOTE(review): presumably it expects strides in samples and the
    // codebase's high-bit-depth pointer convention -- confirm.
    if (use_highbd)
      av1_upscale_normative_rows(
          cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
          CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
          plane, lines_to_save);
    else
      av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
                                 boundaries->stripe_boundary_stride, plane,
                                 lines_to_save);
  } else {
    // No superres: copy the cropped width of each line unchanged.
    upscaled_width = frame->crop_widths[is_uv];
    line_bytes = upscaled_width << use_highbd;
    for (int i = 0; i < lines_to_save; i++) {
      memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
             line_bytes);
    }
  }
  // If we only saved one line, then copy it into the second line buffer
  if (lines_to_save == 1)
    memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);

  // Replicate the first/last sample of every context line into the
  // RESTORATION_EXTRA_HORZ padding samples on either side.
  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1447 
// Fills the boundary context of `stripe` with copies of a single line of
// `frame`.
//
// Line `row` of `plane` is copied into every one of the RESTORATION_CTX_VERT
// context lines of the stripe's slot in boundaries->stripe_boundary_above
// (is_above nonzero) or boundaries->stripe_boundary_below (is_above zero).
// Repeating one line effectively extends the outermost row of CDEF output into
// a border, instead of using deblocked pixels from above/below. The context
// lines are then padded left and right by RESTORATION_EXTRA_HORZ samples.
//
// By the time this runs superres has already been applied to `frame`, so no
// upscaling is needed here: the line is taken as-is at the upscaled width.
static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                     const AV1_COMMON *cm, int plane, int row,
                                     int stripe, int use_highbd, int is_above,
                                     RestorationStripeBoundaries *boundaries) {
  const int is_chroma = plane > 0;
  const int ss_x = is_chroma && cm->seq_params.subsampling_x;

  // Source: the one line that gets replicated.
  const uint8_t *plane_base = REAL_PTR(use_highbd, frame->buffers[plane]);
  const int src_stride_bytes = frame->strides[is_chroma] << use_highbd;
  const uint8_t *src_line = plane_base + row * src_stride_bytes;

  // Destination: this stripe's context lines, just past the left padding.
  uint8_t *ctx_buf = boundaries->stripe_boundary_below;
  if (is_above) ctx_buf = boundaries->stripe_boundary_above;
  const int ctx_stride_bytes = boundaries->stripe_boundary_stride << use_highbd;
  uint8_t *ctx_lines = ctx_buf + (RESTORATION_EXTRA_HORZ << use_highbd) +
                       RESTORATION_CTX_VERT * stripe * ctx_stride_bytes;

  // Width of the plane: the superres-upscaled width (rounded up for
  // subsampled planes) when superres is active, the cropped width otherwise.
  int line_width;
  if (av1_superres_scaled(cm)) {
    line_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
  } else {
    line_width = frame->crop_widths[is_chroma];
  }
  const int bytes_per_line = line_width << use_highbd;

  for (int line = 0; line < RESTORATION_CTX_VERT; ++line) {
    memcpy(ctx_lines + line * ctx_stride_bytes, src_line, bytes_per_line);
  }

  extend_lines(ctx_lines, line_width, RESTORATION_CTX_VERT, ctx_stride_bytes,
               RESTORATION_EXTRA_HORZ, use_highbd);
}
1482 
// Saves the boundary context of every processing stripe of `plane`.
//
// The plane (treated as one tile covering the whole frame) is divided into
// stripes of RESTORATION_PROC_UNIT_SIZE >> ss_y lines, shifted up by
// RESTORATION_UNIT_OFFSET >> ss_y. For each stripe boundary exactly one kind
// of context is stored in cm->rst_info[plane].boundaries:
//  - deblocked lines from just outside the stripe, for boundaries inside the
//    frame; these are saved when after_cdef is zero;
//  - copies of the stripe's own outermost line, for the top boundary of the
//    first stripe and for a bottom boundary that reaches the plane's height;
//    these are saved when after_cdef is nonzero.
static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                         int use_highbd, int plane,
                                         AV1_COMMON *cm, int after_cdef) {
  const int is_chroma = plane > 0;
  const int ss_y = is_chroma && cm->seq_params.subsampling_y;
  const int lines_per_stripe = RESTORATION_PROC_UNIT_SIZE >> ss_y;
  const int stripe_shift = RESTORATION_UNIT_OFFSET >> ss_y;

  // Rectangle of the whole frame on this plane.
  const AV1PixelRect frame_rect = av1_whole_frame_rect(cm, is_chroma);

  RestorationStripeBoundaries *const ctx = &cm->rst_info[plane].boundaries;

  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);

  int stripe = 0;
  while (1) {
    // First line of this stripe; the first stripe is clamped to the top.
    const int first_rel = AOMMAX(0, stripe * lines_per_stripe - stripe_shift);
    const int stripe_top = frame_rect.top + first_rel;
    if (stripe_top >= frame_rect.bottom) break;

    // One past the last line of this stripe, clamped to the bottom.
    const int end_rel = (stripe + 1) * lines_per_stripe - stripe_shift;
    const int stripe_end = AOMMIN(frame_rect.top + end_rel, frame_rect.bottom);

    // Deblocked context is used wherever a neighbouring stripe exists;
    // CDEF context is used only at the top and bottom of the frame.
    const int deblock_above = (stripe > 0);
    const int deblock_below = (stripe_end < plane_height);

    if (after_cdef) {
      // CDEF context is needed exactly where deblocked context was not
      // saved. It is taken from the outermost line inside the stripe rather
      // than from data outside it.
      if (!deblock_above) {
        save_cdef_boundary_lines(frame, cm, plane, stripe_top, stripe,
                                 use_highbd, 1, ctx);
      }
      if (!deblock_below) {
        save_cdef_boundary_lines(frame, cm, plane, stripe_end - 1, stripe,
                                 use_highbd, 0, ctx);
      }
    } else {
      // Deblocked context: the lines directly above and directly below.
      if (deblock_above) {
        save_deblock_boundary_lines(frame, cm, plane,
                                    stripe_top - RESTORATION_CTX_VERT, stripe,
                                    use_highbd, 1, ctx);
      }
      if (deblock_below) {
        save_deblock_boundary_lines(frame, cm, plane, stripe_end, stripe,
                                    use_highbd, 0, ctx);
      }
    }

    ++stripe;
  }
}
1545 
1546 // For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
1547 // lines to be used as boundary in the loop restoration process. The
1548 // lines are saved in rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm,int after_cdef)1549 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
1550                                               AV1_COMMON *cm, int after_cdef) {
1551   const int num_planes = av1_num_planes(cm);
1552   const int use_highbd = cm->seq_params.use_highbitdepth;
1553   for (int p = 0; p < num_planes; ++p) {
1554     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
1555   }
1556 }
1557