1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  *
11  */
12 
13 #include <math.h>
14 
15 #include "./aom_config.h"
16 #include "./aom_dsp_rtcd.h"
17 #include "./aom_scale_rtcd.h"
18 #include "av1/common/onyxc_int.h"
19 #include "av1/common/restoration.h"
20 #include "aom_dsp/aom_dsp_common.h"
21 #include "aom_mem/aom_mem.h"
22 
23 #include "aom_ports/mem.h"
24 
// Parameter sets for the sgrproj restoration filter: SGRPROJ_PARAMS rows of
// four values each. The meaning of the four columns depends on the build
// configuration and is given by the comment heading each variant below.
const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
#if USE_HIGHPASS_IN_SGRPROJ
  // corner, edge, r2, eps2
  { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
  { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
  { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
  { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
#else
// r1, eps1, r2, eps2
#if MAX_RADIUS == 2
  // Variant restricted to radii of at most 2 (every r1 below is 2).
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 },
  { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 },
#else
  // Variant whose last five rows use r1 == 3.
  { 2, 12, 1, 4 },  { 2, 15, 1, 6 },  { 2, 18, 1, 8 },  { 2, 20, 1, 9 },
  { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
  { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
  { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
#endif  // MAX_RADIUS == 2
#endif
};
47 
// Function-pointer type for a restoration routine: it receives an input
// buffer (data8) with its width, height and stride, the restoration state
// (rst), and an output buffer (dst8) with its own stride.
typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
                                  int stride, RestorationInternal *rst,
                                  uint8_t *dst8, int dst_stride);
#if CONFIG_HIGHBITDEPTH
// Same shape as restore_func_type, with an additional bit_depth argument.
// Only available when CONFIG_HIGHBITDEPTH is enabled.
typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
                                         int stride, RestorationInternal *rst,
                                         int bit_depth, uint8_t *dst8,
                                         int dst_stride);
#endif  // CONFIG_HIGHBITDEPTH
57 
// (Re)allocates the per-tile arrays of |rst_info| for a frame of the given
// dimensions. Any previously held arrays are freed first. The tile count is
// obtained from av1_get_rest_ntiles() using rst_info->restoration_tilesize.
// Only the wiener_info array is zero-initialised; the other two arrays are
// left uninitialised. Allocation results are checked via CHECK_MEM_ERROR.
// Returns the number of restoration tiles.
int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
                                 int width, int height) {
  const int tile_count =
      av1_get_rest_ntiles(width, height, rst_info->restoration_tilesize, NULL,
                          NULL, NULL, NULL);
  const size_t type_bytes = sizeof(*rst_info->restoration_type) * tile_count;
  const size_t wiener_bytes = sizeof(*rst_info->wiener_info) * tile_count;
  const size_t sgrproj_bytes = sizeof(*rst_info->sgrproj_info) * tile_count;

  // Per-tile restoration type.
  aom_free(rst_info->restoration_type);
  CHECK_MEM_ERROR(cm, rst_info->restoration_type,
                  (RestorationType *)aom_malloc(type_bytes));

  // Per-tile Wiener parameters (16-byte aligned, cleared to zero).
  aom_free(rst_info->wiener_info);
  CHECK_MEM_ERROR(cm, rst_info->wiener_info,
                  (WienerInfo *)aom_memalign(16, wiener_bytes));
  memset(rst_info->wiener_info, 0, wiener_bytes);

  // Per-tile sgrproj parameters.
  aom_free(rst_info->sgrproj_info);
  CHECK_MEM_ERROR(cm, rst_info->sgrproj_info,
                  (SgrprojInfo *)aom_malloc(sgrproj_bytes));

  return tile_count;
}
77 
// Releases the three per-tile arrays owned by |rst_info| and resets the
// pointers to NULL so that a later free or re-allocation is safe.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  aom_free(rst_info->restoration_type);
  aom_free(rst_info->wiener_info);
  aom_free(rst_info->sgrproj_info);

  rst_info->restoration_type = NULL;
  rst_info->wiener_info = NULL;
  rst_info->sgrproj_info = NULL;
}
86 
87 // TODO(debargha): This table can be substantially reduced since only a few
88 // values are actually used.
89 int sgrproj_mtable[MAX_EPS][MAX_NELEM];
90 
GenSgrprojVtable()91 static void GenSgrprojVtable() {
92   int e, n;
93   for (e = 1; e <= MAX_EPS; ++e)
94     for (n = 1; n <= MAX_NELEM; ++n) {
95       const int n2e = n * n * e;
96       sgrproj_mtable[e - 1][n - 1] =
97           (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
98     }
99 }
100 
// One-time precalculation for loop restoration: generates the sgrproj
// multiplier table via GenSgrprojVtable().
void av1_loop_restoration_precal(void) { GenSgrprojVtable(); }
102 
// Initialises the restoration state |rst|; currently this only records the
// keyframe flag |kf|.
static void loop_restoration_init(RestorationInternal *rst, int kf) {
  rst->keyframe = kf;
}
106 
// Pads an 8-bit image in place by edge replication.
//   data         points at the top-left pixel of the width x height image.
//   stride       distance in bytes between consecutive rows.
//   border_horz  number of columns written to the left and right of each row.
//   border_vert  number of rows written above and below the image.
// The caller must provide writable memory for the border region around
// |data|.
void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert) {
  const int padded_width = width + 2 * border_horz;

  // Step 1: replicate the first and last pixel of every row sideways.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Step 2: replicate the (already padded) first row upwards, starting with
  // the row furthest from the image.
  uint8_t *const padded_origin = data - border_horz;
  for (int dist = border_vert; dist >= 1; --dist) {
    memcpy(padded_origin - dist * stride, padded_origin, padded_width);
  }

  // Step 3: replicate the (already padded) last row downwards.
  for (int row = height; row < height + border_vert; ++row) {
    memcpy(padded_origin + row * stride, padded_origin + (height - 1) * stride,
           padded_width);
  }
}
125 
126 #if CONFIG_STRIPED_LOOP_RESTORATION
127 
128 // This function setup a processing stripe by replacing the vertical
129 // stripe boundary (2 lines above and 2 lines below) by data coming
130 // from the above/below buffers. Before doing so the original
131 // frame data is saved into a temporary buffer, such that it
132 // can be restored by the restore_processing_stripe_boundary
133 // function after the filtering of the processing stripe.
134 // Returns the height of the processing stripe
setup_processing_stripe_boundary(int y0,int v_end,int h_start,int h_end,uint8_t * data,int stride,RestorationInternal * rst,int use_highbd)135 static int setup_processing_stripe_boundary(int y0, int v_end, int h_start,
136                                             int h_end, uint8_t *data,
137                                             int stride,
138                                             RestorationInternal *rst,
139                                             int use_highbd) {
140   int y, y_stripe_topmost, stripe_index, i;
141   int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y;
142   int stripe_height = rst->rsi->procunit_height;
143   int comp = rst->component;
144   uint8_t *boundary_above_buf = rst->stripe_boundary_above[comp];
145   uint8_t *boundary_below_buf = rst->stripe_boundary_below[comp];
146   int boundary_stride = rst->stripe_boundary_stride[comp];
147   int x0 = h_start - RESTORATION_EXTRA_HORZ;
148   int x1 = h_end + RESTORATION_EXTRA_HORZ;
149 
150   stripe_index = (y0 + tile_offset) / stripe_height;
151   y_stripe_topmost = stripe_index * stripe_height - tile_offset;
152   boundary_above_buf +=
153       ((stripe_index - 1) * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
154       << use_highbd;
155   boundary_below_buf +=
156       (stripe_index * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
157       << use_highbd;
158 
159   // setup the 2 lines above the stripe
160   for (i = 0; i < 2; i++) {
161     y = y_stripe_topmost - 2 + i;
162     if (y >= 0 && y < y0 && y >= y0 - 2) {
163       uint8_t *p = data + ((y * stride + x0) << use_highbd);
164       uint8_t *new_data =
165           boundary_above_buf + ((i * boundary_stride + x0) << use_highbd);
166       // printf("above %3d %3d: %08x %08x : %08x %08x\n", y, x0,
167       // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0],
168       // ((uint32_t*)new_data)[1]);
169       // Save old pixels
170       memcpy(rst->tmp_save_above[i], p, (x1 - x0) << use_highbd);
171       // Replace width pixels from boundary_above_buf
172       memcpy(p, new_data, (x1 - x0) << use_highbd);
173     }
174   }
175   // setup the 2 lines below the stripe
176   for (i = 0; i < 2; i++) {
177     y = y_stripe_topmost + stripe_height + i;
178     if (y < v_end + 2) {
179       uint8_t *p = data + ((y * stride + x0) << use_highbd);
180       uint8_t *new_data =
181           boundary_below_buf + ((i * boundary_stride + x0) << use_highbd);
182       // printf("below %3d %3d: %08x %08x : %08x %08x\n", y, x0,
183       // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0],
184       // ((uint32_t*)new_data)[1]);
185       // Save old pixels
186       memcpy(rst->tmp_save_below[i], p, (x1 - x0) << use_highbd);
187       // Replace width pixels from boundary_below_buf
188       memcpy(p, new_data, (x1 - x0) << use_highbd);
189     }
190   }
191   // Return actual stripe height
192   return AOMMIN(v_end, y_stripe_topmost + stripe_height) - y0;
193 }
194 
195 // This function restores the boundary lines modified by
196 // setup_processing_stripe_boundary.
restore_processing_stripe_boundary(int y0,int v_end,int h_start,int h_end,uint8_t * data,int stride,RestorationInternal * rst,int use_highbd)197 static void restore_processing_stripe_boundary(int y0, int v_end, int h_start,
198                                                int h_end, uint8_t *data,
199                                                int stride,
200                                                RestorationInternal *rst,
201                                                int use_highbd) {
202   int y, y_stripe_topmost, i, stripe_index;
203   int tile_offset = 8 >> rst->subsampling_y;
204   int stripe_height = rst->rsi->procunit_height;
205   int x0 = h_start - RESTORATION_EXTRA_HORZ;
206   int x1 = h_end + RESTORATION_EXTRA_HORZ;
207 
208   stripe_index = (y0 + tile_offset) / stripe_height;
209   y_stripe_topmost = stripe_index * stripe_height - tile_offset;
210 
211   // restore the 2 lines above the stripe
212   for (i = 0; i < 2; i++) {
213     y = y_stripe_topmost - 2 + i;
214     if (y >= 0 && y < y0 && y >= y0 - 2) {
215       uint8_t *p = data + ((y * stride + x0) << use_highbd);
216       memcpy(p, rst->tmp_save_above[i], (x1 - x0) << use_highbd);
217     }
218   }
219   // restore the 2 lines below the stripe
220   for (i = 0; i < 2; i++) {
221     y = y_stripe_topmost + stripe_height + i;
222     if (y < v_end + 2) {
223       uint8_t *p = data + ((y * stride + x0) << use_highbd);
224       memcpy(p, rst->tmp_save_below[i], (x1 - x0) << use_highbd);
225     }
226   }
227 }
228 
229 #endif
230 
// Copies the pixels of restoration tile |tile_idx| unchanged from |data| to
// |dst|. The tile rectangle is obtained from av1_get_rest_tile_limits();
// one byte is copied per pixel.
static void loop_copy_tile(uint8_t *data, int tile_idx, int width, int height,
                           int stride, RestorationInternal *rst, uint8_t *dst,
                           int dst_stride) {
#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif
  const int row_bytes = limits.h_end - limits.h_start;

  for (int row = limits.v_start; row < limits.v_end; ++row) {
    const uint8_t *const from = data + row * stride + limits.h_start;
    uint8_t *const to = dst + row * dst_stride + limits.h_start;
    memcpy(to, from, row_bytes);
  }
}
247 
// Builds in |vert| a copy of the vertical kernel |orig| adjusted for a row
// that is |boundary_dist| rows away from a boundary:
//   0: taps 0..2 are folded into tap WIENER_HALFWIN and cleared;
//   1: taps 0..1 are folded into tap 2 and cleared;
//   2: tap 0 is folded into tap 1 and cleared;
//   anything else: the kernel is copied unchanged.
// When |istop| is zero, taps 0..2 are then swapped with their mirror
// positions at WIENER_WIN - 1, WIENER_WIN - 2 and WIENER_WIN - 3.
static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
                                   int boundary_dist, int istop) {
  memcpy(vert, orig, sizeof(InterpKernel));

  if (boundary_dist == 0) {
    vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
    vert[2] = vert[1] = vert[0] = 0;
  } else if (boundary_dist == 1) {
    vert[2] += vert[1] + vert[0];
    vert[1] = vert[0] = 0;
  } else if (boundary_dist == 2) {
    vert[1] += vert[0];
    vert[0] = 0;
  }

  if (istop) return;

  // Mirror the three leading taps onto the trailing side.
  for (int k = 0; k < 3; ++k) {
    const int mirror = WIENER_WIN - 1 - k;
    const int held = vert[k];
    vert[k] = vert[mirror];
    vert[mirror] = held;
  }
}
279 
// Convolution routine used for Wiener filtering in this build configuration.
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
#define WIENER_TILE_CONVOLVE aom_convolve8_add_src_hip
#else
#define WIENER_TILE_CONVOLVE aom_convolve8_add_src
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION

// Applies the Wiener filter of restoration tile |tile_idx| to |data|, writing
// the result to |dst|. Tiles whose restoration type is RESTORE_NONE are
// copied unchanged. The tile is processed in processing units; within each
// unit the first and last (WIENER_HALFWIN - WIENER_BORDER_VERT) rows are
// filtered one row at a time with a vertical kernel adjusted by
// stepdown_wiener_kernel(), and the rows in between are filtered in a single
// call with the tile's unmodified vertical kernel.
static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
                                    int height, int stride,
                                    RestorationInternal *rst, uint8_t *dst,
                                    int dst_stride) {
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride);
    return;
  }

  const int unit_width = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  int unit_height;  // Assigned for every stripe inside the row loop.
#else
  const int unit_height = rst->rsi->procunit_height;
#endif
  const WienerInfo *const wiener = &rst->rsi->wiener_info[tile_idx];
  // Number of rows at the top and at the bottom of a unit that need an
  // adjusted vertical kernel.
  const int edge_rows = WIENER_HALFWIN - WIENER_BORDER_VERT;
  InterpKernel edge_vfilter;

#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif

  // Convolve the whole tile (done in blocks here to match the requirements
  // of the vectorized convolve functions, but the result is equivalent).
  for (int y = limits.v_start; y < limits.v_end; y += unit_height) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    int h = setup_processing_stripe_boundary(
        y, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
    h = ALIGN_POWER_OF_TWO(h, 1);
    unit_height = h;
#else
    const int h = AOMMIN(unit_height, (limits.v_end - y + 15) & ~15);
#endif
    const int body_rows = h - edge_rows * 2;

    for (int x = limits.h_start; x < limits.h_end; x += unit_width) {
      const int w = AOMMIN(unit_width, (limits.h_end - x + 15) & ~15);
      const uint8_t *src_p = data + y * stride + x;
      uint8_t *dst_p = dst + y * dst_stride + x;

      // Leading rows, one at a time, with the "top" variant of the kernel.
      for (int b = 0; b < edge_rows; ++b) {
        stepdown_wiener_kernel(wiener->vfilter, edge_vfilter,
                               WIENER_BORDER_VERT + b, 1);
        WIENER_TILE_CONVOLVE(src_p, stride, dst_p, dst_stride, wiener->hfilter,
                             16, edge_vfilter, 16, w, 1);
        src_p += stride;
        dst_p += dst_stride;
      }

      // Middle rows in one call with the unmodified vertical kernel.
      WIENER_TILE_CONVOLVE(src_p, stride, dst_p, dst_stride, wiener->hfilter,
                           16, wiener->vfilter, 16, w, body_rows);
      src_p += stride * body_rows;
      dst_p += dst_stride * body_rows;

      // Trailing rows, one at a time, with the "bottom" variant of the kernel.
      for (int b = edge_rows - 1; b >= 0; --b) {
        stepdown_wiener_kernel(wiener->vfilter, edge_vfilter,
                               WIENER_BORDER_VERT + b, 0);
        WIENER_TILE_CONVOLVE(src_p, stride, dst_p, dst_stride, wiener->hfilter,
                             16, edge_vfilter, 16, w, 1);
        src_p += stride;
        dst_p += dst_stride;
      }
    }
#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(y, limits.v_end, limits.h_start,
                                       limits.h_end, data, stride, rst, 0);
#endif
  }
}

#undef WIENER_TILE_CONVOLVE
371 
// Wiener-filters a whole plane: pads |data| by WIENER_BORDER_HORZ columns and
// WIENER_BORDER_VERT rows, then filters each of the rst->ntiles restoration
// tiles into |dst|.
static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
                               RestorationInternal *rst, uint8_t *dst,
                               int dst_stride) {
  extend_frame(data, width, height, stride, WIENER_BORDER_HORZ,
               WIENER_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_wiener_filter_tile(data, tile, width, height, stride, rst, dst,
                            dst_stride);
  }
}
383 
384 /* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
385    over the input. The window is of size (2r + 1)x(2r + 1), and we
386    specialize to r = 1, 2, 3. A default function is used for r > 3.
387 
388    Each loop follows the same format: We keep a window's worth of input
389    in individual variables and select data out of that as appropriate.
390 */
// 3x3 box sum (radius 1) of |src| into |dst|. When |sqr| is non-zero the
// squares of the inputs are summed instead. Windows are truncated at the
// image edges, so border outputs sum fewer elements. Requires width >= 3 and
// height >= 3.
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  // Pass 1: vertical 3-tap sums, column by column, from src into dst.
  for (int x = 0; x < width; ++x) {
    const int32_t *const in = src + x;
    int32_t *const out = dst + x;
    int up, mid, down, y;

    if (sqr) {
      up = in[0] * in[0];
      mid = in[src_stride] * in[src_stride];
      down = in[2 * src_stride] * in[2 * src_stride];

      out[0] = up + mid;
      for (y = 1; y < height - 2; ++y) {
        out[y * dst_stride] = up + mid + down;
        up = mid;
        mid = down;
        down = in[(y + 2) * src_stride] * in[(y + 2) * src_stride];
      }
    } else {
      up = in[0];
      mid = in[src_stride];
      down = in[2 * src_stride];

      out[0] = up + mid;
      for (y = 1; y < height - 2; ++y) {
        // Here up, mid and down hold input rows y - 1, y and y + 1.
        out[y * dst_stride] = up + mid + down;
        up = mid;
        mid = down;
        down = in[(y + 2) * src_stride];
      }
    }
    // Last full window, then the truncated window of the final row.
    out[y * dst_stride] = up + mid + down;
    out[(y + 1) * dst_stride] = mid + down;
  }

  // Pass 2: horizontal 3-tap sums, row by row, in place on dst.
  for (int y = 0; y < height; ++y) {
    int32_t *const row = dst + y * dst_stride;
    int left = row[0];
    int mid = row[1];
    int right = row[2];
    int x;

    row[0] = left + mid;
    for (x = 1; x < width - 2; ++x) {
      // Here left, mid and right hold the pass-1 values of columns
      // x - 1, x and x + 1.
      row[x] = left + mid + right;
      left = mid;
      mid = right;
      right = row[x + 2];
    }
    row[x] = left + mid + right;
    row[x + 1] = mid + right;
  }
}
455 
// 5x5 box sum (radius 2) of |src| into |dst|. When |sqr| is non-zero the
// squares of the inputs are summed instead. Windows are truncated at the
// image edges, so border outputs sum fewer elements. Requires width >= 5 and
// height >= 5.
static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  // Pass 1: vertical 5-tap sums, column by column, from src into dst.
  for (int x = 0; x < width; ++x) {
    const int32_t *const in = src + x;
    int32_t *const out = dst + x;
    int v0, v1, v2, v3, v4, y;

    if (sqr) {
      v0 = in[0] * in[0];
      v1 = in[src_stride] * in[src_stride];
      v2 = in[2 * src_stride] * in[2 * src_stride];
      v3 = in[3 * src_stride] * in[3 * src_stride];
      v4 = in[4 * src_stride] * in[4 * src_stride];

      out[0] = v0 + v1 + v2;
      out[dst_stride] = v0 + v1 + v2 + v3;
      for (y = 2; y < height - 3; ++y) {
        out[y * dst_stride] = v0 + v1 + v2 + v3 + v4;
        v0 = v1;
        v1 = v2;
        v2 = v3;
        v3 = v4;
        v4 = in[(y + 3) * src_stride] * in[(y + 3) * src_stride];
      }
    } else {
      v0 = in[0];
      v1 = in[src_stride];
      v2 = in[2 * src_stride];
      v3 = in[3 * src_stride];
      v4 = in[4 * src_stride];

      out[0] = v0 + v1 + v2;
      out[dst_stride] = v0 + v1 + v2 + v3;
      for (y = 2; y < height - 3; ++y) {
        // Here v0..v4 hold input rows y - 2 .. y + 2.
        out[y * dst_stride] = v0 + v1 + v2 + v3 + v4;
        v0 = v1;
        v1 = v2;
        v2 = v3;
        v3 = v4;
        v4 = in[(y + 3) * src_stride];
      }
    }
    // Last full window, then the two truncated windows at the bottom.
    out[y * dst_stride] = v0 + v1 + v2 + v3 + v4;
    out[(y + 1) * dst_stride] = v1 + v2 + v3 + v4;
    out[(y + 2) * dst_stride] = v2 + v3 + v4;
  }

  // Pass 2: horizontal 5-tap sums, row by row, in place on dst.
  for (int y = 0; y < height; ++y) {
    int32_t *const row = dst + y * dst_stride;
    int h0 = row[0];
    int h1 = row[1];
    int h2 = row[2];
    int h3 = row[3];
    int h4 = row[4];
    int x;

    row[0] = h0 + h1 + h2;
    row[1] = h0 + h1 + h2 + h3;
    for (x = 2; x < width - 3; ++x) {
      // Here h0..h4 hold the pass-1 values of columns x - 2 .. x + 2.
      row[x] = h0 + h1 + h2 + h3 + h4;
      h0 = h1;
      h1 = h2;
      h2 = h3;
      h3 = h4;
      h4 = row[x + 3];
    }
    row[x] = h0 + h1 + h2 + h3 + h4;
    row[x + 1] = h1 + h2 + h3 + h4;
    row[x + 2] = h2 + h3 + h4;
  }
}
542 
// 7x7 box sum (radius 3) of |src| into |dst|. When |sqr| is non-zero the
// squares of the inputs are summed instead. Windows are truncated at the
// image edges, so border outputs sum fewer elements. Requires width >= 7 and
// height >= 7.
static void boxsum3(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  // Pass 1: vertical 7-tap sums, column by column, from src into dst.
  for (int x = 0; x < width; ++x) {
    const int32_t *const in = src + x;
    int32_t *const out = dst + x;
    int v0, v1, v2, v3, v4, v5, v6, y;

    if (sqr) {
      v0 = in[0] * in[0];
      v1 = in[1 * src_stride] * in[1 * src_stride];
      v2 = in[2 * src_stride] * in[2 * src_stride];
      v3 = in[3 * src_stride] * in[3 * src_stride];
      v4 = in[4 * src_stride] * in[4 * src_stride];
      v5 = in[5 * src_stride] * in[5 * src_stride];
      v6 = in[6 * src_stride] * in[6 * src_stride];

      out[0] = v0 + v1 + v2 + v3;
      out[dst_stride] = v0 + v1 + v2 + v3 + v4;
      out[2 * dst_stride] = v0 + v1 + v2 + v3 + v4 + v5;
      for (y = 3; y < height - 4; ++y) {
        out[y * dst_stride] = v0 + v1 + v2 + v3 + v4 + v5 + v6;
        v0 = v1;
        v1 = v2;
        v2 = v3;
        v3 = v4;
        v4 = v5;
        v5 = v6;
        v6 = in[(y + 4) * src_stride] * in[(y + 4) * src_stride];
      }
    } else {
      v0 = in[0];
      v1 = in[1 * src_stride];
      v2 = in[2 * src_stride];
      v3 = in[3 * src_stride];
      v4 = in[4 * src_stride];
      v5 = in[5 * src_stride];
      v6 = in[6 * src_stride];

      out[0] = v0 + v1 + v2 + v3;
      out[dst_stride] = v0 + v1 + v2 + v3 + v4;
      out[2 * dst_stride] = v0 + v1 + v2 + v3 + v4 + v5;
      for (y = 3; y < height - 4; ++y) {
        // Here v0..v6 hold input rows y - 3 .. y + 3.
        out[y * dst_stride] = v0 + v1 + v2 + v3 + v4 + v5 + v6;
        v0 = v1;
        v1 = v2;
        v2 = v3;
        v3 = v4;
        v4 = v5;
        v5 = v6;
        v6 = in[(y + 4) * src_stride];
      }
    }
    // Last full window, then the three truncated windows at the bottom.
    out[y * dst_stride] = v0 + v1 + v2 + v3 + v4 + v5 + v6;
    out[(y + 1) * dst_stride] = v1 + v2 + v3 + v4 + v5 + v6;
    out[(y + 2) * dst_stride] = v2 + v3 + v4 + v5 + v6;
    out[(y + 3) * dst_stride] = v3 + v4 + v5 + v6;
  }

  // Pass 2: horizontal 7-tap sums, row by row, in place on dst.
  for (int y = 0; y < height; ++y) {
    int32_t *const row = dst + y * dst_stride;
    int h0 = row[0];
    int h1 = row[1];
    int h2 = row[2];
    int h3 = row[3];
    int h4 = row[4];
    int h5 = row[5];
    int h6 = row[6];
    int x;

    row[0] = h0 + h1 + h2 + h3;
    row[1] = h0 + h1 + h2 + h3 + h4;
    row[2] = h0 + h1 + h2 + h3 + h4 + h5;
    for (x = 3; x < width - 4; ++x) {
      // Here h0..h6 hold the pass-1 values of columns x - 3 .. x + 3.
      row[x] = h0 + h1 + h2 + h3 + h4 + h5 + h6;
      h0 = h1;
      h1 = h2;
      h2 = h3;
      h3 = h4;
      h4 = h5;
      h5 = h6;
      h6 = row[x + 4];
    }
    row[x] = h0 + h1 + h2 + h3 + h4 + h5 + h6;
    row[x + 1] = h1 + h2 + h3 + h4 + h5 + h6;
    row[x + 2] = h2 + h3 + h4 + h5 + h6;
    row[x + 3] = h3 + h4 + h5 + h6;
  }
}
635 
// Generic version for any r. To be removed after experiments are done.
//
// Computes, for every pixel, the sum of |src| (or of its squares when |sqr|
// is non-zero) over a (2r + 1) x (2r + 1) window truncated at the image
// edges, and writes it to |dst|. It works with running prefix sums held in a
// temporary width x height buffer: first down the columns, then along the
// rows. Requires width > 2 * r and height > 2 * r.
//
// If the temporary buffer cannot be allocated the function returns without
// writing to |dst|; there is no error channel in this interface.
static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
  // Size computed in size_t so that width * height cannot overflow int.
  int32_t *tmp = aom_malloc((size_t)width * height * sizeof(*tmp));
  const int tmp_stride = width;
  int i, j;
  if (tmp == NULL) return;

  // Vertical prefix sums of src (or of src squared) into tmp.
  if (sqr) {
    for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] +
            src[i * src_stride + j] * src[i * src_stride + j];
  } else {
    memcpy(tmp, src, sizeof(*tmp) * width);
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
  }

  // Vertical window sums into dst: top rows (window clipped above), interior
  // rows, bottom rows (window clipped below).
  for (i = 0; i <= r; ++i)
    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
           sizeof(*tmp) * width);
  for (i = r + 1; i < height - r; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] =
          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
  for (i = height - r; i < height; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
                                tmp[(i - r - 1) * tmp_stride + j];

  // Horizontal prefix sums of dst into tmp. dst must be indexed with
  // dst_stride here (the previous code used src_stride, which is only correct
  // when both strides happen to be equal).
  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
  for (i = 0; i < height; ++i)
    for (j = 1; j < width; ++j)
      tmp[i * tmp_stride + j] =
          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];

  // Horizontal window sums into dst: left columns, interior, right columns.
  for (j = 0; j <= r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
  for (j = r + 1; j < width - r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
  for (j = width - r; j < width; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
  aom_free(tmp);
}
687 
// Box sum of radius |r| (window (2r + 1) x (2r + 1)) of |src| into |dst|;
// sums squares when |sqr| is non-zero. Radii 1, 2 and 3 use the specialised
// routines, every other radius falls back to the generic boxsumr().
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 3:
      boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
      break;
  }
}
699 
// Fills |num| with, for every pixel of a width x height image, the number of
// pixels inside its (2r + 1) x (2r + 1) window after truncation at the image
// edges. The image is filled in four groups: the corner regions, the left and
// right bands, the top and bottom bands, and the interior.
static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
  const int full_span = 2 * r + 1;

  // Corner regions: both window dimensions may be clipped. The value is
  // written top-left and mirrored to the other three corners.
  for (int i = 0; i <= r; ++i) {
    int8_t *const top = num + i * num_stride;
    int8_t *const bottom = num + (height - 1 - i) * num_stride;
    for (int j = 0; j <= r; ++j) {
      const int mirror_col = width - 1 - j;
      top[j] = (r + 1 + i) * (r + 1 + j);
      top[mirror_col] = top[j];
      bottom[j] = top[j];
      bottom[mirror_col] = top[j];
    }
  }

  // Left and right bands: full window height, clipped width.
  for (int j = 0; j <= r; ++j) {
    const int count = full_span * (r + 1 + j);
    for (int i = r + 1; i < height - r; ++i) {
      int8_t *const row = num + i * num_stride;
      row[j] = count;
      row[width - 1 - j] = count;
    }
  }

  // Top and bottom bands: full window width, clipped height.
  for (int i = 0; i <= r; ++i) {
    const int count = full_span * (r + 1 + i);
    int8_t *const top = num + i * num_stride;
    int8_t *const bottom = num + (height - 1 - i) * num_stride;
    for (int j = r + 1; j < width - r; ++j) {
      top[j] = count;
      bottom[j] = count;
    }
  }

  // Interior: the window is never clipped.
  for (int i = r + 1; i < height - r; ++i) {
    int8_t *const row = num + i * num_stride;
    for (int j = r + 1; j < width - r; ++j) {
      row[j] = full_span * full_span;
    }
  }
}
731 
decode_xq(int * xqd,int * xq)732 void decode_xq(int *xqd, int *xq) {
733   xq[0] = xqd[0];
734   xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
735 }
736 
// Lookup table with 256 entries. For the indices spot-checked (0..9) the
// entry equals round(256 * x / (x + 1)); the final entry (index 255) is 256.
// The self-guided filter in this file indexes it with a value clamped to 255.
const int32_t x_by_xplus1[256] = {
  0,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
  240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
  248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
  250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  256,
};
757 
// Fixed-point reciprocals: one_by_x[n - 1] == round(2^12 / n).
// The first 25 entries (n = 1..25) are always present; the 24 entries for
// n = 26..49 are compiled in only when MAX_RADIUS > 2.
const int32_t one_by_x[MAX_NELEM] = {
  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
#if MAX_RADIUS > 2
  158,  152,  146,  141,  137, 132, 128, 124, 120, 117, 114, 111, 108,
  105,  102,  100,  98,   95,  93,  91,  89,  87,  85,  84
#endif  // MAX_RADIUS > 2
};
766 
// Core of the self-guided filter, shared by the 8-bit and high-bit-depth
// entry points in this file.
//
//   dgd, dgd_stride  - int32_t copy of the unit; dgd[0] is its top-left pixel.
//                      SGRPROJ_BORDER_VERT rows and SGRPROJ_BORDER_HORZ
//                      columns around the unit are read as well.
//   width, height    - unit size, excluding the border.
//   dst, dst_stride  - receives one filtered value per unit pixel.
//   bit_depth        - used to scale the box sums back to an 8-bit range.
//   r, eps           - box radius and the row selector into sgrproj_mtable
//                      (indexed as eps - 1, so eps must be at least 1).
//
// NOTE(review): when width < 5 or height < 5 the function returns before
// writing anything to dst; callers that read dst afterwards see whatever was
// there before - confirm this is intended.
static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
                                                int height, int dgd_stride,
                                                int32_t *dst, int dst_stride,
                                                int bit_depth, int r, int eps) {
  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
  const int num_stride = width_ext;
  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 16 bytes, for consistency
  // with the SIMD version of this function.
  int buf_stride = ((width_ext + 3) & ~3) + 16;
  int32_t A_[RESTORATION_PROC_UNIT_PELS];
  int32_t B_[RESTORATION_PROC_UNIT_PELS];
  int32_t *A = A_;
  int32_t *B = B_;
  int8_t num_[RESTORATION_PROC_UNIT_PELS];
  // num[0] is the count for the first non-border pixel
  int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
  int i, j;

  // Don't filter tiles with dimensions < 5 on any axis
  if ((width < 5) || (height < 5)) return;

  // Box sums over the bordered unit: B is built with sqr == 0 and A with
  // sqr == 1; num_ gets the per-pixel window element counts.
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
  boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
         width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
  boxnum(width_ext, height_ext, r, num_, num_stride);
  // NOTE(review): this check runs after r has already been used above.
  assert(r <= 3);
  // From here on A and B are addressed relative to the first non-border pixel
  A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
  // Pass 1: replace the box sums in A and B, in place, by the per-pixel
  // coefficients used by the final weighted-sum pass below.
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int k = i * buf_stride + j;
      const int n = num[i * num_stride + j];

      // a < 2^16 * n < 2^22 regardless of bit depth
      uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
      // b < 2^8 * n < 2^14 regardless of bit depth
      uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);

      // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
      // and p itself satisfies p < 2^14 * n^2 < 2^26.
      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
      // This is an artefact of rounding, and can only happen if all pixels
      // are (almost) identical, so in this case we saturate to p=0.
      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
      uint32_t s = sgrproj_mtable[eps - 1][n - 1];

      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
      // (this holds even after accounting for the rounding in s)
      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);

      A[k] = x_by_xplus1[AOMMIN(z, 255)];  // < 2^8

      // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
      // one_by_x[n - 1] = round(2^12 / n)
      // => the product here is < 2^(20 + bit_depth) <= 2^32,
      // and B[k] is set to a value < 2^(8 + bit depth)
      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
                                             (uint32_t)B[k] *
                                             (uint32_t)one_by_x[n - 1],
                                         SGRPROJ_RECIP_BITS);
    }
  }
  // Pass 2: dst = round((a * dgd + b) >> (SGRPROJ_SGR_BITS + nb -
  // SGRPROJ_RST_BITS)), where a and b are weighted sums of A and B over the
  // neighbours that lie inside the unit. The weights add up to 2^nb: 8 for
  // corner and edge pixels (nb = 3), 32 for interior pixels (nb = 5).

  // Top-left corner
  i = 0;
  j = 0;
  {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a =
        3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
    const int32_t b =
        3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Top-right corner
  i = 0;
  j = width - 1;
  {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a =
        3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
    const int32_t b =
        3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Bottom-left corner
  i = height - 1;
  j = 0;
  {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a =
        3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
    const int32_t b =
        3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Bottom-right corner
  i = height - 1;
  j = width - 1;
  {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a =
        3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
    const int32_t b =
        3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Top row, excluding the corners
  i = 0;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
                      A[k + buf_stride - 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
                      B[k + buf_stride - 1] + B[k + buf_stride + 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Bottom row, excluding the corners
  i = height - 1;
  for (j = 1; j < width - 1; ++j) {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
                      A[k - buf_stride - 1] + A[k - buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
                      B[k - buf_stride - 1] + B[k - buf_stride + 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Left column, excluding the corners
  j = 0;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Right column, excluding the corners
  j = width - 1;
  for (i = 1; i < height - 1; ++i) {
    const int k = i * buf_stride + j;
    const int l = i * dgd_stride + j;
    const int m = i * dst_stride + j;
    const int nb = 3;
    const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
                      A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
    const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
                      B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
    const int32_t v = a * dgd[l] + b;
    dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  }
  // Interior: weight 4 for the centre and its four direct neighbours,
  // weight 3 for the four diagonal neighbours
  for (i = 1; i < height - 1; ++i) {
    for (j = 1; j < width - 1; ++j) {
      const int k = i * buf_stride + j;
      const int l = i * dgd_stride + j;
      const int m = i * dst_stride + j;
      const int nb = 5;
      const int32_t a =
          (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
              4 +
          (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
           A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
              3;
      const int32_t b =
          (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
              4 +
          (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
           B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
              3;
      const int32_t v = a * dgd[l] + b;
      dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
    }
  }
}
964 
av1_selfguided_restoration_c(uint8_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int r,int eps)965 void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
966                                   int stride, int32_t *dst, int dst_stride,
967                                   int r, int eps) {
968   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
969   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
970   int32_t *dgd32 =
971       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
972   int i, j;
973   for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
974     for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
975       dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
976     }
977   }
978   av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
979                                       dst_stride, 8, r, eps);
980 }
981 
av1_highpass_filter_c(uint8_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int corner,int edge)982 void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
983                            int32_t *dst, int dst_stride, int corner, int edge) {
984   int i, j;
985   const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
986 
987   i = 0;
988   j = 0;
989   {
990     const int k = i * stride + j;
991     const int l = i * dst_stride + j;
992     dst[l] =
993         center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
994         corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
995   }
996   i = 0;
997   j = width - 1;
998   {
999     const int k = i * stride + j;
1000     const int l = i * dst_stride + j;
1001     dst[l] =
1002         center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
1003         corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
1004   }
1005   i = height - 1;
1006   j = 0;
1007   {
1008     const int k = i * stride + j;
1009     const int l = i * dst_stride + j;
1010     dst[l] =
1011         center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
1012         corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
1013   }
1014   i = height - 1;
1015   j = width - 1;
1016   {
1017     const int k = i * stride + j;
1018     const int l = i * dst_stride + j;
1019     dst[l] =
1020         center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
1021         corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
1022   }
1023   i = 0;
1024   for (j = 1; j < width - 1; ++j) {
1025     const int k = i * stride + j;
1026     const int l = i * dst_stride + j;
1027     dst[l] = center * dgd[k] +
1028              edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
1029              corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
1030                        dgd[k + 1]);
1031   }
1032   i = height - 1;
1033   for (j = 1; j < width - 1; ++j) {
1034     const int k = i * stride + j;
1035     const int l = i * dst_stride + j;
1036     dst[l] = center * dgd[k] +
1037              edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
1038              corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
1039                        dgd[k + 1]);
1040   }
1041   j = 0;
1042   for (i = 1; i < height - 1; ++i) {
1043     const int k = i * stride + j;
1044     const int l = i * dst_stride + j;
1045     dst[l] = center * dgd[k] +
1046              edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
1047              corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
1048                        dgd[k - stride] + dgd[k + stride]);
1049   }
1050   j = width - 1;
1051   for (i = 1; i < height - 1; ++i) {
1052     const int k = i * stride + j;
1053     const int l = i * dst_stride + j;
1054     dst[l] = center * dgd[k] +
1055              edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
1056              corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1057                        dgd[k - stride] + dgd[k + stride]);
1058   }
1059   for (i = 1; i < height - 1; ++i) {
1060     for (j = 1; j < width - 1; ++j) {
1061       const int k = i * stride + j;
1062       const int l = i * dst_stride + j;
1063       dst[l] =
1064           center * dgd[k] +
1065           edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
1066           corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1067                     dgd[k - stride + 1] + dgd[k + stride + 1]);
1068     }
1069   }
1070 }
1071 
apply_selfguided_restoration_c(uint8_t * dat,int width,int height,int stride,int eps,int * xqd,uint8_t * dst,int dst_stride,int32_t * tmpbuf)1072 void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
1073                                     int stride, int eps, int *xqd, uint8_t *dst,
1074                                     int dst_stride, int32_t *tmpbuf) {
1075   int xq[2];
1076   int32_t *flt1 = tmpbuf;
1077   int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
1078   int i, j;
1079   assert(width * height <= RESTORATION_TILEPELS_MAX);
1080 #if USE_HIGHPASS_IN_SGRPROJ
1081   av1_highpass_filter_c(dat, width, height, stride, flt1, width,
1082                         sgr_params[eps].corner, sgr_params[eps].edge);
1083 #else
1084   av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
1085                                sgr_params[eps].r1, sgr_params[eps].e1);
1086 #endif  // USE_HIGHPASS_IN_SGRPROJ
1087   av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
1088                                sgr_params[eps].r2, sgr_params[eps].e2);
1089   decode_xq(xqd, xq);
1090   for (i = 0; i < height; ++i) {
1091     for (j = 0; j < width; ++j) {
1092       const int k = i * width + j;
1093       const int l = i * stride + j;
1094       const int m = i * dst_stride + j;
1095       const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
1096       const int32_t f1 = (int32_t)flt1[k] - u;
1097       const int32_t f2 = (int32_t)flt2[k] - u;
1098       const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
1099       const int16_t w =
1100           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1101       dst[m] = clip_pixel(w);
1102     }
1103   }
1104 }
1105 
// Runs the self-guided projection filter over one restoration tile, one
// processing unit at a time. A tile whose restoration type is RESTORE_NONE
// is copied to dst unchanged.
static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
                                     int height, int stride,
                                     RestorationInternal *rst, uint8_t *dst,
                                     int dst_stride) {
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride);
    return;
  }

  const int unit_cols = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  // Set inside the loop from the height reported for each stripe
  int unit_rows;
#else
  const int unit_rows = rst->rsi->procunit_height;
#endif
  const int tile_w = rst->tile_width;
  const int tile_h = rst->tile_height;
  RestorationTileLimits limits =
      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_w,
#if CONFIG_STRIPED_LOOP_RESTORATION
                               tile_h, width, height, rst->subsampling_y);
#else
                               tile_h, width, height);
#endif

  for (int row = limits.v_start; row < limits.v_end; row += unit_rows) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    const int rows_here = setup_processing_stripe_boundary(
        row, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
    unit_rows = rows_here;
#else
    const int rows_here = AOMMIN(unit_rows, limits.v_end - row);
#endif
    for (int col = limits.h_start; col < limits.h_end; col += unit_cols) {
      const int cols_here = AOMMIN(unit_cols, limits.h_end - col);
      uint8_t *const src_unit = data + row * stride + col;
      uint8_t *const dst_unit = dst + row * dst_stride + col;
      apply_selfguided_restoration(
          src_unit, cols_here, rows_here, stride,
          rst->rsi->sgrproj_info[tile_idx].ep,
          rst->rsi->sgrproj_info[tile_idx].xqd, dst_unit, dst_stride,
          rst->tmpbuf);
    }
#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(row, limits.v_end, limits.h_start,
                                       limits.h_end, data, stride, rst, 0);
#endif
  }
}
1151 
// Extends the frame by the self-guided filter border, then filters every
// restoration tile in index order.
static void loop_sgrproj_filter(uint8_t *data, int width, int height,
                                int stride, RestorationInternal *rst,
                                uint8_t *dst, int dst_stride) {
  extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ,
               SGRPROJ_BORDER_VERT);
  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_sgrproj_filter_tile(data, tile, width, height, stride, rst, dst,
                             dst_stride);
  }
}
1163 
// Extends the frame by the generic restoration border, then handles every
// tile according to its own restoration type: copy, Wiener or self-guided.
// Tiles with any other type are left untouched.
static void loop_switchable_filter(uint8_t *data, int width, int height,
                                   int stride, RestorationInternal *rst,
                                   uint8_t *dst, int dst_stride) {
  extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ,
               RESTORATION_BORDER_VERT);
  for (int tile = 0; tile < rst->ntiles; ++tile) {
    switch (rst->rsi->restoration_type[tile]) {
      case RESTORE_NONE:
        loop_copy_tile(data, tile, width, height, stride, rst, dst,
                       dst_stride);
        break;
      case RESTORE_WIENER:
        loop_wiener_filter_tile(data, tile, width, height, stride, rst, dst,
                                dst_stride);
        break;
      case RESTORE_SGRPROJ:
        loop_sgrproj_filter_tile(data, tile, width, height, stride, rst, dst,
                                 dst_stride);
        break;
      default: break;
    }
  }
}
1183 
1184 #if CONFIG_HIGHBITDEPTH
// Pads a width x height frame of 16-bit pixels in place by replicating its
// outermost pixels: border_horz columns on each side of every row, then
// border_vert copies of the (already widened) first and last rows above and
// below. data points at the top-left frame pixel; the caller's buffer must
// have room for the border.
void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
                         int border_horz, int border_vert) {
  // Left / right padding, one row at a time
  for (int y = 0; y < height; ++y) {
    uint16_t *const row = data + y * stride;
    for (int x = 1; x <= border_horz; ++x) row[-x] = row[0];
    for (int x = 0; x < border_horz; ++x) row[width + x] = row[width - 1];
  }

  // Top / bottom padding copies whole widened rows
  uint16_t *const first_row = data - border_horz;
  const size_t row_bytes = (width + 2 * border_horz) * sizeof(uint16_t);
  for (int y = 1; y <= border_vert; ++y) {
    memcpy(first_row - y * stride, first_row, row_bytes);
  }
  for (int y = 0; y < border_vert; ++y) {
    memcpy(first_row + (height + y) * stride,
           first_row + (height - 1) * stride, row_bytes);
  }
}
1204 
// Copies the pixels of one restoration tile from data to dst without
// filtering, using the tile limits reported by av1_get_rest_tile_limits().
static void loop_copy_tile_highbd(uint16_t *data, int tile_idx, int width,
                                  int height, int stride,
                                  RestorationInternal *rst, uint16_t *dst,
                                  int dst_stride) {
  const int tile_w = rst->tile_width;
  const int tile_h = rst->tile_height;
  RestorationTileLimits limits =
      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_w,
#if CONFIG_STRIPED_LOOP_RESTORATION
                               tile_h, width, height, rst->subsampling_y);
#else
                               tile_h, width, height);
#endif
  const size_t row_bytes = (limits.h_end - limits.h_start) * sizeof(*dst);
  for (int row = limits.v_start; row < limits.v_end; ++row) {
    const uint16_t *const from = data + row * stride + limits.h_start;
    uint16_t *const to = dst + row * dst_stride + limits.h_start;
    memcpy(to, from, row_bytes);
  }
}
1223 
// Applies the Wiener filter to one restoration tile of a high-bit-depth
// frame, writing the result to dst. A tile whose restoration type is
// RESTORE_NONE is copied unchanged. The tile is walked in processing units;
// within each unit the first and last (WIENER_HALFWIN - WIENER_BORDER_VERT)
// rows are filtered one row at a time with a modified vertical kernel
// produced by stepdown_wiener_kernel(), and the rows in between use the
// tile's regular vertical kernel.
static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
                                           int width, int height, int stride,
                                           RestorationInternal *rst,
                                           int bit_depth, uint16_t *dst,
                                           int dst_stride) {
  const int procunit_width = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  // Assigned inside the loop from the height of each stripe
  int procunit_height;
#else
  const int procunit_height = rst->rsi->procunit_height;
#endif
  const int tile_width = rst->tile_width;
  const int tile_height = rst->tile_height;

  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
                          dst_stride);
    return;
  }
  RestorationTileLimits limits =
      av1_get_rest_tile_limits(tile_idx, rst->nhtiles, rst->nvtiles, tile_width,
#if CONFIG_STRIPED_LOOP_RESTORATION
                               tile_height, width, height, rst->subsampling_y);
#else
                               tile_height, width, height);
#endif
  // Scratch kernel for the rows near the top and bottom of a unit
  InterpKernel vertical_topbot;

  // Convolve the whole tile (done in blocks here to match the requirements
  // of the vectorized convolve functions, but the result is equivalent)
  for (int i = limits.v_start; i < limits.v_end; i += procunit_height) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    // The final argument (1 here, 0 in the 8-bit caller) presumably flags
    // high-bit-depth data - TODO confirm against the helper's definition.
    int h = setup_processing_stripe_boundary(i, limits.v_end, limits.h_start,
                                             limits.h_end, (uint8_t *)data,
                                             stride, rst, 1);
    h = ALIGN_POWER_OF_TWO(h, 1);
    procunit_height = h;
#else
    // Remaining height rounded up to a multiple of 16, capped at the unit
    // height
    int h = AOMMIN(procunit_height, (limits.v_end - i + 15) & ~15);
#endif
    for (int j = limits.h_start; j < limits.h_end; j += procunit_width) {
      // Remaining width rounded up to a multiple of 16, capped at the unit
      // width
      int w = AOMMIN(procunit_width, (limits.h_end - j + 15) & ~15);
      const uint16_t *data_p = data + i * stride + j;
      uint16_t *dst_p = dst + i * dst_stride + j;
      // Note h is at least 16
      // NOTE(review): in the CONFIG_STRIPED_LOOP_RESTORATION build h comes
      // from the stripe setup and is only passed through
      // ALIGN_POWER_OF_TWO(h, 1); confirm the note above still holds there.
      // Top rows of the unit: one row per call, stepped-down vertical kernel
      for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
                               vertical_topbot, WIENER_BORDER_VERT + b, 1);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
        aom_highbd_convolve8_add_src_hip(
            CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
            dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
            vertical_topbot, 16, w, 1, bit_depth);
#else
        aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
                                     CONVERT_TO_BYTEPTR(dst_p), dst_stride,
                                     rst->rsi->wiener_info[tile_idx].hfilter,
                                     16, vertical_topbot, 16, w, 1, bit_depth);
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
        data_p += stride;
        dst_p += dst_stride;
      }
      // Middle rows of the unit: a single call with the regular kernels
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
      aom_highbd_convolve8_add_src_hip(
          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
          rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
          h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
#else
      aom_highbd_convolve8_add_src(
          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
          rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
          h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2, bit_depth);
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
      data_p += stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
      dst_p += dst_stride * (h - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
      // Bottom rows of the unit: same as the top rows, with b counting down
      // and the last stepdown argument 0 instead of 1
      for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
                               vertical_topbot, WIENER_BORDER_VERT + b, 0);
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
        aom_highbd_convolve8_add_src_hip(
            CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
            dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
            vertical_topbot, 16, w, 1, bit_depth);
#else
        aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
                                     CONVERT_TO_BYTEPTR(dst_p), dst_stride,
                                     rst->rsi->wiener_info[tile_idx].hfilter,
                                     16, vertical_topbot, 16, w, 1, bit_depth);
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
        data_p += stride;
        dst_p += dst_stride;
      }
    }
#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(i, limits.v_end, limits.h_start,
                                       limits.h_end, (uint8_t *)data, stride,
                                       rst, 1);
#endif
  }
}
1326 
// High-bit-depth Wiener pass over a whole frame: converts the byte-typed
// frame pointers to 16-bit pixel pointers, extends the source frame by the
// Wiener border, then filters every restoration tile in index order.
static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
                                      int stride, RestorationInternal *rst,
                                      int bit_depth, uint8_t *dst8,
                                      int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);
  extend_frame_highbd(src16, width, height, stride, WIENER_BORDER_HORZ,
                      WIENER_BORDER_VERT);
  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_wiener_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                   bit_depth, dst16, dst_stride);
  }
}
1341 
av1_selfguided_restoration_highbd_c(uint16_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int bit_depth,int r,int eps)1342 void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
1343                                          int stride, int32_t *dst,
1344                                          int dst_stride, int bit_depth, int r,
1345                                          int eps) {
1346   int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
1347   const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
1348   int32_t *dgd32 =
1349       dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
1350   int i, j;
1351   for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
1352     for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
1353       dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
1354     }
1355   }
1356   av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
1357                                       dst_stride, bit_depth, r, eps);
1358 }
1359 
av1_highpass_filter_highbd_c(uint16_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int corner,int edge)1360 void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
1361                                   int stride, int32_t *dst, int dst_stride,
1362                                   int corner, int edge) {
1363   int i, j;
1364   const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
1365 
1366   i = 0;
1367   j = 0;
1368   {
1369     const int k = i * stride + j;
1370     const int l = i * dst_stride + j;
1371     dst[l] =
1372         center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
1373         corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
1374   }
1375   i = 0;
1376   j = width - 1;
1377   {
1378     const int k = i * stride + j;
1379     const int l = i * dst_stride + j;
1380     dst[l] =
1381         center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
1382         corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
1383   }
1384   i = height - 1;
1385   j = 0;
1386   {
1387     const int k = i * stride + j;
1388     const int l = i * dst_stride + j;
1389     dst[l] =
1390         center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
1391         corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
1392   }
1393   i = height - 1;
1394   j = width - 1;
1395   {
1396     const int k = i * stride + j;
1397     const int l = i * dst_stride + j;
1398     dst[l] =
1399         center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
1400         corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
1401   }
1402   i = 0;
1403   for (j = 1; j < width - 1; ++j) {
1404     const int k = i * stride + j;
1405     const int l = i * dst_stride + j;
1406     dst[l] = center * dgd[k] +
1407              edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
1408              corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
1409                        dgd[k + 1]);
1410   }
1411   i = height - 1;
1412   for (j = 1; j < width - 1; ++j) {
1413     const int k = i * stride + j;
1414     const int l = i * dst_stride + j;
1415     dst[l] = center * dgd[k] +
1416              edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
1417              corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
1418                        dgd[k + 1]);
1419   }
1420   j = 0;
1421   for (i = 1; i < height - 1; ++i) {
1422     const int k = i * stride + j;
1423     const int l = i * dst_stride + j;
1424     dst[l] = center * dgd[k] +
1425              edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
1426              corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
1427                        dgd[k - stride] + dgd[k + stride]);
1428   }
1429   j = width - 1;
1430   for (i = 1; i < height - 1; ++i) {
1431     const int k = i * stride + j;
1432     const int l = i * dst_stride + j;
1433     dst[l] = center * dgd[k] +
1434              edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
1435              corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1436                        dgd[k - stride] + dgd[k + stride]);
1437   }
1438   for (i = 1; i < height - 1; ++i) {
1439     for (j = 1; j < width - 1; ++j) {
1440       const int k = i * stride + j;
1441       const int l = i * dst_stride + j;
1442       dst[l] =
1443           center * dgd[k] +
1444           edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
1445           corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1446                     dgd[k - stride + 1] + dgd[k + stride + 1]);
1447     }
1448   }
1449 }
1450 
apply_selfguided_restoration_highbd_c(uint16_t * dat,int width,int height,int stride,int bit_depth,int eps,int * xqd,uint16_t * dst,int dst_stride,int32_t * tmpbuf)1451 void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
1452                                            int stride, int bit_depth, int eps,
1453                                            int *xqd, uint16_t *dst,
1454                                            int dst_stride, int32_t *tmpbuf) {
1455   int xq[2];
1456   int32_t *flt1 = tmpbuf;
1457   int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
1458   int i, j;
1459   assert(width * height <= RESTORATION_TILEPELS_MAX);
1460 #if USE_HIGHPASS_IN_SGRPROJ
1461   av1_highpass_filter_highbd_c(dat, width, height, stride, flt1, width,
1462                                sgr_params[eps].corner, sgr_params[eps].edge);
1463 #else
1464   av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
1465                                       bit_depth, sgr_params[eps].r1,
1466                                       sgr_params[eps].e1);
1467 #endif  // USE_HIGHPASS_IN_SGRPROJ
1468   av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
1469                                       bit_depth, sgr_params[eps].r2,
1470                                       sgr_params[eps].e2);
1471   decode_xq(xqd, xq);
1472   for (i = 0; i < height; ++i) {
1473     for (j = 0; j < width; ++j) {
1474       const int k = i * width + j;
1475       const int l = i * stride + j;
1476       const int m = i * dst_stride + j;
1477       const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
1478       const int32_t f1 = (int32_t)flt1[k] - u;
1479       const int32_t f2 = (int32_t)flt2[k] - u;
1480       const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
1481       const int16_t w =
1482           (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1483       dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
1484     }
1485   }
1486 }
1487 
// Applies self-guided restoration to one restoration tile of a
// high-bitdepth plane.
//
// A tile whose restoration_type is RESTORE_NONE is copied with
// loop_copy_tile_highbd(). Otherwise the tile limits are obtained from
// av1_get_rest_tile_limits() and the tile is walked in processing units of
// procunit_width columns, each handed to
// apply_selfguided_restoration_highbd() with the tile's sgrproj parameters.
//
// With CONFIG_STRIPED_LOOP_RESTORATION the row step is not fixed: it is the
// value returned by setup_processing_stripe_boundary() for each stripe, and
// restore_processing_stripe_boundary() is called after each stripe.
// Without it, the row step is rst->rsi->procunit_height.
static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
                                            int width, int height, int stride,
                                            RestorationInternal *rst,
                                            int bit_depth, uint16_t *dst,
                                            int dst_stride) {
  const int unit_w = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  int unit_h;  // Assigned for each stripe before it is used as the row step.
#else
  const int unit_h = rst->rsi->procunit_height;
#endif

  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
                          dst_stride);
    return;
  }

#if CONFIG_STRIPED_LOOP_RESTORATION
  RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif

  for (int row = limits.v_start; row < limits.v_end; row += unit_h) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    const int rows_here = setup_processing_stripe_boundary(
        row, limits.v_end, limits.h_start, limits.h_end, (uint8_t *)data,
        stride, rst, 1);
    unit_h = rows_here;
#else
    const int rows_here = AOMMIN(unit_h, limits.v_end - row);
#endif

    for (int col = limits.h_start; col < limits.h_end; col += unit_w) {
      const int cols_here = AOMMIN(unit_w, limits.h_end - col);
      uint16_t *const src_unit = data + row * stride + col;
      uint16_t *const dst_unit = dst + row * dst_stride + col;
      apply_selfguided_restoration_highbd(
          src_unit, cols_here, rows_here, stride, bit_depth,
          rst->rsi->sgrproj_info[tile_idx].ep,
          rst->rsi->sgrproj_info[tile_idx].xqd, dst_unit, dst_stride,
          rst->tmpbuf);
    }

#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(row, limits.v_end, limits.h_start,
                                       limits.h_end, (uint8_t *)data, stride,
                                       rst, 1);
#endif
  }
}
1538 
// Frame-level self-guided restoration for a high-bitdepth plane.
//
// data8 and dst8 are passed through CONVERT_TO_SHORTPTR() to obtain the
// 16-bit sample pointers. extend_frame_highbd() is called on the source with
// the SGRPROJ border sizes, then each of the rst->ntiles restoration tiles is
// processed by loop_sgrproj_filter_tile_highbd().
static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
                                       int stride, RestorationInternal *rst,
                                       int bit_depth, uint8_t *dst8,
                                       int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  extend_frame_highbd(src16, width, height, stride, SGRPROJ_BORDER_HORZ,
                      SGRPROJ_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_sgrproj_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                    bit_depth, dst16, dst_stride);
  }
}
1553 
// Frame-level restoration for a high-bitdepth plane where the filter is
// chosen per restoration tile.
//
// extend_frame_highbd() is called on the source with the generic
// RESTORATION border sizes. Each tile is then dispatched on
// rst->rsi->restoration_type[tile]: RESTORE_NONE copies the tile,
// RESTORE_WIENER and RESTORE_SGRPROJ run the matching per-tile filter. Any
// other type leaves the tile's destination untouched.
static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
                                          int stride, RestorationInternal *rst,
                                          int bit_depth, uint8_t *dst8,
                                          int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  extend_frame_highbd(src16, width, height, stride, RESTORATION_BORDER_HORZ,
                      RESTORATION_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    switch (rst->rsi->restoration_type[tile]) {
      case RESTORE_NONE:
        loop_copy_tile_highbd(src16, tile, width, height, stride, rst, dst16,
                              dst_stride);
        break;
      case RESTORE_WIENER:
        loop_wiener_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                       bit_depth, dst16, dst_stride);
        break;
      case RESTORE_SGRPROJ:
        loop_sgrproj_filter_tile_highbd(src16, tile, width, height, stride,
                                        rst, bit_depth, dst16, dst_stride);
        break;
      default: break;
    }
  }
}
1576 #endif  // CONFIG_HIGHBITDEPTH
1577 
// Runs the frame-level restoration function selected by
// rsi[plane].frame_restoration_type over rows [start_row, end_row) of a
// single plane.
//
// cm->rst_internal is (re)configured for the plane first: the tile layout
// comes from av1_get_rest_ntiles() using the full plane width/height and
// cm->rst_info[plane].restoration_tilesize, and rsi is pointed at rsi[plane].
// With CONFIG_STRIPED_LOOP_RESTORATION the component index and ss_y are
// recorded as well.
//
// src/dst_buf are the plane base pointers; the row offset is applied here.
// The caller must only invoke this when the plane's frame_restoration_type is
// not RESTORE_NONE (that entry of the dispatch tables is NULL).
static void loop_restoration_plane_rows(AV1_COMMON *cm, RestorationInfo *rsi,
                                        int plane, int ss_y, uint8_t *src,
                                        int width, int height, int stride,
                                        int start_row, int end_row,
                                        uint8_t *dst_buf, int dst_stride) {
  restore_func_type restore_funcs[RESTORE_TYPES] = {
    NULL, loop_wiener_filter, loop_sgrproj_filter, loop_switchable_filter
  };
#if CONFIG_HIGHBITDEPTH
  restore_func_highbd_type restore_funcs_highbd[RESTORE_TYPES] = {
    NULL, loop_wiener_filter_highbd, loop_sgrproj_filter_highbd,
    loop_switchable_filter_highbd
  };
#endif  // CONFIG_HIGHBITDEPTH
  RestorationInternal *const rst = &cm->rst_internal;

  (void)ss_y;  // Only consumed when striped loop restoration is enabled.

  rst->ntiles = av1_get_rest_ntiles(
      width, height, cm->rst_info[plane].restoration_tilesize,
      &rst->tile_width, &rst->tile_height, &rst->nhtiles, &rst->nvtiles);
  rst->rsi = &rsi[plane];
#if CONFIG_STRIPED_LOOP_RESTORATION
  rst->component = plane;
  rst->subsampling_y = ss_y;
#endif

#if CONFIG_HIGHBITDEPTH
  if (cm->use_highbitdepth) {
    restore_funcs_highbd[rst->rsi->frame_restoration_type](
        src + start_row * stride, width, end_row - start_row, stride, rst,
        cm->bit_depth, dst_buf + start_row * dst_stride, dst_stride);
    return;
  }
#endif  // CONFIG_HIGHBITDEPTH
  restore_funcs[rst->rsi->frame_restoration_type](
      src + start_row * stride, width, end_row - start_row, stride, rst,
      dst_buf + start_row * dst_stride, dst_stride);
}

// Applies loop restoration to the mi rows [start_mi_row, end_mi_row) of the
// planes selected by components_pattern (bit AOM_PLANE_Y/U/V set => plane is
// processed).
//
//   frame  source frame; also the destination when dst is NULL.
//   cm     common state; cm->rst_internal is overwritten per plane.
//   rsi    per-plane restoration info, indexed by plane.
//   dst    optional output frame. When NULL, a scratch frame is allocated,
//          filtered into, copied back over `frame` for the selected planes,
//          and freed.
//
// When components_pattern selects exactly one plane, or all three, and every
// selected plane is RESTORE_NONE, the function only copies frame -> dst (if
// dst is given) and returns. Otherwise each selected plane is either filtered
// (type != RESTORE_NONE) or copied whole into dst.
//
// A failed scratch allocation is reported through aom_internal_error().
//
// NOTE(review): in the dst == NULL path the copy-back helpers are invoked on
// whole planes while only the requested row range was filtered into the
// scratch frame -- confirm callers never combine dst == NULL with a partial
// row range.
static void loop_restoration_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                  int start_mi_row, int end_mi_row,
                                  int components_pattern, RestorationInfo *rsi,
                                  YV12_BUFFER_CONFIG *dst) {
  const int ywidth = frame->y_crop_width;
  const int yheight = frame->y_crop_height;
  const int uvwidth = frame->uv_crop_width;
  const int uvheight = frame->uv_crop_height;
  const int ystride = frame->y_stride;
  const int uvstride = frame->uv_stride;
  const int ystart = start_mi_row << MI_SIZE_LOG2;
  const int uvstart = ystart >> cm->subsampling_y;
  // The chroma end row is derived from the unclamped luma end row; both are
  // then clamped to their plane heights.
  const int yend_unclamped = end_mi_row << MI_SIZE_LOG2;
  const int yend = AOMMIN(yend_unclamped, yheight);
  const int uvend = AOMMIN(yend_unclamped >> cm->subsampling_y, uvheight);
  const int all_planes =
      (1 << AOM_PLANE_Y) | (1 << AOM_PLANE_U) | (1 << AOM_PLANE_V);
  YV12_BUFFER_CONFIG dst_;

  // Fast paths: nothing to filter for the requested plane set.
  if (components_pattern == (1 << AOM_PLANE_Y)) {
    if (rsi[AOM_PLANE_Y].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_y(frame, dst);
      return;
    }
  } else if (components_pattern == (1 << AOM_PLANE_U)) {
    if (rsi[AOM_PLANE_U].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_u(frame, dst);
      return;
    }
  } else if (components_pattern == (1 << AOM_PLANE_V)) {
    if (rsi[AOM_PLANE_V].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_v(frame, dst);
      return;
    }
  } else if (components_pattern == all_planes) {
    if (rsi[AOM_PLANE_Y].frame_restoration_type == RESTORE_NONE &&
        rsi[AOM_PLANE_U].frame_restoration_type == RESTORE_NONE &&
        rsi[AOM_PLANE_V].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_frame(frame, dst);
      return;
    }
  }

  // No caller-supplied destination: filter into a scratch frame.
  if (!dst) {
    dst = &dst_;
    memset(dst, 0, sizeof(YV12_BUFFER_CONFIG));
    if (aom_realloc_frame_buffer(
            dst, ywidth, yheight, cm->subsampling_x, cm->subsampling_y,
#if CONFIG_HIGHBITDEPTH
            cm->use_highbitdepth,
#endif
            AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL) < 0)
      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                         "Failed to allocate restoration dst buffer");
  }

  if ((components_pattern >> AOM_PLANE_Y) & 1) {
    if (rsi[AOM_PLANE_Y].frame_restoration_type != RESTORE_NONE) {
      loop_restoration_plane_rows(cm, rsi, AOM_PLANE_Y, 0, frame->y_buffer,
                                  ywidth, yheight, ystride, ystart, yend,
                                  dst->y_buffer, dst->y_stride);
    } else {
      aom_yv12_copy_y(frame, dst);
    }
  }

  if ((components_pattern >> AOM_PLANE_U) & 1) {
    if (rsi[AOM_PLANE_U].frame_restoration_type != RESTORE_NONE) {
      loop_restoration_plane_rows(cm, rsi, AOM_PLANE_U, cm->subsampling_y,
                                  frame->u_buffer, uvwidth, uvheight, uvstride,
                                  uvstart, uvend, dst->u_buffer,
                                  dst->uv_stride);
    } else {
      aom_yv12_copy_u(frame, dst);
    }
  }

  if ((components_pattern >> AOM_PLANE_V) & 1) {
    if (rsi[AOM_PLANE_V].frame_restoration_type != RESTORE_NONE) {
      loop_restoration_plane_rows(cm, rsi, AOM_PLANE_V, cm->subsampling_y,
                                  frame->v_buffer, uvwidth, uvheight, uvstride,
                                  uvstart, uvend, dst->v_buffer,
                                  dst->uv_stride);
    } else {
      aom_yv12_copy_v(frame, dst);
    }
  }

  // Scratch destination: move the results back into `frame` and release it.
  if (dst == &dst_) {
    if ((components_pattern >> AOM_PLANE_Y) & 1) aom_yv12_copy_y(dst, frame);
    if ((components_pattern >> AOM_PLANE_U) & 1) aom_yv12_copy_u(dst, frame);
    if ((components_pattern >> AOM_PLANE_V) & 1) aom_yv12_copy_v(dst, frame);
    aom_free_frame_buffer(dst);
  }
}
1751 
// Entry point for loop restoration of a frame.
//
// The number of mi rows to filter is cm->mi_rows, or with
// CONFIG_FRAME_SUPERRES the superres-upscaled height aligned up to a multiple
// of 8 and shifted down by MI_SIZE_LOG2. When partial_frame is set and there
// are more than 8 such rows, filtering starts at half that count rounded down
// to a multiple of 8 and covers max(rows / 8, 8) rows.
//
// loop_restoration_init() is called on cm->rst_internal (flagging key
// frames) before the row range is handed to loop_restoration_rows().
void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                RestorationInfo *rsi, int components_pattern,
                                int partial_frame, YV12_BUFFER_CONFIG *dst) {
#if CONFIG_FRAME_SUPERRES
  int num_rows =
      ALIGN_POWER_OF_TWO(cm->superres_upscaled_height, 3) >> MI_SIZE_LOG2;
#else
  int num_rows = cm->mi_rows;
#endif  // CONFIG_FRAME_SUPERRES
  int first_row = 0;

  if (partial_frame && num_rows > 8) {
    first_row = num_rows >> 1;
    first_row &= 0xfffffff8;
    num_rows = AOMMAX(num_rows / 8, 8);
  }

  loop_restoration_init(&cm->rst_internal, cm->frame_type == KEY_FRAME);
  loop_restoration_rows(frame, cm, first_row, first_row + num_rows,
                        components_pattern, rsi, dst);
}
1773 
// Computes the half-open range of restoration tiles [*rcol0, *rcol1) x
// [*rrow0, *rrow1) associated with the superblock at (mi_row, mi_col) in the
// given plane, and stores the plane's horizontal tile count in *nhtiles.
//
// Returns 0 immediately (outputs untouched) when bsize is not the frame's
// superblock size. Otherwise returns nonzero iff the computed range is
// non-empty in both directions.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1, int *nhtiles) {
  assert(rcol0 && rcol1 && rrow0 && rrow1 && nhtiles);

  if (bsize != cm->sb_size) return 0;

  // mi_to_px / denom converts mi units to pixels of the frame the restoration
  // tiles are laid out on (the upscaled frame when superres is compiled in).
#if CONFIG_FRAME_SUPERRES
  const int frame_w = cm->superres_upscaled_width;
  const int frame_h = cm->superres_upscaled_height;
  const int mi_to_px = MI_SIZE * SCALE_NUMERATOR;
  const int denom = cm->superres_scale_denominator;
#else
  const int frame_w = cm->width;
  const int frame_h = cm->height;
  const int mi_to_px = MI_SIZE;
  const int denom = 1;
#endif  // CONFIG_FRAME_SUPERRES

  // Plane dimensions, rounding up for subsampled chroma planes.
  const int ss_x = plane > 0 && cm->subsampling_x != 0;
  const int ss_y = plane > 0 && cm->subsampling_y != 0;
  const int plane_w = (frame_w + ss_x) >> ss_x;
  const int plane_h = (frame_h + ss_y) >> ss_y;

  int rtile_w, rtile_h, nvtiles;
  av1_get_rest_ntiles(plane_w, plane_h,
                      cm->rst_info[plane].restoration_tilesize, &rtile_w,
                      &rtile_h, nhtiles, &nvtiles);

  // Size of one restoration tile expressed in the numerator's units.
  const int step_w = rtile_w * denom;
  const int step_h = rtile_h * denom;

  // First tile column/row that does not start before the superblock's
  // top-left corner: a ceiling division, so a superblock starting at tile
  // position 10.1 maps to tile index 11.
  *rcol0 = (mi_col * mi_to_px + (step_w - 1)) / step_w;
  *rrow0 = (mi_row * mi_to_px + (step_h - 1)) / step_h;

  // The end of the range is the same computation applied to the superblock
  // diagonally below-right, with two adjustments:
  //  - the result is clamped to the tile counts, because the last tile in a
  //    row/column may be extended (e.g. "2.4 tiles" across means 2 tiles);
  //  - if that neighbouring superblock lies at or beyond the frame edge in mi
  //    units, the range is forced to end at the tile count. This matters when
  //    the frame size rounded up to a superblock is still smaller than
  //    ntiles * tile size.
  const int next_mi_row = mi_row + mi_size_high[bsize];
  const int next_mi_col = mi_col + mi_size_wide[bsize];

  *rcol1 = (next_mi_col >= cm->mi_cols)
               ? *nhtiles
               : AOMMIN(*nhtiles,
                        (next_mi_col * mi_to_px + (step_w - 1)) / step_w);

  *rrow1 = (next_mi_row >= cm->mi_rows)
               ? nvtiles
               : AOMMIN(nvtiles,
                        (next_mi_row * mi_to_px + (step_h - 1)) / step_h);

  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
}
1840 
1841 #if CONFIG_STRIPED_LOOP_RESTORATION
1842 
// Pads one line of samples on both sides by replication: `extend` copies of
// the first sample are written immediately before the line and `extend`
// copies of the last sample immediately after it.
//
// buf points at the first sample of the line and width is the number of
// samples. When use_highbitdepth is nonzero, buf is reinterpreted as
// uint16_t samples. The caller must provide `extend` writable samples on
// each side of the line.
static void extend_line(uint8_t *buf, int width, int extend,
                        int use_highbitdepth) {
  if (use_highbitdepth) {
    uint16_t *const px = (uint16_t *)buf;
    const uint16_t first = px[0];
    for (int k = 1; k <= extend; ++k) px[-k] = first;
    const uint16_t last = px[width - 1];
    for (int k = 0; k < extend; ++k) px[width + k] = last;
    return;
  }

  const uint8_t first = buf[0];
  for (int k = 1; k <= extend; ++k) buf[-k] = first;
  const uint8_t last = buf[width - 1];
  for (int k = 0; k < extend; ++k) buf[width + k] = last;
}
1861 
1862 // For each 64 pixel high stripe, save 4 scan lines to be used as boundary in
1863 // the loop restoration process. The lines are saved in
1864 // rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm)1865 void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame,
1866                                               AV1_COMMON *cm) {
1867   int p, boundary_stride;
1868   int src_width, src_height, src_stride, stripe_height, stripe_offset, stripe_y,
1869       yy;
1870   uint8_t *src_buf, *boundary_below_buf, *boundary_above_buf;
1871   int use_highbitdepth = 0;
1872   for (p = 0; p < MAX_MB_PLANE; ++p) {
1873     if (p == 0) {
1874       src_buf = frame->y_buffer;
1875       src_width = frame->y_crop_width;
1876       src_height = frame->y_crop_height;
1877       src_stride = frame->y_stride;
1878       stripe_height = 64;
1879       stripe_offset = 56 - 2;  // offset of first line to copy
1880     } else {
1881       src_buf = p == 1 ? frame->u_buffer : frame->v_buffer;
1882       src_width = frame->uv_crop_width;
1883       src_height = frame->uv_crop_height;
1884       src_stride = frame->uv_stride;
1885       stripe_height = 64 >> cm->subsampling_y;
1886       stripe_offset = (56 >> cm->subsampling_y) - 2;
1887     }
1888     boundary_above_buf = cm->rst_internal.stripe_boundary_above[p];
1889     boundary_below_buf = cm->rst_internal.stripe_boundary_below[p];
1890     boundary_stride = cm->rst_internal.stripe_boundary_stride[p];
1891 #if CONFIG_HIGHBITDEPTH
1892     use_highbitdepth = cm->use_highbitdepth;
1893     if (use_highbitdepth) {
1894       src_buf = (uint8_t *)CONVERT_TO_SHORTPTR(src_buf);
1895     }
1896 #endif
1897     src_buf += (stripe_offset * src_stride) << use_highbitdepth;
1898     boundary_above_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
1899     boundary_below_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
1900     // Loop over stripes
1901     for (stripe_y = stripe_offset; stripe_y < src_height;
1902          stripe_y += stripe_height) {
1903       // Save 2 lines above the LR stripe (offset -9, -10)
1904       for (yy = 0; yy < 2; yy++) {
1905         if (stripe_y + yy < src_height) {
1906           memcpy(boundary_above_buf, src_buf, src_width << use_highbitdepth);
1907           extend_line(boundary_above_buf, src_width, RESTORATION_EXTRA_HORZ,
1908                       use_highbitdepth);
1909           src_buf += src_stride << use_highbitdepth;
1910           boundary_above_buf += boundary_stride << use_highbitdepth;
1911         }
1912       }
1913       // Save 2 lines below the LR stripe (offset 56,57)
1914       for (yy = 2; yy < 4; yy++) {
1915         if (stripe_y + yy < src_height) {
1916           memcpy(boundary_below_buf, src_buf, src_width << use_highbitdepth);
1917           extend_line(boundary_below_buf, src_width, RESTORATION_EXTRA_HORZ,
1918                       use_highbitdepth);
1919           src_buf += src_stride << use_highbitdepth;
1920           boundary_below_buf += boundary_stride << use_highbitdepth;
1921         }
1922       }
1923       // jump to next stripe
1924       src_buf += ((stripe_height - 4) * src_stride) << use_highbitdepth;
1925     }
1926   }
1927 }
1928 
1929 #endif  // CONFIG_STRIPED_LOOP_RESTORATION
1930