1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 *
11 */
12
13 #include <math.h>
14
15 #include "./aom_config.h"
16 #include "./aom_dsp_rtcd.h"
17 #include "./aom_scale_rtcd.h"
18 #include "av1/common/onyxc_int.h"
19 #include "av1/common/restoration.h"
20 #include "aom_dsp/aom_dsp_common.h"
21 #include "aom_mem/aom_mem.h"
22
23 #include "aom_ports/mem.h"
24
25 const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
26 #if USE_HIGHPASS_IN_SGRPROJ
27 // corner, edge, r2, eps2
28 { -1, 2, 1, 1 }, { -1, 2, 1, 2 }, { -1, 2, 1, 3 }, { -1, 2, 1, 4 },
29 { -1, 2, 1, 5 }, { -2, 3, 1, 2 }, { -2, 3, 1, 3 }, { -2, 3, 1, 4 },
30 { -2, 3, 1, 5 }, { -2, 3, 1, 6 }, { -3, 4, 1, 3 }, { -3, 4, 1, 4 },
31 { -3, 4, 1, 5 }, { -3, 4, 1, 6 }, { -3, 4, 1, 7 }, { -3, 4, 1, 8 }
32 #else
33 // r1, eps1, r2, eps2
34 #if MAX_RADIUS == 2
35 { 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 },
36 { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
37 { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 2, 30, 1, 2 },
38 { 2, 50, 1, 12 }, { 2, 60, 1, 13 }, { 2, 70, 1, 14 }, { 2, 80, 1, 15 },
39 #else
40 { 2, 12, 1, 4 }, { 2, 15, 1, 6 }, { 2, 18, 1, 8 }, { 2, 20, 1, 9 },
41 { 2, 22, 1, 10 }, { 2, 25, 1, 11 }, { 2, 35, 1, 12 }, { 2, 45, 1, 13 },
42 { 2, 55, 1, 14 }, { 2, 65, 1, 15 }, { 2, 75, 1, 16 }, { 3, 30, 1, 10 },
43 { 3, 50, 1, 12 }, { 3, 50, 2, 25 }, { 3, 60, 2, 35 }, { 3, 70, 2, 45 },
44 #endif // MAX_RADIUS == 2
45 #endif
46 };
47
48 typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
49 int stride, RestorationInternal *rst,
50 uint8_t *dst8, int dst_stride);
51 #if CONFIG_HIGHBITDEPTH
52 typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
53 int stride, RestorationInternal *rst,
54 int bit_depth, uint8_t *dst8,
55 int dst_stride);
56 #endif // CONFIG_HIGHBITDEPTH
57
// Releases and re-allocates the three per-tile arrays of rst_info
// (restoration_type, wiener_info, sgrproj_info) so that each holds one entry
// per restoration tile of a width x height plane. Only wiener_info is
// zero-filled. Allocation failures are reported through CHECK_MEM_ERROR.
// Returns the number of tiles as computed by av1_get_rest_ntiles().
int av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rst_info,
                                 int width, int height) {
  const int tile_count = av1_get_rest_ntiles(
      width, height, rst_info->restoration_tilesize, NULL, NULL, NULL, NULL);

  // Per-tile restoration type.
  aom_free(rst_info->restoration_type);
  const size_t type_bytes = sizeof(*rst_info->restoration_type) * tile_count;
  CHECK_MEM_ERROR(cm, rst_info->restoration_type,
                  (RestorationType *)aom_malloc(type_bytes));

  // Per-tile Wiener coefficients: 16-byte aligned and cleared.
  aom_free(rst_info->wiener_info);
  const size_t wiener_bytes = sizeof(*rst_info->wiener_info) * tile_count;
  CHECK_MEM_ERROR(cm, rst_info->wiener_info,
                  (WienerInfo *)aom_memalign(16, wiener_bytes));
  memset(rst_info->wiener_info, 0, wiener_bytes);

  // Per-tile self-guided projection parameters.
  aom_free(rst_info->sgrproj_info);
  const size_t sgrproj_bytes = sizeof(*rst_info->sgrproj_info) * tile_count;
  CHECK_MEM_ERROR(cm, rst_info->sgrproj_info,
                  (SgrprojInfo *)aom_malloc(sgrproj_bytes));

  return tile_count;
}
77
// Frees the three per-tile arrays owned by rst_info and resets the pointers
// to NULL so the structure can be freed or re-allocated again safely.
void av1_free_restoration_struct(RestorationInfo *rst_info) {
  // Release everything first ...
  aom_free(rst_info->restoration_type);
  aom_free(rst_info->wiener_info);
  aom_free(rst_info->sgrproj_info);

  // ... then drop the stale pointers.
  rst_info->restoration_type = NULL;
  rst_info->wiener_info = NULL;
  rst_info->sgrproj_info = NULL;
}
86
87 // TODO(debargha): This table can be substantially reduced since only a few
88 // values are actually used.
89 int sgrproj_mtable[MAX_EPS][MAX_NELEM];
90
GenSgrprojVtable()91 static void GenSgrprojVtable() {
92 int e, n;
93 for (e = 1; e <= MAX_EPS; ++e)
94 for (n = 1; n <= MAX_NELEM; ++n) {
95 const int n2e = n * n * e;
96 sgrproj_mtable[e - 1][n - 1] =
97 (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
98 }
99 }
100
// One-time precomputation for loop restoration: builds the sgrproj_mtable
// lookup table. Takes no arguments (explicit prototype).
void av1_loop_restoration_precal(void) { GenSgrprojVtable(); }
102
// Records in rst whether the frame about to be processed is a keyframe
// (kf is stored verbatim in rst->keyframe).
static void loop_restoration_init(RestorationInternal *rst, int kf) {
  const int keyframe_flag = kf;
  rst->keyframe = keyframe_flag;
}
106
// Pads a width x height image in place by edge replication.
//   data        - pointer to the top-left pixel of the image proper
//   stride      - distance in bytes between consecutive rows
//   border_horz - number of columns to add on the left and on the right
//   border_vert - number of rows to add above and below
// The buffer around `data` must already have room for the borders.
void extend_frame(uint8_t *data, int width, int height, int stride,
                  int border_horz, int border_vert) {
  // Step 1: widen every image row by repeating its first and last pixel.
  for (int row = 0; row < height; ++row) {
    uint8_t *const line = data + row * stride;
    memset(line - border_horz, line[0], border_horz);
    memset(line + width, line[width - 1], border_horz);
  }

  // Step 2: replicate the (already widened) first and last rows vertically.
  uint8_t *const padded_origin = data - border_horz;
  const int padded_width = width + 2 * border_horz;

  for (int row = -border_vert; row < 0; ++row) {
    memcpy(padded_origin + row * stride, padded_origin, padded_width);
  }

  const uint8_t *const last_line = padded_origin + (height - 1) * stride;
  for (int row = height; row < height + border_vert; ++row) {
    memcpy(padded_origin + row * stride, last_line, padded_width);
  }
}
125
126 #if CONFIG_STRIPED_LOOP_RESTORATION
127
128 // This function setup a processing stripe by replacing the vertical
129 // stripe boundary (2 lines above and 2 lines below) by data coming
130 // from the above/below buffers. Before doing so the original
131 // frame data is saved into a temporary buffer, such that it
132 // can be restored by the restore_processing_stripe_boundary
133 // function after the filtering of the processing stripe.
134 // Returns the height of the processing stripe
setup_processing_stripe_boundary(int y0,int v_end,int h_start,int h_end,uint8_t * data,int stride,RestorationInternal * rst,int use_highbd)135 static int setup_processing_stripe_boundary(int y0, int v_end, int h_start,
136 int h_end, uint8_t *data,
137 int stride,
138 RestorationInternal *rst,
139 int use_highbd) {
140 int y, y_stripe_topmost, stripe_index, i;
141 int tile_offset = RESTORATION_TILE_OFFSET >> rst->subsampling_y;
142 int stripe_height = rst->rsi->procunit_height;
143 int comp = rst->component;
144 uint8_t *boundary_above_buf = rst->stripe_boundary_above[comp];
145 uint8_t *boundary_below_buf = rst->stripe_boundary_below[comp];
146 int boundary_stride = rst->stripe_boundary_stride[comp];
147 int x0 = h_start - RESTORATION_EXTRA_HORZ;
148 int x1 = h_end + RESTORATION_EXTRA_HORZ;
149
150 stripe_index = (y0 + tile_offset) / stripe_height;
151 y_stripe_topmost = stripe_index * stripe_height - tile_offset;
152 boundary_above_buf +=
153 ((stripe_index - 1) * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
154 << use_highbd;
155 boundary_below_buf +=
156 (stripe_index * 2 * boundary_stride + RESTORATION_EXTRA_HORZ)
157 << use_highbd;
158
159 // setup the 2 lines above the stripe
160 for (i = 0; i < 2; i++) {
161 y = y_stripe_topmost - 2 + i;
162 if (y >= 0 && y < y0 && y >= y0 - 2) {
163 uint8_t *p = data + ((y * stride + x0) << use_highbd);
164 uint8_t *new_data =
165 boundary_above_buf + ((i * boundary_stride + x0) << use_highbd);
166 // printf("above %3d %3d: %08x %08x : %08x %08x\n", y, x0,
167 // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0],
168 // ((uint32_t*)new_data)[1]);
169 // Save old pixels
170 memcpy(rst->tmp_save_above[i], p, (x1 - x0) << use_highbd);
171 // Replace width pixels from boundary_above_buf
172 memcpy(p, new_data, (x1 - x0) << use_highbd);
173 }
174 }
175 // setup the 2 lines below the stripe
176 for (i = 0; i < 2; i++) {
177 y = y_stripe_topmost + stripe_height + i;
178 if (y < v_end + 2) {
179 uint8_t *p = data + ((y * stride + x0) << use_highbd);
180 uint8_t *new_data =
181 boundary_below_buf + ((i * boundary_stride + x0) << use_highbd);
182 // printf("below %3d %3d: %08x %08x : %08x %08x\n", y, x0,
183 // ((uint32_t*)p)[0], ((uint32_t*)p)[1], ((uint32_t*)new_data)[0],
184 // ((uint32_t*)new_data)[1]);
185 // Save old pixels
186 memcpy(rst->tmp_save_below[i], p, (x1 - x0) << use_highbd);
187 // Replace width pixels from boundary_below_buf
188 memcpy(p, new_data, (x1 - x0) << use_highbd);
189 }
190 }
191 // Return actual stripe height
192 return AOMMIN(v_end, y_stripe_topmost + stripe_height) - y0;
193 }
194
195 // This function restores the boundary lines modified by
196 // setup_processing_stripe_boundary.
restore_processing_stripe_boundary(int y0,int v_end,int h_start,int h_end,uint8_t * data,int stride,RestorationInternal * rst,int use_highbd)197 static void restore_processing_stripe_boundary(int y0, int v_end, int h_start,
198 int h_end, uint8_t *data,
199 int stride,
200 RestorationInternal *rst,
201 int use_highbd) {
202 int y, y_stripe_topmost, i, stripe_index;
203 int tile_offset = 8 >> rst->subsampling_y;
204 int stripe_height = rst->rsi->procunit_height;
205 int x0 = h_start - RESTORATION_EXTRA_HORZ;
206 int x1 = h_end + RESTORATION_EXTRA_HORZ;
207
208 stripe_index = (y0 + tile_offset) / stripe_height;
209 y_stripe_topmost = stripe_index * stripe_height - tile_offset;
210
211 // restore the 2 lines above the stripe
212 for (i = 0; i < 2; i++) {
213 y = y_stripe_topmost - 2 + i;
214 if (y >= 0 && y < y0 && y >= y0 - 2) {
215 uint8_t *p = data + ((y * stride + x0) << use_highbd);
216 memcpy(p, rst->tmp_save_above[i], (x1 - x0) << use_highbd);
217 }
218 }
219 // restore the 2 lines below the stripe
220 for (i = 0; i < 2; i++) {
221 y = y_stripe_topmost + stripe_height + i;
222 if (y < v_end + 2) {
223 uint8_t *p = data + ((y * stride + x0) << use_highbd);
224 memcpy(p, rst->tmp_save_below[i], (x1 - x0) << use_highbd);
225 }
226 }
227 }
228
229 #endif
230
// Copies restoration tile `tile_idx` unchanged from data to dst, one row at
// a time. The tile rectangle is obtained from av1_get_rest_tile_limits().
static void loop_copy_tile(uint8_t *data, int tile_idx, int width, int height,
                           int stride, RestorationInternal *rst, uint8_t *dst,
                           int dst_stride) {
#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits tile = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const RestorationTileLimits tile = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif
  const int row_bytes = tile.h_end - tile.h_start;

  for (int row = tile.v_start; row < tile.v_end; ++row) {
    const uint8_t *const from = data + row * stride + tile.h_start;
    uint8_t *const to = dst + row * dst_stride + tile.h_start;
    memcpy(to, from, row_bytes);
  }
}
247
// Builds in `vert` a copy of the kernel `orig` adapted to a row that has
// only `boundary_dist` (0, 1 or 2) valid rows on its outer side: the taps
// that would reach past the boundary are folded into the nearest remaining
// tap and zeroed. Other values of boundary_dist leave the copy unchanged.
// When istop is zero, taps 0..2 are then exchanged with their mirror taps
// at the other end of the WIENER_WIN-tap window.
static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
                                   int boundary_dist, int istop) {
  memcpy(vert, orig, sizeof(InterpKernel));

  if (boundary_dist == 0) {
    vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
    vert[0] = 0;
    vert[1] = 0;
    vert[2] = 0;
  } else if (boundary_dist == 1) {
    vert[2] += vert[1] + vert[0];
    vert[0] = 0;
    vert[1] = 0;
  } else if (boundary_dist == 2) {
    vert[1] += vert[0];
    vert[0] = 0;
  }

  if (istop) return;

  // Mirror the three outer tap pairs.
  for (int k = 0; k < 3; ++k) {
    const int mirror = WIENER_WIN - 1 - k;
    const int held = vert[k];
    vert[k] = vert[mirror];
    vert[mirror] = held;
  }
}
279
// Convolution routine used by loop_wiener_filter_tile, selected once here
// instead of at every call site.
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
#define WIENER_TILE_CONVOLVE aom_convolve8_add_src_hip
#else
#define WIENER_TILE_CONVOLVE aom_convolve8_add_src
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION

// Applies the Wiener filter of restoration tile `tile_idx` to data, writing
// the result to dst. Tiles whose type is RESTORE_NONE are copied unchanged.
// The tile is processed in blocks of at most procunit_width columns; in each
// block the first and last (WIENER_HALFWIN - WIENER_BORDER_VERT) rows are
// filtered one at a time with a vertical kernel stepped down by
// stepdown_wiener_kernel(), and the rows in between use the full kernel.
static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
                                    int height, int stride,
                                    RestorationInternal *rst, uint8_t *dst,
                                    int dst_stride) {
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride);
    return;
  }

  const WienerInfo *const wiener = &rst->rsi->wiener_info[tile_idx];
  const int unit_width = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  int unit_height;  // Assigned for every stripe before it is used.
#else
  const int unit_height = rst->rsi->procunit_height;
#endif
  // Number of rows at each vertical end of a block that need a reduced
  // kernel.
  const int edge_rows = WIENER_HALFWIN - WIENER_BORDER_VERT;
  InterpKernel edge_vfilter;

#if CONFIG_STRIPED_LOOP_RESTORATION
  RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif

  // Convolve the whole tile (done in blocks here to match the requirements
  // of the vectorized convolve functions, but the result is equivalent)
  for (int row = limits.v_start; row < limits.v_end; row += unit_height) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    int h = setup_processing_stripe_boundary(
        row, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
    h = ALIGN_POWER_OF_TWO(h, 1);
    unit_height = h;
#else
    const int h = AOMMIN(unit_height, (limits.v_end - row + 15) & ~15);
#endif
    const int body_rows = h - 2 * edge_rows;

    for (int col = limits.h_start; col < limits.h_end; col += unit_width) {
      const int w = AOMMIN(unit_width, (limits.h_end - col + 15) & ~15);
      const uint8_t *src_p = data + row * stride + col;
      uint8_t *dst_p = dst + row * dst_stride + col;

      // Top rows of the block, one at a time.
      for (int b = 0; b < edge_rows; ++b) {
        stepdown_wiener_kernel(wiener->vfilter, edge_vfilter,
                               WIENER_BORDER_VERT + b, 1);
        WIENER_TILE_CONVOLVE(src_p, stride, dst_p, dst_stride, wiener->hfilter,
                             16, edge_vfilter, 16, w, 1);
        src_p += stride;
        dst_p += dst_stride;
      }

      // Interior rows with the unmodified vertical kernel.
      WIENER_TILE_CONVOLVE(src_p, stride, dst_p, dst_stride, wiener->hfilter,
                           16, wiener->vfilter, 16, w, body_rows);
      src_p += stride * body_rows;
      dst_p += dst_stride * body_rows;

      // Bottom rows of the block, one at a time.
      for (int b = edge_rows - 1; b >= 0; --b) {
        stepdown_wiener_kernel(wiener->vfilter, edge_vfilter,
                               WIENER_BORDER_VERT + b, 0);
        WIENER_TILE_CONVOLVE(src_p, stride, dst_p, dst_stride, wiener->hfilter,
                             16, edge_vfilter, 16, w, 1);
        src_p += stride;
        dst_p += dst_stride;
      }
    }
#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(row, limits.v_end, limits.h_start,
                                       limits.h_end, data, stride, rst, 0);
#endif
  }
}
#undef WIENER_TILE_CONVOLVE
371
// Wiener-filters a whole plane: pads the source by WIENER_BORDER_HORZ /
// WIENER_BORDER_VERT pixels via extend_frame(), then filters every
// restoration tile in index order.
static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
                               RestorationInternal *rst, uint8_t *dst,
                               int dst_stride) {
  extend_frame(data, width, height, stride, WIENER_BORDER_HORZ,
               WIENER_BORDER_VERT);

  int tile = 0;
  while (tile < rst->ntiles) {
    loop_wiener_filter_tile(data, tile, width, height, stride, rst, dst,
                            dst_stride);
    ++tile;
  }
}
383
/* Windowed sums (sqr == 0) or windowed sums of squares (sqr != 0) over the
   input. The window is (2r + 1) x (2r + 1); dedicated versions exist for
   r = 1, 2, 3 and a generic fallback handles r > 3.

   Every version works the same way: a window's worth of input is held in
   local variables, and the window slides by shifting those variables and
   loading a single new sample. Windows are truncated at the array edges.
*/

// r = 1 (3x3 window). Requires width >= 3 and height >= 3.
static void boxsum1(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  // Pass 1: for each column, vertical 3-row sums of src go into dst.
  for (int col = 0; col < width; ++col) {
    const int32_t *const in = src + col;
    int32_t *const out = dst + col;
    int up = in[0];
    int mid = in[src_stride];
    int down = in[2 * src_stride];
    if (sqr) {
      up *= up;
      mid *= mid;
      down *= down;
    }

    out[0] = up + mid;
    int row = 1;
    for (; row < height - 2; ++row) {
      // Here up/mid/down hold the (optionally squared) samples of rows
      // row - 1, row and row + 1.
      out[row * dst_stride] = up + mid + down;
      up = mid;
      mid = down;
      down = in[(row + 2) * src_stride];
      if (sqr) down *= down;
    }
    out[row * dst_stride] = up + mid + down;
    out[(row + 1) * dst_stride] = mid + down;
  }

  // Pass 2: horizontal 3-column sums, computed in place on dst.
  for (int row = 0; row < height; ++row) {
    int32_t *const line = dst + row * dst_stride;
    int left = line[0];
    int centre = line[1];
    int right = line[2];

    line[0] = left + centre;
    int col = 1;
    for (; col < width - 2; ++col) {
      // Here left/centre/right hold the pass-1 values of columns
      // col - 1, col and col + 1.
      line[col] = left + centre + right;
      left = centre;
      centre = right;
      right = line[col + 2];
    }
    line[col] = left + centre + right;
    line[col + 1] = centre + right;
  }
}
455
// Box sums for r = 2 (5x5 window, truncated at the array edges) of src, or
// of its squares when sqr is non-zero. Requires width >= 5 and height >= 5.
static void boxsum2(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int w[5];  // Sliding window of five consecutive samples.

  // Pass 1: vertical 5-row sums, from src into dst.
  for (int col = 0; col < width; ++col) {
    for (int k = 0; k < 5; ++k) {
      const int v = src[k * src_stride + col];
      w[k] = sqr ? v * v : v;
    }

    dst[col] = w[0] + w[1] + w[2];
    dst[dst_stride + col] = w[0] + w[1] + w[2] + w[3];
    int row = 2;
    for (; row < height - 3; ++row) {
      // w[k] holds the (optionally squared) sample of row (row - 2 + k).
      dst[row * dst_stride + col] = w[0] + w[1] + w[2] + w[3] + w[4];
      w[0] = w[1];
      w[1] = w[2];
      w[2] = w[3];
      w[3] = w[4];
      const int v = src[(row + 3) * src_stride + col];
      w[4] = sqr ? v * v : v;
    }
    dst[row * dst_stride + col] = w[0] + w[1] + w[2] + w[3] + w[4];
    dst[(row + 1) * dst_stride + col] = w[1] + w[2] + w[3] + w[4];
    dst[(row + 2) * dst_stride + col] = w[2] + w[3] + w[4];
  }

  // Pass 2: horizontal 5-column sums, in place on dst.
  for (int row = 0; row < height; ++row) {
    int32_t *const line = dst + row * dst_stride;
    for (int k = 0; k < 5; ++k) w[k] = line[k];

    line[0] = w[0] + w[1] + w[2];
    line[1] = w[0] + w[1] + w[2] + w[3];
    int col = 2;
    for (; col < width - 3; ++col) {
      // w[k] holds the pass-1 value of column (col - 2 + k).
      line[col] = w[0] + w[1] + w[2] + w[3] + w[4];
      w[0] = w[1];
      w[1] = w[2];
      w[2] = w[3];
      w[3] = w[4];
      w[4] = line[col + 3];
    }
    line[col] = w[0] + w[1] + w[2] + w[3] + w[4];
    line[col + 1] = w[1] + w[2] + w[3] + w[4];
    line[col + 2] = w[2] + w[3] + w[4];
  }
}
542
// Box sums for r = 3 (7x7 window, truncated at the array edges) of src, or
// of its squares when sqr is non-zero. Requires width >= 7 and height >= 7.
static void boxsum3(int32_t *src, int width, int height, int src_stride,
                    int sqr, int32_t *dst, int dst_stride) {
  int w[7];  // Sliding window of seven consecutive samples.

  // Pass 1: vertical 7-row sums, from src into dst.
  for (int col = 0; col < width; ++col) {
    for (int k = 0; k < 7; ++k) {
      const int v = src[k * src_stride + col];
      w[k] = sqr ? v * v : v;
    }

    // The first three rows see a growing, truncated window.
    int lead = w[0] + w[1] + w[2] + w[3];
    dst[col] = lead;
    lead += w[4];
    dst[dst_stride + col] = lead;
    lead += w[5];
    dst[2 * dst_stride + col] = lead;

    int row = 3;
    for (; row < height - 4; ++row) {
      // w[k] holds the (optionally squared) sample of row (row - 3 + k).
      dst[row * dst_stride + col] =
          w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
      for (int k = 0; k < 6; ++k) w[k] = w[k + 1];
      const int v = src[(row + 4) * src_stride + col];
      w[6] = sqr ? v * v : v;
    }
    dst[row * dst_stride + col] =
        w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
    dst[(row + 1) * dst_stride + col] = w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
    dst[(row + 2) * dst_stride + col] = w[2] + w[3] + w[4] + w[5] + w[6];
    dst[(row + 3) * dst_stride + col] = w[3] + w[4] + w[5] + w[6];
  }

  // Pass 2: horizontal 7-column sums, in place on dst.
  for (int row = 0; row < height; ++row) {
    int32_t *const line = dst + row * dst_stride;
    for (int k = 0; k < 7; ++k) w[k] = line[k];

    int lead = w[0] + w[1] + w[2] + w[3];
    line[0] = lead;
    lead += w[4];
    line[1] = lead;
    lead += w[5];
    line[2] = lead;

    int col = 3;
    for (; col < width - 4; ++col) {
      // w[k] holds the pass-1 value of column (col - 3 + k).
      line[col] = w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
      for (int k = 0; k < 6; ++k) w[k] = w[k + 1];
      w[6] = line[col + 4];
    }
    line[col] = w[0] + w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
    line[col + 1] = w[1] + w[2] + w[3] + w[4] + w[5] + w[6];
    line[col + 2] = w[2] + w[3] + w[4] + w[5] + w[6];
    line[col + 3] = w[3] + w[4] + w[5] + w[6];
  }
}
635
// Generic version for any r. To be removed after experiments are done.
//
// Computes (2r + 1) x (2r + 1) box sums of src (or of its squares when sqr
// is non-zero) into dst, truncating the window at the array edges. It works
// with running totals: first down each column, then along each row, taking
// differences of totals 2r + 1 apart.
//
// Assumes width > 2 * r and height > 2 * r (rows/columns up to index 2 * r
// are read unconditionally). If the scratch buffer cannot be allocated the
// function returns without writing to dst.
static void boxsumr(int32_t *src, int width, int height, int src_stride, int r,
                    int sqr, int32_t *dst, int dst_stride) {
  const int tmp_stride = width;
  // Do the size arithmetic in size_t so width * height cannot overflow int.
  int32_t *tmp = aom_malloc((size_t)width * height * sizeof(*tmp));
  int i, j;
  if (tmp == NULL) return;

  // tmp[i][j] = running total of column j over rows 0..i.
  if (sqr) {
    for (j = 0; j < width; ++j) tmp[j] = src[j] * src[j];
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] +
            src[i * src_stride + j] * src[i * src_stride + j];
  } else {
    memcpy(tmp, src, sizeof(*tmp) * width);
    for (j = 0; j < width; ++j)
      for (i = 1; i < height; ++i)
        tmp[i * tmp_stride + j] =
            tmp[(i - 1) * tmp_stride + j] + src[i * src_stride + j];
  }

  // Vertical window sums into dst: top rows (window cut off above), middle
  // rows (full window), bottom rows (window cut off below).
  for (i = 0; i <= r; ++i)
    memcpy(&dst[i * dst_stride], &tmp[(i + r) * tmp_stride],
           sizeof(*tmp) * width);
  for (i = r + 1; i < height - r; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] =
          tmp[(i + r) * tmp_stride + j] - tmp[(i - r - 1) * tmp_stride + j];
  for (i = height - r; i < height; ++i)
    for (j = 0; j < width; ++j)
      dst[i * dst_stride + j] = tmp[(height - 1) * tmp_stride + j] -
                                tmp[(i - r - 1) * tmp_stride + j];

  // tmp[i][j] = running total of row i of dst over columns 0..j.
  // dst must be indexed with dst_stride here; using src_stride (as the code
  // previously did) reads the wrong rows whenever the two strides differ.
  for (i = 0; i < height; ++i) tmp[i * tmp_stride] = dst[i * dst_stride];
  for (i = 0; i < height; ++i)
    for (j = 1; j < width; ++j)
      tmp[i * tmp_stride + j] =
          tmp[i * tmp_stride + j - 1] + dst[i * dst_stride + j];

  // Horizontal window sums back into dst: left, middle and right columns.
  for (j = 0; j <= r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] = tmp[i * tmp_stride + j + r];
  for (j = r + 1; j < width - r; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + j + r] - tmp[i * tmp_stride + j - r - 1];
  for (j = width - r; j < width; ++j)
    for (i = 0; i < height; ++i)
      dst[i * dst_stride + j] =
          tmp[i * tmp_stride + width - 1] - tmp[i * tmp_stride + j - r - 1];
  aom_free(tmp);
}
687
// Dispatches to the box-sum implementation for radius r: the specialised
// versions for r = 1, 2, 3, and the generic boxsumr() for anything else.
static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
                   int sqr, int32_t *dst, int dst_stride) {
  switch (r) {
    case 1:
      boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 2:
      boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    case 3:
      boxsum3(src, width, height, src_stride, sqr, dst, dst_stride);
      break;
    default:
      boxsumr(src, width, height, src_stride, r, sqr, dst, dst_stride);
      break;
  }
}
699
// Fills num (row pitch num_stride) with, for every position of a
// width x height grid, the number of grid cells covered by the
// (2r + 1) x (2r + 1) window centred there once the window is clipped to
// the grid. The grid is written in four phases: corners, left/right bands,
// top/bottom bands, interior.
static void boxnum(int width, int height, int r, int8_t *num, int num_stride) {
  const int full = 2 * r + 1;
  const int last_row = height - 1;
  const int last_col = width - 1;

  // Corners: computed for the top-left one and mirrored to the other three.
  for (int i = 0; i <= r; ++i) {
    int8_t *const top = num + i * num_stride;
    int8_t *const bottom = num + (last_row - i) * num_stride;
    for (int j = 0; j <= r; ++j) {
      top[j] = (r + 1 + i) * (r + 1 + j);
      top[last_col - j] = top[j];
      bottom[j] = top[j];
      bottom[last_col - j] = top[j];
    }
  }

  // Left and right bands: full window height, clipped width.
  for (int j = 0; j <= r; ++j) {
    const int count = full * (r + 1 + j);
    for (int i = r + 1; i < height - r; ++i) {
      int8_t *const line = num + i * num_stride;
      line[j] = count;
      line[last_col - j] = count;
    }
  }

  // Top and bottom bands: full window width, clipped height.
  for (int i = 0; i <= r; ++i) {
    const int count = full * (r + 1 + i);
    for (int j = r + 1; j < width - r; ++j) {
      num[i * num_stride + j] = count;
      num[(last_row - i) * num_stride + j] = count;
    }
  }

  // Interior: the window fits entirely.
  for (int i = r + 1; i < height - r; ++i) {
    int8_t *const line = num + i * num_stride;
    for (int j = r + 1; j < width - r; ++j) {
      line[j] = full * full;
    }
  }
}
731
decode_xq(int * xqd,int * xq)732 void decode_xq(int *xqd, int *xq) {
733 xq[0] = xqd[0];
734 xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
735 }
736
// 256-entry lookup table indexed by x in 0..255. The entries appear to be
// round(256 * x / (x + 1)); note that the final entry is 256, not 255.
// Laid out 16 entries per row; the comment gives the index of the first
// entry in each row.
const int32_t x_by_xplus1[256] = {
  /*   0 */ 0,   128, 171, 192, 205, 213, 219, 224,
            228, 230, 233, 235, 236, 238, 239, 240,
  /*  16 */ 241, 242, 243, 243, 244, 244, 245, 245,
            246, 246, 247, 247, 247, 247, 248, 248,
  /*  32 */ 248, 248, 249, 249, 249, 249, 249, 250,
            250, 250, 250, 250, 250, 250, 251, 251,
  /*  48 */ 251, 251, 251, 251, 251, 251, 251, 251,
            252, 252, 252, 252, 252, 252, 252, 252,
  /*  64 */ 252, 252, 252, 252, 252, 252, 252, 252,
            252, 253, 253, 253, 253, 253, 253, 253,
  /*  80 */ 253, 253, 253, 253, 253, 253, 253, 253,
            253, 253, 253, 253, 253, 253, 253, 253,
  /*  96 */ 253, 253, 253, 253, 253, 253, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 112 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 128 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 144 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 254, 254, 254, 254, 254, 254,
  /* 160 */ 254, 254, 254, 254, 254, 254, 254, 254,
            254, 254, 255, 255, 255, 255, 255, 255,
  /* 176 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 192 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 208 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 224 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 255,
  /* 240 */ 255, 255, 255, 255, 255, 255, 255, 255,
            255, 255, 255, 255, 255, 255, 255, 256,
};
757
758 const int32_t one_by_x[MAX_NELEM] = {
759 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
760 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
761 #if MAX_RADIUS > 2
762 158, 152, 146, 141, 137, 132, 128, 124, 120, 117, 114, 111, 108,
763 105, 102, 100, 98, 95, 93, 91, 89, 87, 85, 84
764 #endif // MAX_RADIUS > 2
765 };
766
av1_selfguided_restoration_internal(int32_t * dgd,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int r,int eps)767 static void av1_selfguided_restoration_internal(int32_t *dgd, int width,
768 int height, int dgd_stride,
769 int32_t *dst, int dst_stride,
770 int bit_depth, int r, int eps) {
771 const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
772 const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
773 const int num_stride = width_ext;
774 // Adjusting the stride of A and B here appears to avoid bad cache effects,
775 // leading to a significant speed improvement.
776 // We also align the stride to a multiple of 16 bytes, for consistency
777 // with the SIMD version of this function.
778 int buf_stride = ((width_ext + 3) & ~3) + 16;
779 int32_t A_[RESTORATION_PROC_UNIT_PELS];
780 int32_t B_[RESTORATION_PROC_UNIT_PELS];
781 int32_t *A = A_;
782 int32_t *B = B_;
783 int8_t num_[RESTORATION_PROC_UNIT_PELS];
784 int8_t *num = num_ + SGRPROJ_BORDER_VERT * num_stride + SGRPROJ_BORDER_HORZ;
785 int i, j;
786
787 // Don't filter tiles with dimensions < 5 on any axis
788 if ((width < 5) || (height < 5)) return;
789
790 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
791 width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
792 boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
793 width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
794 boxnum(width_ext, height_ext, r, num_, num_stride);
795 assert(r <= 3);
796 A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
797 B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
798 for (i = 0; i < height; ++i) {
799 for (j = 0; j < width; ++j) {
800 const int k = i * buf_stride + j;
801 const int n = num[i * num_stride + j];
802
803 // a < 2^16 * n < 2^22 regardless of bit depth
804 uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
805 // b < 2^8 * n < 2^14 regardless of bit depth
806 uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
807
808 // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
809 // and p itself satisfies p < 2^14 * n^2 < 2^26.
810 // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
811 // This is an artefact of rounding, and can only happen if all pixels
812 // are (almost) identical, so in this case we saturate to p=0.
813 uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
814 uint32_t s = sgrproj_mtable[eps - 1][n - 1];
815
816 // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
817 // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
818 // (this holds even after accounting for the rounding in s)
819 const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
820
821 A[k] = x_by_xplus1[AOMMIN(z, 255)]; // < 2^8
822
823 // SGRPROJ_SGR - A[k] < 2^8, B[k] < 2^(bit_depth) * n,
824 // one_by_x[n - 1] = round(2^12 / n)
825 // => the product here is < 2^(20 + bit_depth) <= 2^32,
826 // and B[k] is set to a value < 2^(8 + bit depth)
827 B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
828 (uint32_t)B[k] *
829 (uint32_t)one_by_x[n - 1],
830 SGRPROJ_RECIP_BITS);
831 }
832 }
833 i = 0;
834 j = 0;
835 {
836 const int k = i * buf_stride + j;
837 const int l = i * dgd_stride + j;
838 const int m = i * dst_stride + j;
839 const int nb = 3;
840 const int32_t a =
841 3 * A[k] + 2 * A[k + 1] + 2 * A[k + buf_stride] + A[k + buf_stride + 1];
842 const int32_t b =
843 3 * B[k] + 2 * B[k + 1] + 2 * B[k + buf_stride] + B[k + buf_stride + 1];
844 const int32_t v = a * dgd[l] + b;
845 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
846 }
847 i = 0;
848 j = width - 1;
849 {
850 const int k = i * buf_stride + j;
851 const int l = i * dgd_stride + j;
852 const int m = i * dst_stride + j;
853 const int nb = 3;
854 const int32_t a =
855 3 * A[k] + 2 * A[k - 1] + 2 * A[k + buf_stride] + A[k + buf_stride - 1];
856 const int32_t b =
857 3 * B[k] + 2 * B[k - 1] + 2 * B[k + buf_stride] + B[k + buf_stride - 1];
858 const int32_t v = a * dgd[l] + b;
859 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
860 }
861 i = height - 1;
862 j = 0;
863 {
864 const int k = i * buf_stride + j;
865 const int l = i * dgd_stride + j;
866 const int m = i * dst_stride + j;
867 const int nb = 3;
868 const int32_t a =
869 3 * A[k] + 2 * A[k + 1] + 2 * A[k - buf_stride] + A[k - buf_stride + 1];
870 const int32_t b =
871 3 * B[k] + 2 * B[k + 1] + 2 * B[k - buf_stride] + B[k - buf_stride + 1];
872 const int32_t v = a * dgd[l] + b;
873 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
874 }
875 i = height - 1;
876 j = width - 1;
877 {
878 const int k = i * buf_stride + j;
879 const int l = i * dgd_stride + j;
880 const int m = i * dst_stride + j;
881 const int nb = 3;
882 const int32_t a =
883 3 * A[k] + 2 * A[k - 1] + 2 * A[k - buf_stride] + A[k - buf_stride - 1];
884 const int32_t b =
885 3 * B[k] + 2 * B[k - 1] + 2 * B[k - buf_stride] + B[k - buf_stride - 1];
886 const int32_t v = a * dgd[l] + b;
887 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
888 }
889 i = 0;
890 for (j = 1; j < width - 1; ++j) {
891 const int k = i * buf_stride + j;
892 const int l = i * dgd_stride + j;
893 const int m = i * dst_stride + j;
894 const int nb = 3;
895 const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k + buf_stride] +
896 A[k + buf_stride - 1] + A[k + buf_stride + 1];
897 const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k + buf_stride] +
898 B[k + buf_stride - 1] + B[k + buf_stride + 1];
899 const int32_t v = a * dgd[l] + b;
900 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
901 }
902 i = height - 1;
903 for (j = 1; j < width - 1; ++j) {
904 const int k = i * buf_stride + j;
905 const int l = i * dgd_stride + j;
906 const int m = i * dst_stride + j;
907 const int nb = 3;
908 const int32_t a = A[k] + 2 * (A[k - 1] + A[k + 1]) + A[k - buf_stride] +
909 A[k - buf_stride - 1] + A[k - buf_stride + 1];
910 const int32_t b = B[k] + 2 * (B[k - 1] + B[k + 1]) + B[k - buf_stride] +
911 B[k - buf_stride - 1] + B[k - buf_stride + 1];
912 const int32_t v = a * dgd[l] + b;
913 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
914 }
915 j = 0;
916 for (i = 1; i < height - 1; ++i) {
917 const int k = i * buf_stride + j;
918 const int l = i * dgd_stride + j;
919 const int m = i * dst_stride + j;
920 const int nb = 3;
921 const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
922 A[k + 1] + A[k - buf_stride + 1] + A[k + buf_stride + 1];
923 const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
924 B[k + 1] + B[k - buf_stride + 1] + B[k + buf_stride + 1];
925 const int32_t v = a * dgd[l] + b;
926 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
927 }
928 j = width - 1;
929 for (i = 1; i < height - 1; ++i) {
930 const int k = i * buf_stride + j;
931 const int l = i * dgd_stride + j;
932 const int m = i * dst_stride + j;
933 const int nb = 3;
934 const int32_t a = A[k] + 2 * (A[k - buf_stride] + A[k + buf_stride]) +
935 A[k - 1] + A[k - buf_stride - 1] + A[k + buf_stride - 1];
936 const int32_t b = B[k] + 2 * (B[k - buf_stride] + B[k + buf_stride]) +
937 B[k - 1] + B[k - buf_stride - 1] + B[k + buf_stride - 1];
938 const int32_t v = a * dgd[l] + b;
939 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
940 }
941 for (i = 1; i < height - 1; ++i) {
942 for (j = 1; j < width - 1; ++j) {
943 const int k = i * buf_stride + j;
944 const int l = i * dgd_stride + j;
945 const int m = i * dst_stride + j;
946 const int nb = 5;
947 const int32_t a =
948 (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
949 4 +
950 (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
951 A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
952 3;
953 const int32_t b =
954 (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
955 4 +
956 (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
957 B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
958 3;
959 const int32_t v = a * dgd[l] + b;
960 dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
961 }
962 }
963 }
964
av1_selfguided_restoration_c(uint8_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int r,int eps)965 void av1_selfguided_restoration_c(uint8_t *dgd, int width, int height,
966 int stride, int32_t *dst, int dst_stride,
967 int r, int eps) {
968 int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
969 const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
970 int32_t *dgd32 =
971 dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
972 int i, j;
973 for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
974 for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
975 dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
976 }
977 }
978 av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
979 dst_stride, 8, r, eps);
980 }
981
av1_highpass_filter_c(uint8_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int corner,int edge)982 void av1_highpass_filter_c(uint8_t *dgd, int width, int height, int stride,
983 int32_t *dst, int dst_stride, int corner, int edge) {
984 int i, j;
985 const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
986
987 i = 0;
988 j = 0;
989 {
990 const int k = i * stride + j;
991 const int l = i * dst_stride + j;
992 dst[l] =
993 center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
994 corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
995 }
996 i = 0;
997 j = width - 1;
998 {
999 const int k = i * stride + j;
1000 const int l = i * dst_stride + j;
1001 dst[l] =
1002 center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
1003 corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
1004 }
1005 i = height - 1;
1006 j = 0;
1007 {
1008 const int k = i * stride + j;
1009 const int l = i * dst_stride + j;
1010 dst[l] =
1011 center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
1012 corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
1013 }
1014 i = height - 1;
1015 j = width - 1;
1016 {
1017 const int k = i * stride + j;
1018 const int l = i * dst_stride + j;
1019 dst[l] =
1020 center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
1021 corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
1022 }
1023 i = 0;
1024 for (j = 1; j < width - 1; ++j) {
1025 const int k = i * stride + j;
1026 const int l = i * dst_stride + j;
1027 dst[l] = center * dgd[k] +
1028 edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
1029 corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
1030 dgd[k + 1]);
1031 }
1032 i = height - 1;
1033 for (j = 1; j < width - 1; ++j) {
1034 const int k = i * stride + j;
1035 const int l = i * dst_stride + j;
1036 dst[l] = center * dgd[k] +
1037 edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
1038 corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
1039 dgd[k + 1]);
1040 }
1041 j = 0;
1042 for (i = 1; i < height - 1; ++i) {
1043 const int k = i * stride + j;
1044 const int l = i * dst_stride + j;
1045 dst[l] = center * dgd[k] +
1046 edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
1047 corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
1048 dgd[k - stride] + dgd[k + stride]);
1049 }
1050 j = width - 1;
1051 for (i = 1; i < height - 1; ++i) {
1052 const int k = i * stride + j;
1053 const int l = i * dst_stride + j;
1054 dst[l] = center * dgd[k] +
1055 edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
1056 corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1057 dgd[k - stride] + dgd[k + stride]);
1058 }
1059 for (i = 1; i < height - 1; ++i) {
1060 for (j = 1; j < width - 1; ++j) {
1061 const int k = i * stride + j;
1062 const int l = i * dst_stride + j;
1063 dst[l] =
1064 center * dgd[k] +
1065 edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
1066 corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1067 dgd[k - stride + 1] + dgd[k + stride + 1]);
1068 }
1069 }
1070 }
1071
apply_selfguided_restoration_c(uint8_t * dat,int width,int height,int stride,int eps,int * xqd,uint8_t * dst,int dst_stride,int32_t * tmpbuf)1072 void apply_selfguided_restoration_c(uint8_t *dat, int width, int height,
1073 int stride, int eps, int *xqd, uint8_t *dst,
1074 int dst_stride, int32_t *tmpbuf) {
1075 int xq[2];
1076 int32_t *flt1 = tmpbuf;
1077 int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
1078 int i, j;
1079 assert(width * height <= RESTORATION_TILEPELS_MAX);
1080 #if USE_HIGHPASS_IN_SGRPROJ
1081 av1_highpass_filter_c(dat, width, height, stride, flt1, width,
1082 sgr_params[eps].corner, sgr_params[eps].edge);
1083 #else
1084 av1_selfguided_restoration_c(dat, width, height, stride, flt1, width,
1085 sgr_params[eps].r1, sgr_params[eps].e1);
1086 #endif // USE_HIGHPASS_IN_SGRPROJ
1087 av1_selfguided_restoration_c(dat, width, height, stride, flt2, width,
1088 sgr_params[eps].r2, sgr_params[eps].e2);
1089 decode_xq(xqd, xq);
1090 for (i = 0; i < height; ++i) {
1091 for (j = 0; j < width; ++j) {
1092 const int k = i * width + j;
1093 const int l = i * stride + j;
1094 const int m = i * dst_stride + j;
1095 const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
1096 const int32_t f1 = (int32_t)flt1[k] - u;
1097 const int32_t f2 = (int32_t)flt2[k] - u;
1098 const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
1099 const int16_t w =
1100 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1101 dst[m] = clip_pixel(w);
1102 }
1103 }
1104 }
1105
// Applies the self-guided filter to one restoration tile of an 8-bit plane.
//
// A tile whose restoration type is RESTORE_NONE is copied with
// loop_copy_tile(). Otherwise the tile area returned by
// av1_get_rest_tile_limits() is walked in processing units and
// apply_selfguided_restoration() is called on each unit with the tile's
// `ep` and `xqd` parameters, using rst->tmpbuf as scratch space.
//
// With CONFIG_STRIPED_LOOP_RESTORATION the height of each band of units is
// whatever setup_processing_stripe_boundary() returns, and
// restore_processing_stripe_boundary() is called once the band is done.
// Without it, bands are rst->rsi->procunit_height rows apart and the last
// one is shortened to end at the tile limit.
static void loop_sgrproj_filter_tile(uint8_t *data, int tile_idx, int width,
                                     int height, int stride,
                                     RestorationInternal *rst, uint8_t *dst,
                                     int dst_stride) {
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile(data, tile_idx, width, height, stride, rst, dst, dst_stride);
    return;
  }

  const int unit_w = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const int unit_h = rst->rsi->procunit_height;
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif

  int row = limits.v_start;
  while (row < limits.v_end) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    const int rows = setup_processing_stripe_boundary(
        row, limits.v_end, limits.h_start, limits.h_end, data, stride, rst, 0);
    const int row_step = rows;
#else
    const int rows = AOMMIN(unit_h, limits.v_end - row);
    // The step is the nominal unit height even when `rows` was shortened.
    const int row_step = unit_h;
#endif

    for (int col = limits.h_start; col < limits.h_end; col += unit_w) {
      const int cols = AOMMIN(unit_w, limits.h_end - col);
      apply_selfguided_restoration(
          data + row * stride + col, cols, rows, stride,
          rst->rsi->sgrproj_info[tile_idx].ep,
          rst->rsi->sgrproj_info[tile_idx].xqd, dst + row * dst_stride + col,
          dst_stride, rst->tmpbuf);
    }

#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(row, limits.v_end, limits.h_start,
                                       limits.h_end, data, stride, rst, 0);
#endif
    row += row_step;
  }
}
1151
// Runs the self-guided filter over a whole 8-bit plane.
// The plane is first passed to extend_frame() with the SGRPROJ border sizes,
// then every restoration tile (rst->ntiles of them) is processed in index
// order by loop_sgrproj_filter_tile().
static void loop_sgrproj_filter(uint8_t *data, int width, int height,
                                int stride, RestorationInternal *rst,
                                uint8_t *dst, int dst_stride) {
  extend_frame(data, width, height, stride, SGRPROJ_BORDER_HORZ,
               SGRPROJ_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_sgrproj_filter_tile(data, tile, width, height, stride, rst, dst,
                             dst_stride);
  }
}
1163
// Restores an 8-bit plane whose tiles each select their own filter.
// The plane is passed to extend_frame() with the RESTORATION border sizes,
// then each tile is dispatched on rst->rsi->restoration_type[tile]:
//   RESTORE_NONE    -> loop_copy_tile()
//   RESTORE_WIENER  -> loop_wiener_filter_tile()
//   RESTORE_SGRPROJ -> loop_sgrproj_filter_tile()
// A tile with any other type is left untouched.
static void loop_switchable_filter(uint8_t *data, int width, int height,
                                   int stride, RestorationInternal *rst,
                                   uint8_t *dst, int dst_stride) {
  extend_frame(data, width, height, stride, RESTORATION_BORDER_HORZ,
               RESTORATION_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    switch (rst->rsi->restoration_type[tile]) {
      case RESTORE_NONE:
        loop_copy_tile(data, tile, width, height, stride, rst, dst,
                       dst_stride);
        break;
      case RESTORE_WIENER:
        loop_wiener_filter_tile(data, tile, width, height, stride, rst, dst,
                                dst_stride);
        break;
      case RESTORE_SGRPROJ:
        loop_sgrproj_filter_tile(data, tile, width, height, stride, rst, dst,
                                 dst_stride);
        break;
      default: break;
    }
  }
}
1183
1184 #if CONFIG_HIGHBITDEPTH
// Pads a 16-bit image in place by replicating its outermost pixels.
//
// `data` points at the top-left pixel of a width x height image with row
// pitch `stride`. Each row is extended by `border_horz` copies of its first
// pixel on the left and of its last pixel on the right. The (already
// extended) first and last rows are then copied `border_vert` times above
// and below the image. The caller must provide writable storage for the
// border around `data`.
void extend_frame_highbd(uint16_t *data, int width, int height, int stride,
                         int border_horz, int border_vert) {
  // Horizontal padding, one image row at a time.
  for (int row = 0; row < height; ++row) {
    uint16_t *const line = data + row * stride;
    for (int k = 1; k <= border_horz; ++k) line[-k] = line[0];
    for (int k = 0; k < border_horz; ++k) line[width + k] = line[width - 1];
  }

  // Vertical padding: whole extended rows are duplicated.
  uint16_t *const first_row = data - border_horz;
  const uint16_t *const last_row = first_row + (height - 1) * stride;
  const size_t row_bytes = (width + 2 * border_horz) * sizeof(uint16_t);

  for (int k = -border_vert; k < 0; ++k) {
    memcpy(first_row + k * stride, first_row, row_bytes);
  }
  for (int k = height; k < height + border_vert; ++k) {
    memcpy(first_row + k * stride, last_row, row_bytes);
  }
}
1204
// Copies one restoration tile of a 16-bit plane from `data` to `dst`
// unchanged. The tile rectangle comes from av1_get_rest_tile_limits(); each
// of its rows is copied with a single memcpy().
static void loop_copy_tile_highbd(uint16_t *data, int tile_idx, int width,
                                  int height, int stride,
                                  RestorationInternal *rst, uint16_t *dst,
                                  int dst_stride) {
#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif

  for (int row = limits.v_start; row < limits.v_end; ++row) {
    const uint16_t *from = data + row * stride + limits.h_start;
    uint16_t *to = dst + row * dst_stride + limits.h_start;
    memcpy(to, from, (limits.h_end - limits.h_start) * sizeof(*dst));
  }
}
1223
// Convolution routine used by loop_wiener_filter_tile_highbd() below; which
// one is picked depends on USE_WIENER_HIGH_INTERMEDIATE_PRECISION.
#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
#define WIENER_CONVOLVE_HIGHBD aom_highbd_convolve8_add_src_hip
#else
#define WIENER_CONVOLVE_HIGHBD aom_highbd_convolve8_add_src
#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION

// Applies the Wiener filter to one restoration tile of a 16-bit plane.
//
// A tile whose restoration type is RESTORE_NONE is copied with
// loop_copy_tile_highbd(). Otherwise the tile area returned by
// av1_get_rest_tile_limits() is walked in processing units. The convolution
// is done in blocks to suit the vectorized convolve functions, but the
// result is equivalent to convolving the whole tile.
//
// Within a unit, the first and last (WIENER_HALFWIN - WIENER_BORDER_VERT)
// rows are convolved one row at a time with a vertical kernel derived by
// stepdown_wiener_kernel(); all rows in between use the tile's vertical
// filter directly. The horizontal filter is always the tile's own.
//
// With CONFIG_STRIPED_LOOP_RESTORATION the band height is the value returned
// by setup_processing_stripe_boundary(), rounded up with
// ALIGN_POWER_OF_TWO(h, 1), and restore_processing_stripe_boundary() is
// called after each band. Without it, unit sizes are the remaining extent
// rounded up to a multiple of 16, capped at the nominal unit size.
static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
                                           int width, int height, int stride,
                                           RestorationInternal *rst,
                                           int bit_depth, uint16_t *dst,
                                           int dst_stride) {
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
                          dst_stride);
    return;
  }

  const int unit_w = rst->rsi->procunit_width;
  // Number of rows at each vertical end of a unit that are handled row by
  // row with a stepped-down kernel.
  const int edge_rows = WIENER_HALFWIN - WIENER_BORDER_VERT;
#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const int unit_h = rst->rsi->procunit_height;
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif
  InterpKernel edge_vfilter;

  int row = limits.v_start;
  while (row < limits.v_end) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    const int stripe_rows = setup_processing_stripe_boundary(
        row, limits.v_end, limits.h_start, limits.h_end, (uint8_t *)data,
        stride, rst, 1);
    const int rows = ALIGN_POWER_OF_TWO(stripe_rows, 1);
    const int row_step = rows;
#else
    const int rows = AOMMIN(unit_h, (limits.v_end - row + 15) & ~15);
    const int row_step = unit_h;
#endif
    // Rows left for the full-kernel pass (the original code notes that the
    // unit height is at least 16 here).
    const int body_rows = rows - 2 * edge_rows;

    for (int col = limits.h_start; col < limits.h_end; col += unit_w) {
      const int cols = AOMMIN(unit_w, (limits.h_end - col + 15) & ~15);
      const uint16_t *src = data + row * stride + col;
      uint16_t *out = dst + row * dst_stride + col;

      // Leading rows: one row per stepped-down kernel, border value
      // counting up from WIENER_BORDER_VERT.
      for (int border = WIENER_BORDER_VERT; border < WIENER_HALFWIN;
           ++border) {
        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
                               edge_vfilter, border, 1);
        WIENER_CONVOLVE_HIGHBD(CONVERT_TO_BYTEPTR(src), stride,
                               CONVERT_TO_BYTEPTR(out), dst_stride,
                               rst->rsi->wiener_info[tile_idx].hfilter, 16,
                               edge_vfilter, 16, cols, 1, bit_depth);
        src += stride;
        out += dst_stride;
      }

      // Middle rows: the tile's own vertical filter.
      WIENER_CONVOLVE_HIGHBD(CONVERT_TO_BYTEPTR(src), stride,
                             CONVERT_TO_BYTEPTR(out), dst_stride,
                             rst->rsi->wiener_info[tile_idx].hfilter, 16,
                             rst->rsi->wiener_info[tile_idx].vfilter, 16, cols,
                             body_rows, bit_depth);
      src += stride * body_rows;
      out += dst_stride * body_rows;

      // Trailing rows: same as the leading rows, but with the border value
      // counting back down and the last stepdown argument set to 0.
      for (int border = WIENER_HALFWIN - 1; border >= WIENER_BORDER_VERT;
           --border) {
        stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
                               edge_vfilter, border, 0);
        WIENER_CONVOLVE_HIGHBD(CONVERT_TO_BYTEPTR(src), stride,
                               CONVERT_TO_BYTEPTR(out), dst_stride,
                               rst->rsi->wiener_info[tile_idx].hfilter, 16,
                               edge_vfilter, 16, cols, 1, bit_depth);
        src += stride;
        out += dst_stride;
      }
    }

#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(row, limits.v_end, limits.h_start,
                                       limits.h_end, (uint8_t *)data, stride,
                                       rst, 1);
#endif
    row += row_step;
  }
}

#undef WIENER_CONVOLVE_HIGHBD
1326
// Runs the Wiener filter over a whole high bit depth plane.
// `data8` and `dst8` are converted with CONVERT_TO_SHORTPTR() to 16-bit
// sample pointers. The source plane is passed to extend_frame_highbd() with
// the WIENER border sizes, then every restoration tile is processed in index
// order by loop_wiener_filter_tile_highbd().
static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
                                      int stride, RestorationInternal *rst,
                                      int bit_depth, uint8_t *dst8,
                                      int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  extend_frame_highbd(src16, width, height, stride, WIENER_BORDER_HORZ,
                      WIENER_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_wiener_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                   bit_depth, dst16, dst_stride);
  }
}
1341
av1_selfguided_restoration_highbd_c(uint16_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int bit_depth,int r,int eps)1342 void av1_selfguided_restoration_highbd_c(uint16_t *dgd, int width, int height,
1343 int stride, int32_t *dst,
1344 int dst_stride, int bit_depth, int r,
1345 int eps) {
1346 int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
1347 const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
1348 int32_t *dgd32 =
1349 dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
1350 int i, j;
1351 for (i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
1352 for (j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
1353 dgd32[i * dgd32_stride + j] = dgd[i * stride + j];
1354 }
1355 }
1356 av1_selfguided_restoration_internal(dgd32, width, height, dgd32_stride, dst,
1357 dst_stride, bit_depth, r, eps);
1358 }
1359
av1_highpass_filter_highbd_c(uint16_t * dgd,int width,int height,int stride,int32_t * dst,int dst_stride,int corner,int edge)1360 void av1_highpass_filter_highbd_c(uint16_t *dgd, int width, int height,
1361 int stride, int32_t *dst, int dst_stride,
1362 int corner, int edge) {
1363 int i, j;
1364 const int center = (1 << SGRPROJ_RST_BITS) - 4 * (corner + edge);
1365
1366 i = 0;
1367 j = 0;
1368 {
1369 const int k = i * stride + j;
1370 const int l = i * dst_stride + j;
1371 dst[l] =
1372 center * dgd[k] + edge * (dgd[k + 1] + dgd[k + stride] + dgd[k] * 2) +
1373 corner * (dgd[k + stride + 1] + dgd[k + 1] + dgd[k + stride] + dgd[k]);
1374 }
1375 i = 0;
1376 j = width - 1;
1377 {
1378 const int k = i * stride + j;
1379 const int l = i * dst_stride + j;
1380 dst[l] =
1381 center * dgd[k] + edge * (dgd[k - 1] + dgd[k + stride] + dgd[k] * 2) +
1382 corner * (dgd[k + stride - 1] + dgd[k - 1] + dgd[k + stride] + dgd[k]);
1383 }
1384 i = height - 1;
1385 j = 0;
1386 {
1387 const int k = i * stride + j;
1388 const int l = i * dst_stride + j;
1389 dst[l] =
1390 center * dgd[k] + edge * (dgd[k + 1] + dgd[k - stride] + dgd[k] * 2) +
1391 corner * (dgd[k - stride + 1] + dgd[k + 1] + dgd[k - stride] + dgd[k]);
1392 }
1393 i = height - 1;
1394 j = width - 1;
1395 {
1396 const int k = i * stride + j;
1397 const int l = i * dst_stride + j;
1398 dst[l] =
1399 center * dgd[k] + edge * (dgd[k - 1] + dgd[k - stride] + dgd[k] * 2) +
1400 corner * (dgd[k - stride - 1] + dgd[k - 1] + dgd[k - stride] + dgd[k]);
1401 }
1402 i = 0;
1403 for (j = 1; j < width - 1; ++j) {
1404 const int k = i * stride + j;
1405 const int l = i * dst_stride + j;
1406 dst[l] = center * dgd[k] +
1407 edge * (dgd[k - 1] + dgd[k + stride] + dgd[k + 1] + dgd[k]) +
1408 corner * (dgd[k + stride - 1] + dgd[k + stride + 1] + dgd[k - 1] +
1409 dgd[k + 1]);
1410 }
1411 i = height - 1;
1412 for (j = 1; j < width - 1; ++j) {
1413 const int k = i * stride + j;
1414 const int l = i * dst_stride + j;
1415 dst[l] = center * dgd[k] +
1416 edge * (dgd[k - 1] + dgd[k - stride] + dgd[k + 1] + dgd[k]) +
1417 corner * (dgd[k - stride - 1] + dgd[k - stride + 1] + dgd[k - 1] +
1418 dgd[k + 1]);
1419 }
1420 j = 0;
1421 for (i = 1; i < height - 1; ++i) {
1422 const int k = i * stride + j;
1423 const int l = i * dst_stride + j;
1424 dst[l] = center * dgd[k] +
1425 edge * (dgd[k - stride] + dgd[k + 1] + dgd[k + stride] + dgd[k]) +
1426 corner * (dgd[k + stride + 1] + dgd[k - stride + 1] +
1427 dgd[k - stride] + dgd[k + stride]);
1428 }
1429 j = width - 1;
1430 for (i = 1; i < height - 1; ++i) {
1431 const int k = i * stride + j;
1432 const int l = i * dst_stride + j;
1433 dst[l] = center * dgd[k] +
1434 edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k]) +
1435 corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1436 dgd[k - stride] + dgd[k + stride]);
1437 }
1438 for (i = 1; i < height - 1; ++i) {
1439 for (j = 1; j < width - 1; ++j) {
1440 const int k = i * stride + j;
1441 const int l = i * dst_stride + j;
1442 dst[l] =
1443 center * dgd[k] +
1444 edge * (dgd[k - stride] + dgd[k - 1] + dgd[k + stride] + dgd[k + 1]) +
1445 corner * (dgd[k + stride - 1] + dgd[k - stride - 1] +
1446 dgd[k - stride + 1] + dgd[k + stride + 1]);
1447 }
1448 }
1449 }
1450
apply_selfguided_restoration_highbd_c(uint16_t * dat,int width,int height,int stride,int bit_depth,int eps,int * xqd,uint16_t * dst,int dst_stride,int32_t * tmpbuf)1451 void apply_selfguided_restoration_highbd_c(uint16_t *dat, int width, int height,
1452 int stride, int bit_depth, int eps,
1453 int *xqd, uint16_t *dst,
1454 int dst_stride, int32_t *tmpbuf) {
1455 int xq[2];
1456 int32_t *flt1 = tmpbuf;
1457 int32_t *flt2 = flt1 + RESTORATION_TILEPELS_MAX;
1458 int i, j;
1459 assert(width * height <= RESTORATION_TILEPELS_MAX);
1460 #if USE_HIGHPASS_IN_SGRPROJ
1461 av1_highpass_filter_highbd_c(dat, width, height, stride, flt1, width,
1462 sgr_params[eps].corner, sgr_params[eps].edge);
1463 #else
1464 av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt1, width,
1465 bit_depth, sgr_params[eps].r1,
1466 sgr_params[eps].e1);
1467 #endif // USE_HIGHPASS_IN_SGRPROJ
1468 av1_selfguided_restoration_highbd_c(dat, width, height, stride, flt2, width,
1469 bit_depth, sgr_params[eps].r2,
1470 sgr_params[eps].e2);
1471 decode_xq(xqd, xq);
1472 for (i = 0; i < height; ++i) {
1473 for (j = 0; j < width; ++j) {
1474 const int k = i * width + j;
1475 const int l = i * stride + j;
1476 const int m = i * dst_stride + j;
1477 const int32_t u = ((int32_t)dat[l] << SGRPROJ_RST_BITS);
1478 const int32_t f1 = (int32_t)flt1[k] - u;
1479 const int32_t f2 = (int32_t)flt2[k] - u;
1480 const int32_t v = xq[0] * f1 + xq[1] * f2 + (u << SGRPROJ_PRJ_BITS);
1481 const int16_t w =
1482 (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1483 dst[m] = (uint16_t)clip_pixel_highbd(w, bit_depth);
1484 }
1485 }
1486 }
1487
// Applies the self-guided filter to one restoration tile of a 16-bit plane.
//
// A tile whose restoration type is RESTORE_NONE is copied with
// loop_copy_tile_highbd(). Otherwise the tile area returned by
// av1_get_rest_tile_limits() is walked in processing units and
// apply_selfguided_restoration_highbd() is called on each unit with the
// tile's `ep` and `xqd` parameters, using rst->tmpbuf as scratch space.
//
// With CONFIG_STRIPED_LOOP_RESTORATION the height of each band of units is
// whatever setup_processing_stripe_boundary() returns, and
// restore_processing_stripe_boundary() is called once the band is done.
// Without it, bands are rst->rsi->procunit_height rows apart and the last
// one is shortened to end at the tile limit.
static void loop_sgrproj_filter_tile_highbd(uint16_t *data, int tile_idx,
                                            int width, int height, int stride,
                                            RestorationInternal *rst,
                                            int bit_depth, uint16_t *dst,
                                            int dst_stride) {
  if (rst->rsi->restoration_type[tile_idx] == RESTORE_NONE) {
    loop_copy_tile_highbd(data, tile_idx, width, height, stride, rst, dst,
                          dst_stride);
    return;
  }

  const int unit_w = rst->rsi->procunit_width;
#if CONFIG_STRIPED_LOOP_RESTORATION
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height, rst->subsampling_y);
#else
  const int unit_h = rst->rsi->procunit_height;
  const RestorationTileLimits limits = av1_get_rest_tile_limits(
      tile_idx, rst->nhtiles, rst->nvtiles, rst->tile_width, rst->tile_height,
      width, height);
#endif

  int row = limits.v_start;
  while (row < limits.v_end) {
#if CONFIG_STRIPED_LOOP_RESTORATION
    const int rows = setup_processing_stripe_boundary(
        row, limits.v_end, limits.h_start, limits.h_end, (uint8_t *)data,
        stride, rst, 1);
    const int row_step = rows;
#else
    const int rows = AOMMIN(unit_h, limits.v_end - row);
    // The step is the nominal unit height even when `rows` was shortened.
    const int row_step = unit_h;
#endif

    for (int col = limits.h_start; col < limits.h_end; col += unit_w) {
      const int cols = AOMMIN(unit_w, limits.h_end - col);
      apply_selfguided_restoration_highbd(
          data + row * stride + col, cols, rows, stride, bit_depth,
          rst->rsi->sgrproj_info[tile_idx].ep,
          rst->rsi->sgrproj_info[tile_idx].xqd, dst + row * dst_stride + col,
          dst_stride, rst->tmpbuf);
    }

#if CONFIG_STRIPED_LOOP_RESTORATION
    restore_processing_stripe_boundary(row, limits.v_end, limits.h_start,
                                       limits.h_end, (uint8_t *)data, stride,
                                       rst, 1);
#endif
    row += row_step;
  }
}
1538
// Runs the self-guided filter over a whole high bit depth plane.
// `data8` and `dst8` are converted with CONVERT_TO_SHORTPTR() to 16-bit
// sample pointers. The source plane is passed to extend_frame_highbd() with
// the SGRPROJ border sizes, then every restoration tile is processed in
// index order by loop_sgrproj_filter_tile_highbd().
static void loop_sgrproj_filter_highbd(uint8_t *data8, int width, int height,
                                       int stride, RestorationInternal *rst,
                                       int bit_depth, uint8_t *dst8,
                                       int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  extend_frame_highbd(src16, width, height, stride, SGRPROJ_BORDER_HORZ,
                      SGRPROJ_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    loop_sgrproj_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                    bit_depth, dst16, dst_stride);
  }
}
1553
// Restores a high bit depth plane whose tiles each select their own filter.
// `data8` and `dst8` are converted with CONVERT_TO_SHORTPTR() to 16-bit
// sample pointers and the source plane is passed to extend_frame_highbd()
// with the RESTORATION border sizes. Each tile is then dispatched on
// rst->rsi->restoration_type[tile]:
//   RESTORE_NONE    -> loop_copy_tile_highbd()
//   RESTORE_WIENER  -> loop_wiener_filter_tile_highbd()
//   RESTORE_SGRPROJ -> loop_sgrproj_filter_tile_highbd()
// A tile with any other type is left untouched.
static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
                                          int stride, RestorationInternal *rst,
                                          int bit_depth, uint8_t *dst8,
                                          int dst_stride) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(data8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  extend_frame_highbd(src16, width, height, stride, RESTORATION_BORDER_HORZ,
                      RESTORATION_BORDER_VERT);

  for (int tile = 0; tile < rst->ntiles; ++tile) {
    switch (rst->rsi->restoration_type[tile]) {
      case RESTORE_NONE:
        loop_copy_tile_highbd(src16, tile, width, height, stride, rst, dst16,
                              dst_stride);
        break;
      case RESTORE_WIENER:
        loop_wiener_filter_tile_highbd(src16, tile, width, height, stride, rst,
                                       bit_depth, dst16, dst_stride);
        break;
      case RESTORE_SGRPROJ:
        loop_sgrproj_filter_tile_highbd(src16, tile, width, height, stride,
                                        rst, bit_depth, dst16, dst_stride);
        break;
      default: break;
    }
  }
}
1576 #endif // CONFIG_HIGHBITDEPTH
1577
// Runs the frame-level restoration filter of one plane over the rows
// [row_start, row_end).
//
//   cm            common state; cm->rst_internal is (re)configured here for
//                 this plane (tile layout, rsi pointer and, when striped loop
//                 restoration is compiled in, component / subsampling_y).
//   plane_rsi     restoration info of the plane; its frame_restoration_type
//                 selects the filter and must not be RESTORE_NONE (that slot
//                 of the dispatch tables is NULL).
//   plane         AOM_PLANE_Y, AOM_PLANE_U or AOM_PLANE_V.
//   plane_width / plane_height
//                 cropped plane dimensions used to derive the tile layout.
//   src, out      first sample of the source / destination plane.
//   src_stride, out_stride
//                 strides of the two planes.
static void loop_restoration_plane_rows(AV1_COMMON *cm,
                                        RestorationInfo *plane_rsi, int plane,
                                        int plane_width, int plane_height,
                                        uint8_t *src, int src_stride,
                                        uint8_t *out, int out_stride,
                                        int row_start, int row_end) {
  restore_func_type restore_funcs[RESTORE_TYPES] = {
    NULL, loop_wiener_filter, loop_sgrproj_filter, loop_switchable_filter
  };
#if CONFIG_HIGHBITDEPTH
  restore_func_highbd_type restore_funcs_highbd[RESTORE_TYPES] = {
    NULL, loop_wiener_filter_highbd, loop_sgrproj_filter_highbd,
    loop_switchable_filter_highbd
  };
#endif  // CONFIG_HIGHBITDEPTH
  RestorationInternal *const rst = &cm->rst_internal;

  rst->ntiles = av1_get_rest_ntiles(
      plane_width, plane_height, cm->rst_info[plane].restoration_tilesize,
      &rst->tile_width, &rst->tile_height, &rst->nhtiles, &rst->nvtiles);
  rst->rsi = plane_rsi;
#if CONFIG_STRIPED_LOOP_RESTORATION
  rst->component = plane;
  rst->subsampling_y = (plane == AOM_PLANE_Y) ? 0 : cm->subsampling_y;
#endif

  // Both pointers are advanced to the first row that is filtered.
  src += row_start * src_stride;
  out += row_start * out_stride;

#if CONFIG_HIGHBITDEPTH
  if (cm->use_highbitdepth) {
    restore_funcs_highbd[rst->rsi->frame_restoration_type](
        src, plane_width, row_end - row_start, src_stride, rst, cm->bit_depth,
        out, out_stride);
    return;
  }
#endif  // CONFIG_HIGHBITDEPTH
  restore_funcs[rst->rsi->frame_restoration_type](
      src, plane_width, row_end - row_start, src_stride, rst, out, out_stride);
}

// Applies loop restoration to the selected planes of `frame` for the MI rows
// [start_mi_row, end_mi_row).
//
//   frame               source frame.
//   cm                  common state (subsampling, bit depth, rst_internal,
//                       rst_info tile sizes, error context).
//   start_mi_row / end_mi_row
//                       row range in MI units; the end is clamped to the
//                       cropped plane height.
//   components_pattern  bit mask of (1 << AOM_PLANE_*) naming the planes to
//                       process.
//   rsi                 per-plane restoration info, indexed by plane.
//   dst                 destination frame, or NULL to filter in place via a
//                       temporary buffer that is copied back into `frame`.
//
// Planes that are selected but whose frame_restoration_type is RESTORE_NONE
// are copied from `frame` to the destination unchanged.
static void loop_restoration_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                  int start_mi_row, int end_mi_row,
                                  int components_pattern, RestorationInfo *rsi,
                                  YV12_BUFFER_CONFIG *dst) {
  const int ywidth = frame->y_crop_width;
  const int yheight = frame->y_crop_height;
  const int uvwidth = frame->uv_crop_width;
  const int uvheight = frame->uv_crop_height;
  const int ystride = frame->y_stride;
  const int uvstride = frame->uv_stride;
  const int ystart = start_mi_row << MI_SIZE_LOG2;
  const int uvstart = ystart >> cm->subsampling_y;
  int yend = end_mi_row << MI_SIZE_LOG2;
  int uvend = yend >> cm->subsampling_y;
  YV12_BUFFER_CONFIG dst_;

  yend = AOMMIN(yend, yheight);
  uvend = AOMMIN(uvend, uvheight);

  // Fast exits: when every requested plane is RESTORE_NONE there is nothing
  // to filter, so at most a plain copy into the caller's dst is needed.
  if (components_pattern == (1 << AOM_PLANE_Y)) {
    if (rsi[AOM_PLANE_Y].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_y(frame, dst);
      return;
    }
  } else if (components_pattern == (1 << AOM_PLANE_U)) {
    if (rsi[AOM_PLANE_U].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_u(frame, dst);
      return;
    }
  } else if (components_pattern == (1 << AOM_PLANE_V)) {
    if (rsi[AOM_PLANE_V].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_v(frame, dst);
      return;
    }
  } else if (components_pattern ==
             ((1 << AOM_PLANE_Y) | (1 << AOM_PLANE_U) | (1 << AOM_PLANE_V))) {
    if (rsi[AOM_PLANE_Y].frame_restoration_type == RESTORE_NONE &&
        rsi[AOM_PLANE_U].frame_restoration_type == RESTORE_NONE &&
        rsi[AOM_PLANE_V].frame_restoration_type == RESTORE_NONE) {
      if (dst) aom_yv12_copy_frame(frame, dst);
      return;
    }
  }

  // No destination supplied: filter into a scratch frame and copy back below.
  if (!dst) {
    dst = &dst_;
    memset(dst, 0, sizeof(YV12_BUFFER_CONFIG));
    if (aom_realloc_frame_buffer(
            dst, ywidth, yheight, cm->subsampling_x, cm->subsampling_y,
#if CONFIG_HIGHBITDEPTH
            cm->use_highbitdepth,
#endif
            AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL) < 0) {
      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                         "Failed to allocate restoration dst buffer");
      // If the error handler returns instead of unwinding, the scratch frame
      // has no usable planes; stop rather than dereference them.
      return;
    }
  }

  if ((components_pattern >> AOM_PLANE_Y) & 1) {
    if (rsi[AOM_PLANE_Y].frame_restoration_type != RESTORE_NONE) {
      loop_restoration_plane_rows(cm, &rsi[AOM_PLANE_Y], AOM_PLANE_Y, ywidth,
                                  yheight, frame->y_buffer, ystride,
                                  dst->y_buffer, dst->y_stride, ystart, yend);
    } else {
      aom_yv12_copy_y(frame, dst);
    }
  }

  if ((components_pattern >> AOM_PLANE_U) & 1) {
    if (rsi[AOM_PLANE_U].frame_restoration_type != RESTORE_NONE) {
      loop_restoration_plane_rows(cm, &rsi[AOM_PLANE_U], AOM_PLANE_U, uvwidth,
                                  uvheight, frame->u_buffer, uvstride,
                                  dst->u_buffer, dst->uv_stride, uvstart,
                                  uvend);
    } else {
      aom_yv12_copy_u(frame, dst);
    }
  }

  if ((components_pattern >> AOM_PLANE_V) & 1) {
    if (rsi[AOM_PLANE_V].frame_restoration_type != RESTORE_NONE) {
      loop_restoration_plane_rows(cm, &rsi[AOM_PLANE_V], AOM_PLANE_V, uvwidth,
                                  uvheight, frame->v_buffer, uvstride,
                                  dst->v_buffer, dst->uv_stride, uvstart,
                                  uvend);
    } else {
      aom_yv12_copy_v(frame, dst);
    }
  }

  if (dst == &dst_) {
    // NOTE(review): whole planes are copied back although only the rows
    // [start, end) were written into the scratch frame. This looks safe only
    // when the full frame is filtered -- confirm no caller combines
    // dst == NULL with a partial row range.
    if ((components_pattern >> AOM_PLANE_Y) & 1) aom_yv12_copy_y(dst, frame);
    if ((components_pattern >> AOM_PLANE_U) & 1) aom_yv12_copy_u(dst, frame);
    if ((components_pattern >> AOM_PLANE_V) & 1) aom_yv12_copy_v(dst, frame);
    aom_free_frame_buffer(dst);
  }
}
1751
// Frame-level entry point for loop restoration.
//
// Works out which MI rows to filter, initialises cm->rst_internal through
// loop_restoration_init() (passing whether the frame is a key frame) and then
// delegates to loop_restoration_rows().
//
//   partial_frame  when non-zero and the frame has more than 8 MI rows, only
//                  a band is filtered: it starts at half the row count
//                  rounded down to a multiple of 8 and spans
//                  max(rows / 8, 8) MI rows. Otherwise all rows are filtered.
//   frame, rsi, components_pattern, dst
//                  forwarded unchanged to loop_restoration_rows().
void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                RestorationInfo *rsi, int components_pattern,
                                int partial_frame, YV12_BUFFER_CONFIG *dst) {
#if CONFIG_FRAME_SUPERRES
  int rows_to_filter =
      ALIGN_POWER_OF_TWO(cm->superres_upscaled_height, 3) >> MI_SIZE_LOG2;
#else
  int rows_to_filter = cm->mi_rows;
#endif  // CONFIG_FRAME_SUPERRES
  int first_mi_row = 0;

  if (partial_frame && rows_to_filter > 8) {
    // Half-way down the frame, with the low three bits cleared.
    first_mi_row = (rows_to_filter >> 1) & ~7;
    rows_to_filter = AOMMAX(rows_to_filter / 8, 8);
  }

  loop_restoration_init(&cm->rst_internal, cm->frame_type == KEY_FRAME);
  loop_restoration_rows(frame, cm, first_mi_row, first_mi_row + rows_to_filter,
                        components_pattern, rsi, dst);
}
1773
// Computes the half-open range of restoration tiles,
// [*rcol0, *rcol1) x [*rrow0, *rrow1), whose top-left corner lies inside the
// superblock at (mi_row, mi_col) for the given plane.
//
// All five output pointers must be non-NULL. *nhtiles receives the number of
// restoration tiles per row as reported by av1_get_rest_ntiles().
//
// Returns 0 immediately (without writing any output) when bsize is not the
// frame's superblock size; otherwise returns non-zero iff the tile range is
// non-empty in both directions.
int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
                                       int *rcol0, int *rcol1, int *rrow0,
                                       int *rrow1, int *nhtiles) {
  assert(rcol0 && rcol1 && rrow0 && rrow1 && nhtiles);

  if (bsize != cm->sb_size) return 0;

#if CONFIG_FRAME_SUPERRES
  const int luma_w = cm->superres_upscaled_width;
  const int luma_h = cm->superres_upscaled_height;
  const int px_per_mi = MI_SIZE * SCALE_NUMERATOR;
  const int scale_den = cm->superres_scale_denominator;
#else
  const int luma_w = cm->width;
  const int luma_h = cm->height;
  const int px_per_mi = MI_SIZE;
  const int scale_den = 1;
#endif  // CONFIG_FRAME_SUPERRES

  // Subsampling only applies to the chroma planes.
  const int sub_x = plane > 0 && cm->subsampling_x != 0;
  const int sub_y = plane > 0 && cm->subsampling_y != 0;

  const int plane_w = (luma_w + sub_x) >> sub_x;
  const int plane_h = (luma_h + sub_y) >> sub_y;

  int tile_w, tile_h, tiles_down;
  av1_get_rest_ntiles(plane_w, plane_h,
                      cm->rst_info[plane].restoration_tilesize, &tile_w,
                      &tile_h, nhtiles, &tiles_down);

  // Tile extent expressed in the same scaled units as mi * px_per_mi.
  const int span_w = tile_w * scale_den;
  const int span_h = tile_h * scale_den;

  // First tile column/row that does not start left of / above the superblock:
  // a ceiling division (a superblock starting at tile position 10.1 owns
  // tile 11 onwards).
  *rcol0 = (mi_col * px_per_mi + (span_w - 1)) / span_w;
  *rrow0 = (mi_row * px_per_mi + (span_h - 1)) / span_h;

  // The end of the range is the same computation applied to the superblock
  // to the right / below, with two adjustments:
  //  - the result is capped at the tile count, because a fractional trailing
  //    tile is merged into the one before it;
  //  - when the next superblock is past the frame edge the range is forced to
  //    the tile count, which covers frames whose superblock-aligned size is
  //    smaller than tile count * tile size.
  const int next_mi_row = mi_row + mi_size_high[bsize];
  const int next_mi_col = mi_col + mi_size_wide[bsize];

  if (next_mi_col >= cm->mi_cols) {
    *rcol1 = *nhtiles;
  } else {
    const int col_end = (next_mi_col * px_per_mi + (span_w - 1)) / span_w;
    *rcol1 = (*nhtiles < col_end) ? *nhtiles : col_end;
  }

  if (next_mi_row >= cm->mi_rows) {
    *rrow1 = tiles_down;
  } else {
    const int row_end = (next_mi_row * px_per_mi + (span_h - 1)) / span_h;
    *rrow1 = (tiles_down < row_end) ? tiles_down : row_end;
  }

  return (*rcol0 < *rcol1) && (*rrow0 < *rrow1);
}
1840
1841 #if CONFIG_STRIPED_LOOP_RESTORATION
1842
// Replicates the edge samples of one row outwards on both sides.
//
//   buf               first sample of the row; addressed as uint16_t samples
//                     when use_highbitdepth is non-zero, as bytes otherwise.
//   width             number of samples in the row.
//   extend            number of samples to write on each side.
//   use_highbitdepth  selects the sample size (see buf).
//
// The `extend` samples before buf take the value of sample 0 and the `extend`
// samples after the row take the value of sample width - 1. The caller must
// provide that much writable room on both sides.
static void extend_line(uint8_t *buf, int width, int extend,
                        int use_highbitdepth) {
  if (!use_highbitdepth) {
    const uint8_t left = buf[0];
    for (int k = 1; k <= extend; ++k) buf[-k] = left;

    const uint8_t right = buf[width - 1];
    for (int k = 0; k < extend; ++k) buf[width + k] = right;
    return;
  }

  uint16_t *const row16 = (uint16_t *)buf;

  const uint16_t left16 = row16[0];
  for (int k = 1; k <= extend; ++k) row16[-k] = left16;

  const uint16_t right16 = row16[width - 1];
  for (int k = 0; k < extend; ++k) row16[width + k] = right16;
}
1861
1862 // For each 64 pixel high stripe, save 4 scan lines to be used as boundary in
1863 // the loop restoration process. The lines are saved in
1864 // rst_internal.stripe_boundary_lines
av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG * frame,AV1_COMMON * cm)1865 void av1_loop_restoration_save_boundary_lines(YV12_BUFFER_CONFIG *frame,
1866 AV1_COMMON *cm) {
1867 int p, boundary_stride;
1868 int src_width, src_height, src_stride, stripe_height, stripe_offset, stripe_y,
1869 yy;
1870 uint8_t *src_buf, *boundary_below_buf, *boundary_above_buf;
1871 int use_highbitdepth = 0;
1872 for (p = 0; p < MAX_MB_PLANE; ++p) {
1873 if (p == 0) {
1874 src_buf = frame->y_buffer;
1875 src_width = frame->y_crop_width;
1876 src_height = frame->y_crop_height;
1877 src_stride = frame->y_stride;
1878 stripe_height = 64;
1879 stripe_offset = 56 - 2; // offset of first line to copy
1880 } else {
1881 src_buf = p == 1 ? frame->u_buffer : frame->v_buffer;
1882 src_width = frame->uv_crop_width;
1883 src_height = frame->uv_crop_height;
1884 src_stride = frame->uv_stride;
1885 stripe_height = 64 >> cm->subsampling_y;
1886 stripe_offset = (56 >> cm->subsampling_y) - 2;
1887 }
1888 boundary_above_buf = cm->rst_internal.stripe_boundary_above[p];
1889 boundary_below_buf = cm->rst_internal.stripe_boundary_below[p];
1890 boundary_stride = cm->rst_internal.stripe_boundary_stride[p];
1891 #if CONFIG_HIGHBITDEPTH
1892 use_highbitdepth = cm->use_highbitdepth;
1893 if (use_highbitdepth) {
1894 src_buf = (uint8_t *)CONVERT_TO_SHORTPTR(src_buf);
1895 }
1896 #endif
1897 src_buf += (stripe_offset * src_stride) << use_highbitdepth;
1898 boundary_above_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
1899 boundary_below_buf += RESTORATION_EXTRA_HORZ << use_highbitdepth;
1900 // Loop over stripes
1901 for (stripe_y = stripe_offset; stripe_y < src_height;
1902 stripe_y += stripe_height) {
1903 // Save 2 lines above the LR stripe (offset -9, -10)
1904 for (yy = 0; yy < 2; yy++) {
1905 if (stripe_y + yy < src_height) {
1906 memcpy(boundary_above_buf, src_buf, src_width << use_highbitdepth);
1907 extend_line(boundary_above_buf, src_width, RESTORATION_EXTRA_HORZ,
1908 use_highbitdepth);
1909 src_buf += src_stride << use_highbitdepth;
1910 boundary_above_buf += boundary_stride << use_highbitdepth;
1911 }
1912 }
1913 // Save 2 lines below the LR stripe (offset 56,57)
1914 for (yy = 2; yy < 4; yy++) {
1915 if (stripe_y + yy < src_height) {
1916 memcpy(boundary_below_buf, src_buf, src_width << use_highbitdepth);
1917 extend_line(boundary_below_buf, src_width, RESTORATION_EXTRA_HORZ,
1918 use_highbitdepth);
1919 src_buf += src_stride << use_highbitdepth;
1920 boundary_below_buf += boundary_stride << use_highbitdepth;
1921 }
1922 }
1923 // jump to next stripe
1924 src_buf += ((stripe_height - 4) * src_stride) << use_highbitdepth;
1925 }
1926 }
1927 }
1928
1929 #endif // CONFIG_STRIPED_LOOP_RESTORATION
1930