1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <math.h>
13 #include <limits.h>
14
15 #include "config/aom_config.h"
16
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_dsp/odintrin.h"
19 #include "aom_mem/aom_mem.h"
20 #include "aom_ports/aom_timer.h"
21 #include "aom_ports/mem.h"
22 #include "aom_scale/aom_scale.h"
23 #include "av1/common/alloccommon.h"
24 #include "av1/common/av1_common_int.h"
25 #include "av1/common/quant_common.h"
26 #include "av1/common/reconinter.h"
27 #include "av1/encoder/av1_quantize.h"
28 #include "av1/encoder/encodeframe.h"
29 #include "av1/encoder/encoder.h"
30 #include "av1/encoder/ethread.h"
31 #include "av1/encoder/extend.h"
32 #include "av1/encoder/firstpass.h"
33 #include "av1/encoder/mcomp.h"
34 #include "av1/encoder/ratectrl.h"
35 #include "av1/encoder/reconinter_enc.h"
36 #include "av1/encoder/segmentation.h"
37 #include "av1/encoder/temporal_filter.h"
38
39 /*!\cond */
40
41 // NOTE: All `tf` in this file means `temporal filtering`.
42
43 // Forward Declaration.
44 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
45 MV *subblock_mvs, int *subblock_mses);
46
47 /*!\endcond */
48 /*!\brief Does motion search for blocks in temporal filtering. This is
49 * the first step for temporal filtering. More specifically, given a frame to
50 * be filtered and another frame as reference, this function searches the
51 * reference frame to find out the most similar block as that from the frame
52 * to be filtered. This found block will be further used for weighted
53 * averaging.
54 *
 * NOTE: Besides doing motion search for the entire block, this function will
 *       also do motion search for each 1/4 sub-block to get more precise
 *       predictions. Then, this function will determine whether to use 4
 *       sub-blocks to replace the entire block. If we do need to split the
 *       entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
 *       the searched motion vector and search error (MSE) w.r.t. each sub-block
 *       respectively. Otherwise, the 4 elements will be the same, all of which
 *       are assigned as the searched motion vector and search error (MSE) for
 *       the entire block.
64 *
65 * \ingroup src_frame_proc
66 * \param[in] cpi Top level encoder instance structure
67 * \param[in] mb Pointer to macroblock
68 * \param[in] frame_to_filter Pointer to the frame to be filtered
69 * \param[in] ref_frame Pointer to the reference frame
70 * \param[in] block_size Block size used for motion search
71 * \param[in] mb_row Row index of the block in the frame
72 * \param[in] mb_col Column index of the block in the frame
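 * \param[in,out] ref_mv        Reference motion vector, which is commonly
 *                              inherited from the motion search result of
 *                              previous frame. It may be overwritten: with the
 *                              refined whole-block motion vector when
 *                              fractional search is performed, and with zero
 *                              when the whole-block error is too large for the
 *                              vector to be passed down.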
76 * \param[out] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
77 * \param[out] subblock_mses Pointer to the search errors (MSE) for 4
78 * sub-blocks
79 *
80 * \return Nothing will be returned. Results are saved in subblock_mvs and
81 * subblock_mses
82 */
static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
                             const YV12_BUFFER_CONFIG *frame_to_filter,
                             const YV12_BUFFER_CONFIG *ref_frame,
                             const BLOCK_SIZE block_size, const int mb_row,
                             const int mb_col, MV *ref_mv, MV *subblock_mvs,
                             int *subblock_mses) {
  // Outline:
  //   1. Full-pixel search on the whole block, starting from `*ref_mv`.
  //   2a. Forced integer MVs: score the full-pixel result and use it for the
  //       whole block and for all four sub-block slots.
  //   2b. Otherwise: fractional refinement of the whole block (which also
  //       updates `*ref_mv`), then an independent full-pixel + fractional
  //       search on each of the four sub-blocks.
  //   3. tf_determine_block_partition() chooses between the whole-block
  //      result and the four sub-block results.
  //   4. `*ref_mv` is reset to zero when the whole-block error is too large.
  // `mb->plane[0].src` and `mbd->plane[0].pre[0]` are repointed at the two
  // frames while searching and restored before the partition decision.

  // Frame information
  const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);

  // Block information (ONLY Y-plane is used for motion search).
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mb_pels = mb_height * mb_width;
  const int y_stride = frame_to_filter->y_stride;
  assert(y_stride == ref_frame->y_stride);
  // The same offset addresses the block in both frames because the strides
  // are asserted to be equal.
  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;

  // Save input state.
  MACROBLOCKD *const mbd = &mb->e_mbd;
  const struct buf_2d ori_src_buf = mb->plane[0].src;
  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];

  // Parameters used for motion search.
  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
  SUBPEL_MOTION_SEARCH_PARAMS ms_params;
  const SEARCH_METHODS search_method = NSTEP;
  const search_site_config *search_site_cfg =
      cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
  const int step_param = av1_init_search_range(
      AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
  const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
  const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
  // The motion vector cost model is picked from the smaller frame dimension.
  const MV_COST_TYPE mv_cost_type =
      min_frame_size >= 720
          ? MV_COST_L1_HDRES
          : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);

  // Starting position for motion search.
  FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
  // Baseline position for motion search (used for rate distortion comparison).
  const MV baseline_mv = kZeroMv;

  // Setup.
  mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
  mb->plane[0].src.stride = y_stride;
  mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
  mbd->plane[0].pre[0].stride = y_stride;
  // Unused intermediate results for motion search.
  unsigned int sse, error;
  int distortion;
  int cost_list[5];

  // Do motion search.
  int_mv best_mv;  // Searched motion vector.
  int block_mse = INT_MAX;
  MV block_mv = kZeroMv;

  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
                                     &baseline_mv, search_site_cfg,
                                     /*fine_search_interval=*/0);
  av1_set_mv_search_method(&full_ms_params, search_site_cfg, search_method);
  full_ms_params.run_mesh_search = 1;
  full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;

  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                        cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
                        NULL);

  if (force_integer_mv == 1) {  // Only do full search on the entire block.
    // The full-pixel components double as the pixel displacement into the
    // reference frame; the stored vector is re-expressed via GET_MV_SUBPEL.
    const int mv_row = best_mv.as_mv.row;
    const int mv_col = best_mv.as_mv.col;
    best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
    best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
    const int mv_offset = mv_row * y_stride + mv_col;
    error = cpi->ppi->fn_ptr[block_size].vf(
        ref_frame->y_buffer + y_offset + mv_offset, y_stride,
        frame_to_filter->y_buffer + y_offset, y_stride, &sse);
    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
    block_mv = best_mv.as_mv;

    // No sub-block search is run in this mode. Fill the four output slots
    // with the whole-block result so that the partition decision below reads
    // values produced here rather than whatever the caller left in the
    // arrays.
    for (int i = 0; i < 4; ++i) {
      subblock_mvs[i] = block_mv;
      subblock_mses[i] = block_mse;
    }
  } else {  // Do fractional search on the entire block and all sub-blocks.
    av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
                                      &baseline_mv, cost_list);
    ms_params.forced_stop = EIGHTH_PEL;
    ms_params.var_params.subpel_search_type = subpel_search_type;
    // Since we are merely refining the result from full pixel search, we don't
    // need regularization for subpel search
    ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;

    MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
    error = cpi->mv_search_params.find_fractional_mv_step(
        &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
        &distortion, &sse, NULL);
    block_mse = DIVIDE_AND_ROUND(error, mb_pels);
    block_mv = best_mv.as_mv;
    *ref_mv = best_mv.as_mv;
    // On 4 sub-blocks.
    const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
    const int subblock_height = block_size_high[subblock_size];
    const int subblock_width = block_size_wide[subblock_size];
    const int subblock_pels = subblock_height * subblock_width;
    // Every sub-block search starts from the refined whole-block vector.
    start_mv = get_fullmv_from_mv(ref_mv);

    // Sub-blocks are visited row by row, so results are stored in row-major
    // order.
    int subblock_idx = 0;
    for (int i = 0; i < mb_height; i += subblock_height) {
      for (int j = 0; j < mb_width; j += subblock_width) {
        const int offset = i * y_stride + j;
        mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
        mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
        av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb,
                                           subblock_size, &baseline_mv,
                                           search_site_cfg,
                                           /*fine_search_interval=*/0);
        av1_set_mv_search_method(&full_ms_params, search_site_cfg,
                                 search_method);
        full_ms_params.run_mesh_search = 1;
        full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;

        av1_full_pixel_search(start_mv, &full_ms_params, step_param,
                              cond_cost_list(cpi, cost_list),
                              &best_mv.as_fullmv, NULL);

        av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
                                          &baseline_mv, cost_list);
        ms_params.forced_stop = EIGHTH_PEL;
        ms_params.var_params.subpel_search_type = subpel_search_type;
        // Since we are merely refining the result from full pixel search, we
        // don't need regularization for subpel search
        ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;

        subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
        error = cpi->mv_search_params.find_fractional_mv_step(
            &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
            &best_mv.as_mv, &distortion, &sse, NULL);
        subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
        subblock_mvs[subblock_idx] = best_mv.as_mv;
        ++subblock_idx;
      }
    }
  }

  // Restore input state.
  mb->plane[0].src = ori_src_buf;
  mbd->plane[0].pre[0] = ori_pre_buf;

  // Make partition decision.
  tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
                               subblock_mses);

  // Do not pass down the reference motion vector if error is too large.
  // The threshold is scaled up by one bit per bit of depth above 8.
  const int thresh = (min_frame_size >= 720) ? 12 : 3;
  if (block_mse > (thresh << (mbd->bd - 8))) {
    *ref_mv = kZeroMv;
  }
}
237 /*!\cond */
238
239 // Determines whether to split the entire block to 4 sub-blocks for filtering.
240 // In particular, this decision is made based on the comparison between the
241 // motion search error of the entire block and the errors of all sub-blocks.
242 // Inputs:
243 // block_mv: Motion vector for the entire block (ONLY as reference).
244 // block_mse: Motion search error (MSE) for the entire block (ONLY as
245 // reference).
246 // subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
247 // modified based on the partition decision).
248 // subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
249 // be modified based on the partition decision).
250 // Returns:
251 // Nothing will be returned. Results are saved in `subblock_mvs` and
252 // `subblock_mses`.
static void tf_determine_block_partition(const MV block_mv, const int block_mse,
                                         MV *subblock_mvs, int *subblock_mses) {
  // Gather the sum and the spread (max - min) of the four sub-block errors.
  int min_subblock_mse = INT_MAX;
  int max_subblock_mse = INT_MIN;
  int64_t sum_subblock_mse = 0;
  for (int i = 0; i < 4; ++i) {
    sum_subblock_mse += subblock_mses[i];
    min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
    max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
  }

  // Every term of the decision is evaluated in 64 bits. As plain `int`,
  // `block_mse * 15` overflows once `block_mse` exceeds INT_MAX / 15, and
  // `max - min` overflows for operands of opposite sign; both would be
  // undefined behaviour. The outcome is unchanged wherever the `int`
  // arithmetic did not overflow.
  const int64_t wide_block_mse = block_mse;
  const int64_t scaled_sum = sum_subblock_mse * 4;
  const int64_t mse_spread = (int64_t)max_subblock_mse - min_subblock_mse;

  // TODO(any): The following magic numbers may be tuned to improve the
  // performance OR find a way to get rid of these magic numbers.
  // Keep the whole block when its error is small relative to the summed
  // sub-block errors AND the sub-block errors are close to one another; the
  // tighter spread limit is paired with the looser ratio.
  if ((wide_block_mse * 15 < scaled_sum && mse_spread < 48) ||
      (wide_block_mse * 14 < scaled_sum && mse_spread < 24)) {  // No split.
    for (int i = 0; i < 4; ++i) {
      subblock_mvs[i] = block_mv;
      subblock_mses[i] = block_mse;
    }
  }
  // Otherwise the sub-block results are left untouched (split).
}
276
277 // Helper function to determine whether a frame is encoded with high bit-depth.
is_frame_high_bitdepth(const YV12_BUFFER_CONFIG * frame)278 static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
279 return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
280 }
281
282 /*!\endcond */
/*!\brief Builds predictor for blocks in temporal filtering. This is the
 * second step for temporal filtering, which is to construct predictions from
 * all reference frames INCLUDING the frame to be filtered itself. These
 * predictors are built based on the motion search results (motion vector is
 * set as 0 for the frame to be filtered), and will be further used for
 * weighted averaging.
289 *
290 * \ingroup src_frame_proc
291 * \param[in] ref_frame Pointer to the reference frame (or the frame
292 * to be filtered)
293 * \param[in] mbd Pointer to the block for filtering. Besides
294 * containing the subsampling information of all
295 * planes, this field also gives the searched
296 * motion vector for the entire block, i.e.,
297 * `mbd->mi[0]->mv[0]`. This vector should be 0
298 * if the `ref_frame` itself is the frame to be
299 * filtered.
300 * \param[in] block_size Size of the block
301 * \param[in] mb_row Row index of the block in the frame
302 * \param[in] mb_col Column index of the block in the frame
303 * \param[in] num_planes Number of planes in the frame
304 * \param[in] scale Scaling factor
305 * \param[in] subblock_mvs The motion vectors for each sub-block (row-major
306 * order)
307 * \param[out] pred Pointer to the predictor to be built
308 *
309 * \return Nothing returned, But the contents of `pred` will be modified
310 */
tf_build_predictor(const YV12_BUFFER_CONFIG * ref_frame,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,const struct scale_factors * scale,const MV * subblock_mvs,uint8_t * pred)311 static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
312 const MACROBLOCKD *mbd,
313 const BLOCK_SIZE block_size, const int mb_row,
314 const int mb_col, const int num_planes,
315 const struct scale_factors *scale,
316 const MV *subblock_mvs, uint8_t *pred) {
317 // Information of the entire block.
318 const int mb_height = block_size_high[block_size]; // Height.
319 const int mb_width = block_size_wide[block_size]; // Width.
320 const int mb_y = mb_height * mb_row; // Y-coord (Top-left).
321 const int mb_x = mb_width * mb_col; // X-coord (Top-left).
322 const int bit_depth = mbd->bd; // Bit depth.
323 const int is_intrabc = 0; // Is intra-copied?
324 const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
325
326 // Default interpolation filters.
327 const int_interpfilters interp_filters =
328 av1_broadcast_interp_filter(MULTITAP_SHARP2);
329
330 // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
331 int plane_offset = 0;
332 for (int plane = 0; plane < num_planes; ++plane) {
333 const int subsampling_y = mbd->plane[plane].subsampling_y;
334 const int subsampling_x = mbd->plane[plane].subsampling_x;
335 // Information of each sub-block in current plane.
336 const int plane_h = mb_height >> subsampling_y; // Plane height.
337 const int plane_w = mb_width >> subsampling_x; // Plane width.
338 const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left).
339 const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left).
340 const int h = plane_h >> 1; // Sub-block height.
341 const int w = plane_w >> 1; // Sub-block width.
342 const int is_y_plane = (plane == 0); // Is Y-plane?
343
344 const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
345 ref_frame->widths[is_y_plane ? 0 : 1],
346 ref_frame->heights[is_y_plane ? 0 : 1],
347 ref_frame->strides[is_y_plane ? 0 : 1] };
348
349 // Handle each subblock.
350 int subblock_idx = 0;
351 for (int i = 0; i < plane_h; i += h) {
352 for (int j = 0; j < plane_w; j += w) {
353 // Choose proper motion vector.
354 const MV mv = subblock_mvs[subblock_idx++];
355 assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
356 mv.col >= INT16_MIN && mv.col <= INT16_MAX);
357
358 const int y = plane_y + i;
359 const int x = plane_x + j;
360
361 // Build predictior for each sub-block on current plane.
362 InterPredParams inter_pred_params;
363 av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
364 subsampling_y, bit_depth, is_high_bitdepth,
365 is_intrabc, scale, &ref_buf, interp_filters);
366 inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
367 av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
368 plane_w, &mv, &inter_pred_params);
369 }
370 }
371 plane_offset += plane_h * plane_w;
372 }
373 }
374 /*!\cond */
375
// Computes temporal filter weights and accumulators for the frame to be
// filtered. More concretely, the filter weights for all pixels are the same.
// Inputs:
//   ref_frame: Pointer to the frame whose pixels are accumulated.
//   mbd: Pointer to the block for filtering, which is ONLY used to get
//        subsampling information of all planes as well as the bit-depth.
//   block_size: Size of the block.
//   mb_row: Row index of the block in the frame.
//   mb_col: Column index of the block in the frame.
//   num_planes: Number of planes in the frame.
//   accum: Pointer to the pixel-wise accumulator for filtering.
//   count: Pointer to the pixel-wise counter for filtering.
// Returns:
//   Nothing will be returned. But the content to which `accum` and `count`
//   point will be modified.
tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG * ref_frame,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,uint32_t * accum,uint16_t * count)389 void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
390 const MACROBLOCKD *mbd,
391 const BLOCK_SIZE block_size,
392 const int mb_row, const int mb_col,
393 const int num_planes, uint32_t *accum,
394 uint16_t *count) {
395 // Block information.
396 const int mb_height = block_size_high[block_size];
397 const int mb_width = block_size_wide[block_size];
398 const int is_high_bitdepth = is_cur_buf_hbd(mbd);
399
400 int plane_offset = 0;
401 for (int plane = 0; plane < num_planes; ++plane) {
402 const int subsampling_y = mbd->plane[plane].subsampling_y;
403 const int subsampling_x = mbd->plane[plane].subsampling_x;
404 const int h = mb_height >> subsampling_y; // Plane height.
405 const int w = mb_width >> subsampling_x; // Plane width.
406
407 const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
408 const uint8_t *buf8 = ref_frame->buffers[plane];
409 const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
410 const int frame_offset = mb_row * h * frame_stride + mb_col * w;
411
412 int pred_idx = 0;
413 int pixel_idx = 0;
414 for (int i = 0; i < h; ++i) {
415 for (int j = 0; j < w; ++j) {
416 const int idx = plane_offset + pred_idx; // Index with plane shift.
417 const int pred_value = is_high_bitdepth
418 ? buf16[frame_offset + pixel_idx]
419 : buf8[frame_offset + pixel_idx];
420 accum[idx] += TF_WEIGHT_SCALE * pred_value;
421 count[idx] += TF_WEIGHT_SCALE;
422 ++pred_idx;
423 ++pixel_idx;
424 }
425 pixel_idx += (frame_stride - w);
426 }
427 plane_offset += h * w;
428 }
429 }
430
431 // Function to compute pixel-wise squared difference between two buffers.
432 // Inputs:
433 // ref: Pointer to reference buffer.
434 // ref_offset: Start position of reference buffer for computation.
435 // ref_stride: Stride for reference buffer.
436 // tgt: Pointer to target buffer.
437 // tgt_offset: Start position of target buffer for computation.
438 // tgt_stride: Stride for target buffer.
439 // height: Height of block for computation.
440 // width: Width of block for computation.
441 // is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
//   square_diff: Pointer to save the squared differences.
443 // Returns:
444 // Nothing will be returned. But the content to which `square_diff` points
445 // will be modified.
compute_square_diff(const uint8_t * ref,const int ref_offset,const int ref_stride,const uint8_t * tgt,const int tgt_offset,const int tgt_stride,const int height,const int width,const int is_high_bitdepth,uint32_t * square_diff)446 static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
447 const int ref_stride, const uint8_t *tgt,
448 const int tgt_offset,
449 const int tgt_stride, const int height,
450 const int width,
451 const int is_high_bitdepth,
452 uint32_t *square_diff) {
453 const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
454 const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
455
456 int ref_idx = 0;
457 int tgt_idx = 0;
458 int idx = 0;
459 for (int i = 0; i < height; ++i) {
460 for (int j = 0; j < width; ++j) {
461 const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
462 : ref[ref_offset + ref_idx];
463 const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
464 : tgt[tgt_offset + tgt_idx];
465 const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
466 : (tgt_value - ref_value);
467 square_diff[idx] = diff * diff;
468
469 ++ref_idx;
470 ++tgt_idx;
471 ++idx;
472 }
473 ref_idx += (ref_stride - width);
474 tgt_idx += (tgt_stride - width);
475 }
476 }
477
478 // Function to accumulate pixel-wise squared difference between two luma buffers
479 // to be consumed while filtering the chroma planes.
480 // Inputs:
481 // square_diff: Pointer to squared differences from luma plane.
482 // luma_sse_sum: Pointer to save the sum of luma squared differences.
483 // block_height: Height of block for computation.
484 // block_width: Width of block for computation.
485 // ss_x_shift: Chroma subsampling shift in 'X' direction
486 // ss_y_shift: Chroma subsampling shift in 'Y' direction
487 // Returns:
488 // Nothing will be returned. But the content to which `luma_sse_sum` points
489 // will be modified.
void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
                               int block_height, int block_width,
                               int ss_x_shift, int ss_y_shift) {
  // For every chroma position, add up the luma squared differences of the
  // (1 << ss_y_shift) x (1 << ss_x_shift) luma samples that map onto it.
  for (int row = 0; row < block_height; ++row) {
    for (int col = 0; col < block_width; ++col) {
      const int span_y = 1 << ss_y_shift;  // Luma rows per chroma sample.
      const int span_x = 1 << ss_x_shift;  // Luma columns per chroma sample.
      const int luma_width = block_width << ss_x_shift;
      const int luma_top = row << ss_y_shift;
      const int luma_left = col << ss_x_shift;

      // The sum is added on top of whatever the output cell already holds.
      uint32_t *const cell = &luma_sse_sum[row * block_width + col];
      for (int dy = 0; dy < span_y; ++dy) {
        const int luma_row_start = (luma_top + dy) * luma_width;
        for (int dx = 0; dx < span_x; ++dx) {
          *cell += square_diff[luma_row_start + luma_left + dx];
        }
      }
    }
  }
}
506
507 /*!\endcond */
508 /*!\brief Applies temporal filtering. NOTE that there are various optimised
509 * versions of this function called where the appropriate instruction set is
510 * supported.
511 *
512 * \ingroup src_frame_proc
513 * \param[in] frame_to_filter Pointer to the frame to be filtered, which is
514 * used as reference to compute squared
515 * difference from the predictor.
516 * \param[in] mbd Pointer to the block for filtering, ONLY used
517 * to get subsampling information for the planes
518 * \param[in] block_size Size of the block
519 * \param[in] mb_row Row index of the block in the frame
520 * \param[in] mb_col Column index of the block in the frame
521 * \param[in] num_planes Number of planes in the frame
522 * \param[in] noise_levels Estimated noise levels for each plane
523 * in the frame (Y,U,V)
524 * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
525 * \param[in] subblock_mses Pointer to the search errors (MSE) for 4
526 * sub-blocks
527 * \param[in] q_factor Quantization factor. This is actually the `q`
528 * defined in libaom, converted from `qindex`
529 * \param[in] filter_strength Filtering strength. This value lies in range
530 * [0, 6] where 6 is the maximum strength.
 * \param[in]   pred            Pointer to the well-built predictors
 * \param[in,out] accum         Pointer to the pixel-wise accumulator for
 *                              filtering
 * \param[in,out] count         Pointer to the pixel-wise counter for
 *                              filtering
 *
 * \return Nothing returned, but the contents of `accum` and `count` will be
 * modified
539 */
av1_apply_temporal_filter_c(const YV12_BUFFER_CONFIG * frame_to_filter,const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,const double * noise_levels,const MV * subblock_mvs,const int * subblock_mses,const int q_factor,const int filter_strength,const uint8_t * pred,uint32_t * accum,uint16_t * count)540 void av1_apply_temporal_filter_c(
541 const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
542 const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
543 const int num_planes, const double *noise_levels, const MV *subblock_mvs,
544 const int *subblock_mses, const int q_factor, const int filter_strength,
545 const uint8_t *pred, uint32_t *accum, uint16_t *count) {
546 // Block information.
547 const int mb_height = block_size_high[block_size];
548 const int mb_width = block_size_wide[block_size];
549 const int mb_pels = mb_height * mb_width;
550 const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
551 const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
552 // Frame information.
553 const int frame_height = frame_to_filter->y_crop_height;
554 const int frame_width = frame_to_filter->y_crop_width;
555 const int min_frame_size = AOMMIN(frame_height, frame_width);
556 // Variables to simplify combined error calculation.
557 const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
558 TF_SEARCH_ERROR_NORM_WEIGHT);
559 const double weight_factor =
560 (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
561 // Decay factors for non-local mean approach.
562 double decay_factor[MAX_MB_PLANE] = { 0 };
563 // Adjust filtering based on q.
564 // Larger q -> stronger filtering -> larger weight.
565 // Smaller q -> weaker filtering -> smaller weight.
566 double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
567 q_decay = CLIP(q_decay, 1e-5, 1);
568 if (q_factor >= TF_QINDEX_CUTOFF) {
569 // Max q_factor is 255, therefore the upper bound of q_decay is 8.
570 // We do not need a clip here.
571 q_decay = 0.5 * pow((double)q_factor / 64, 2);
572 }
573 // Smaller strength -> smaller filtering weight.
574 double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
575 s_decay = CLIP(s_decay, 1e-5, 1);
576 for (int plane = 0; plane < num_planes; plane++) {
577 // Larger noise -> larger filtering weight.
578 const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
579 decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
580 }
581 double d_factor[4] = { 0 };
582 for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
583 // Larger motion vector -> smaller filtering weight.
584 const MV mv = subblock_mvs[subblock_idx];
585 const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
586 double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
587 distance_threshold = AOMMAX(distance_threshold, 1);
588 d_factor[subblock_idx] = distance / distance_threshold;
589 d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
590 }
591
592 // Allocate memory for pixel-wise squared differences. They,
593 // regardless of the subsampling, are assigned with memory of size `mb_pels`.
594 uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
595 memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
596
597 // Allocate memory for accumulated luma squared error. This value will be
598 // consumed while filtering the chroma planes.
599 uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
600 memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
601
602 // Get window size for pixel-wise filtering.
603 assert(TF_WINDOW_LENGTH % 2 == 1);
604 const int half_window = TF_WINDOW_LENGTH >> 1;
605
606 // Handle planes in sequence.
607 int plane_offset = 0;
608 for (int plane = 0; plane < num_planes; ++plane) {
609 // Locate pixel on reference frame.
610 const int subsampling_y = mbd->plane[plane].subsampling_y;
611 const int subsampling_x = mbd->plane[plane].subsampling_x;
612 const int h = mb_height >> subsampling_y; // Plane height.
613 const int w = mb_width >> subsampling_x; // Plane width.
614 const int frame_stride =
615 frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
616 const int frame_offset = mb_row * h * frame_stride + mb_col * w;
617 const uint8_t *ref = frame_to_filter->buffers[plane];
618 const int ss_y_shift =
619 subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
620 const int ss_x_shift =
621 subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
622 const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
623 ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
624 const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
625
626 // Filter U-plane and V-plane using Y-plane. This is because motion
627 // search is only done on Y-plane, so the information from Y-plane will
628 // be more accurate. The luma sse sum is reused in both chroma planes.
629 if (plane == AOM_PLANE_U)
630 compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
631 ss_y_shift);
632 compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
633 h, w, is_high_bitdepth, square_diff);
634
635 // Perform filtering.
636 int pred_idx = 0;
637 for (int i = 0; i < h; ++i) {
638 for (int j = 0; j < w; ++j) {
639 // non-local mean approach
640 uint64_t sum_square_diff = 0;
641
642 for (int wi = -half_window; wi <= half_window; ++wi) {
643 for (int wj = -half_window; wj <= half_window; ++wj) {
644 const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane.
645 const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane.
646 sum_square_diff += square_diff[y * w + x];
647 }
648 }
649
650 sum_square_diff += luma_sse_sum[i * w + j];
651
652 // Scale down the difference for high bit depth input.
653 if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
654
655 // Combine window error and block error, and normalize it.
656 const double window_error = sum_square_diff * inv_num_ref_pixels;
657 const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
658 const double block_error = (double)subblock_mses[subblock_idx];
659 const double combined_error =
660 weight_factor * window_error + block_error * inv_factor;
661
662 // Compute filter weight.
663 double scaled_error =
664 combined_error * d_factor[subblock_idx] * decay_factor[plane];
665 scaled_error = AOMMIN(scaled_error, 7);
666 const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
667
668 const int idx = plane_offset + pred_idx; // Index with plane shift.
669 const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
670 accum[idx] += weight * pred_value;
671 count[idx] += weight;
672
673 ++pred_idx;
674 }
675 }
676 plane_offset += h * w;
677 }
678
679 aom_free(square_diff);
680 aom_free(luma_sse_sum);
681 }
682 #if CONFIG_AV1_HIGHBITDEPTH
683 // Calls High bit-depth temporal filter
void av1_highbd_apply_temporal_filter_c(
    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
    const int *subblock_mses, const int q_factor, const int filter_strength,
    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
  // av1_apply_temporal_filter_c() selects 8-bit or 16-bit sample access from
  // the flags of `frame_to_filter`, so this entry point only forwards its
  // arguments unchanged.
  av1_apply_temporal_filter_c(
      frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
      noise_levels, subblock_mvs, subblock_mses, q_factor, filter_strength,
      pred, accum, count);
}
695 #endif // CONFIG_AV1_HIGHBITDEPTH
696 /*!\brief Normalizes the accumulated filtering result to produce the filtered
697 * frame
698 *
699 * \ingroup src_frame_proc
700 * \param[in] mbd Pointer to the block for filtering, which is
701 * ONLY used to get subsampling information for
702 * all the planes
703 * \param[in] block_size Size of the block
704 * \param[in] mb_row Row index of the block in the frame
705 * \param[in] mb_col Column index of the block in the frame
706 * \param[in] num_planes Number of planes in the frame
707 * \param[in] accum Pointer to the pre-computed accumulator
708 * \param[in] count Pointer to the pre-computed count
709 * \param[out] result_buffer Pointer to result buffer
710 *
711 * \return Nothing returned, but the content to which `result_buffer` pointer
712 * will be modified
713 */
tf_normalize_filtered_frame(const MACROBLOCKD * mbd,const BLOCK_SIZE block_size,const int mb_row,const int mb_col,const int num_planes,const uint32_t * accum,const uint16_t * count,YV12_BUFFER_CONFIG * result_buffer)714 static void tf_normalize_filtered_frame(
715 const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
716 const int mb_col, const int num_planes, const uint32_t *accum,
717 const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
718 // Block information.
719 const int mb_height = block_size_high[block_size];
720 const int mb_width = block_size_wide[block_size];
721 const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
722
723 int plane_offset = 0;
724 for (int plane = 0; plane < num_planes; ++plane) {
725 const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
726 const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
727 const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
728 const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
729 uint8_t *const buf = result_buffer->buffers[plane];
730 uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
731
732 int plane_idx = 0; // Pixel index on current plane (block-base).
733 int frame_idx = frame_offset; // Pixel index on the entire frame.
734 for (int i = 0; i < plane_h; ++i) {
735 for (int j = 0; j < plane_w; ++j) {
736 const int idx = plane_idx + plane_offset;
737 const uint16_t rounding = count[idx] >> 1;
738 if (is_high_bitdepth) {
739 buf16[frame_idx] =
740 (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
741 } else {
742 buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
743 }
744 ++plane_idx;
745 ++frame_idx;
746 }
747 frame_idx += (frame_stride - plane_w);
748 }
749 plane_offset += plane_h * plane_w;
750 }
751 }
752
av1_get_q(const AV1_COMP * cpi)753 int av1_get_q(const AV1_COMP *cpi) {
754 const GF_GROUP *gf_group = &cpi->ppi->gf_group;
755 const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
756 const int q =
757 (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
758 cpi->common.seq_params->bit_depth);
759 return q;
760 }
761
// Applies temporal filtering to every block of block-row `mb_row`.
//
// For each block in the row, all frames listed in cpi->tf_ctx are folded into
// the per-thread accumulator/count buffers of `td`, the normalized result is
// written to cpi->ppi->alt_ref_buffer, and, when the context asks for the
// show-existing check, the luma difference between the source and the
// filtered block is added to td->tf_data.diff.
void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
  // Filtering context prepared for the whole frame.
  TemporalFilterCtx *const tf_ctx = &cpi->tf_ctx;
  YV12_BUFFER_CONFIG **const frames = tf_ctx->frames;
  const int num_frames = tf_ctx->num_frames;
  const int filter_frame_idx = tf_ctx->filter_frame_idx;
  const int check_show_existing = tf_ctx->check_show_existing;
  const struct scale_factors *const scale = &tf_ctx->sf;
  const double *const noise_levels = tf_ctx->noise_levels;
  const int num_pels = tf_ctx->num_pels;
  const int q_factor = tf_ctx->q_factor;
  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
  YV12_BUFFER_CONFIG *const filtered_frame = &cpi->ppi->alt_ref_buffer;
  const int is_highbd = is_frame_high_bitdepth(frame_to_filter);

  // Per-thread working buffers.
  MACROBLOCK *const mb = &td->mb;
  MACROBLOCKD *const mbd = &mb->e_mbd;
  TemporalFilterData *const tf_data = &td->tf_data;
  uint32_t *const accum = tf_data->accum;
  uint16_t *const count = tf_data->count;
  uint8_t *const pred = tf_data->pred;
  FRAME_DIFF *const diff = &tf_data->diff;

  // Block geometry.
  const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  const int mi_h = mi_size_high_log2[block_size];
  const int mi_w = mi_size_wide_log2[block_size];
  const int num_planes = av1_num_planes(&cpi->common);

  // Factor to control the filtering strength.
  const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;

  // TODO(any): avx2/sse2 version should be changed to align with C
  // function before using. In particular, current avx2/sse2 function
  // only supports 32x32 block size and 5x5 filtering window.
  const int simd_supported =
      (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5);

  // The row limits of the motion search are shared by the whole block row.
  av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
                        (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
                        cpi->oxcf.border_in_pixels);

  for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
    av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
                          (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
                          cpi->oxcf.border_in_pixels);

    // Start every block from an empty accumulator.
    memset(accum, 0, num_pels * sizeof(accum[0]));
    memset(count, 0, num_pels * sizeof(count[0]));

    // Reference motion vector passed down along frames.
    MV ref_mv = kZeroMv;

    // Fold the frames in, one at a time.
    for (int frame = 0; frame < num_frames; frame++) {
      if (frames[frame] == NULL) continue;

      if (frame == filter_frame_idx) {
        // Frame to be filtered: flip the sign of ref_mv for the frames that
        // follow, then accumulate the frame itself.
        ref_mv.row *= -1;
        ref_mv.col *= -1;
        tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
                                      mb_col, num_planes, accum, count);
        continue;
      }

      // Reference frame: motion search, prediction, then weighted averaging.
      MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
      int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
      tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
                       mb_row, mb_col, &ref_mv, subblock_mvs, subblock_mses);
      tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
                         num_planes, scale, subblock_mvs, pred);

      // NOTE: All variants of av1_apply_temporal_filter() contain floating
      // point operations.
      if (!simd_supported) {
        av1_apply_temporal_filter_c(
            frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
            noise_levels, subblock_mvs, subblock_mses, q_factor,
            filter_strength, pred, accum, count);
      } else if (is_highbd) {  // for high bit-depth
#if CONFIG_AV1_HIGHBITDEPTH
        av1_highbd_apply_temporal_filter(
            frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
            noise_levels, subblock_mvs, subblock_mses, q_factor,
            filter_strength, pred, accum, count);
#else
        av1_apply_temporal_filter_c(
            frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
            noise_levels, subblock_mvs, subblock_mses, q_factor,
            filter_strength, pred, accum, count);
#endif  // CONFIG_AV1_HIGHBITDEPTH
      } else {  // for 8-bit
        av1_apply_temporal_filter(frame_to_filter, mbd, block_size, mb_row,
                                  mb_col, num_planes, noise_levels,
                                  subblock_mvs, subblock_mses, q_factor,
                                  filter_strength, pred, accum, count);
      }
    }

    tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
                                accum, count, filtered_frame);

    if (!check_show_existing) continue;

    // Accumulate the luma difference between the source block and the
    // filtered block.
    const int y_height = mb_height >> mbd->plane[0].subsampling_y;
    const int y_width = mb_width >> mbd->plane[0].subsampling_x;
    const int source_y_stride = frame_to_filter->y_stride;
    const int filter_y_stride = filtered_frame->y_stride;
    const int block_row_start = mb_row * y_height;
    const int block_col_start = mb_col * y_width;
    const int source_offset =
        block_row_start * source_y_stride + block_col_start;
    const int filter_offset =
        block_row_start * filter_y_stride + block_col_start;
    unsigned int sse = 0;
    cpi->ppi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
                                    source_y_stride,
                                    filtered_frame->y_buffer + filter_offset,
                                    filter_y_stride, &sse);
    diff->sum += sse;
    diff->sse += sse * (int64_t)sse;
  }
}
885
886 /*!\brief Does temporal filter for a given frame.
887 *
888 * \ingroup src_frame_proc
889 * \param[in] cpi Top level encoder instance structure
890 *
891 * \return Nothing will be returned, but the contents of td->diff will be
892 modified.
893 */
tf_do_filtering(AV1_COMP * cpi)894 static void tf_do_filtering(AV1_COMP *cpi) {
895 // Basic information.
896 ThreadData *td = &cpi->td;
897 TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
898 const struct scale_factors *scale = &tf_ctx->sf;
899 const int num_planes = av1_num_planes(&cpi->common);
900 assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
901
902 MACROBLOCKD *mbd = &td->mb.e_mbd;
903 uint8_t *input_buffer[MAX_MB_PLANE];
904 MB_MODE_INFO **input_mb_mode_info;
905 tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
906 tf_setup_macroblockd(mbd, &td->tf_data, scale);
907
908 // Perform temporal filtering for each row.
909 for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++)
910 av1_tf_do_filtering_row(cpi, td, mb_row);
911
912 tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
913 }
914
915 /*!\brief Setups the frame buffer for temporal filtering. This fuction
916 * determines how many frames will be used for temporal filtering and then
917 * groups them into a buffer. This function will also estimate the noise level
918 * of the to-filter frame.
919 *
920 * \ingroup src_frame_proc
921 * \param[in] cpi Top level encoder instance structure
922 * \param[in] filter_frame_lookahead_idx The index of the to-filter frame
923 * in the lookahead buffer cpi->lookahead
924 * \param[in] is_second_arf Whether the to-filter frame is the second ARF.
925 * This field will affect the number of frames
926 * used for filtering.
927 * \param[in] update_type This frame's update type.
928 *
929 * \param[in] is_forward_keyframe Indicate whether this is a forward keyframe.
930 *
931 * \return Nothing will be returned. But the fields `frames`, `num_frames`,
932 * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
933 */
tf_setup_filtering_buffer(AV1_COMP * cpi,const int filter_frame_lookahead_idx,const int is_second_arf,FRAME_UPDATE_TYPE update_type,int is_forward_keyframe)934 static void tf_setup_filtering_buffer(AV1_COMP *cpi,
935 const int filter_frame_lookahead_idx,
936 const int is_second_arf,
937 FRAME_UPDATE_TYPE update_type,
938 int is_forward_keyframe) {
939 TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
940 YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
941 // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
942 // temporal filtering.
943 int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
944 int num_before = 0; // Number of filtering frames before the to-filter frame.
945 int num_after = 0; // Number of filtering frames after the to-filer frame.
946 const int lookahead_depth =
947 av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
948
949 int arf_src_offset = cpi->ppi->gf_group.arf_src_offset[cpi->gf_frame_index];
950 const FRAME_TYPE frame_type =
951 cpi->ppi->gf_group.frame_type[cpi->gf_frame_index];
952
953 // Temporal filtering should not go beyond key frames
954 const int key_to_curframe =
955 AOMMAX(cpi->rc.frames_since_key + arf_src_offset, 0);
956 const int curframe_to_key =
957 AOMMAX(cpi->rc.frames_to_key - arf_src_offset - 1, 0);
958
959 // Number of buffered frames before the to-filter frame.
960 int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
961
962 // Number of buffered frames after the to-filter frame.
963 int max_after =
964 AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
965
966 // Estimate noises for each plane.
967 const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
968 cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage);
969 assert(to_filter_buf != NULL);
970 const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
971 const int num_planes = av1_num_planes(&cpi->common);
972 double *noise_levels = tf_ctx->noise_levels;
973 for (int plane = 0; plane < num_planes; ++plane) {
974 noise_levels[plane] = av1_estimate_noise_from_single_plane(
975 to_filter_frame, plane, cpi->common.seq_params->bit_depth);
976 }
977 // Get quantization factor.
978 const int q = av1_get_q(cpi);
979 // Get correlation estimates from first-pass;
980 const FIRSTPASS_STATS *stats =
981 cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
982 double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
983 for (int i = 1; i <= max_after; i++) {
984 if (stats + filter_frame_lookahead_idx + i >=
985 cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
986 max_after = i - 1;
987 break;
988 }
989 accu_coeff1 *=
990 AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
991 }
992 if (max_after >= 1) {
993 accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
994 }
995 for (int i = 1; i <= max_before; i++) {
996 if (stats + filter_frame_lookahead_idx - i + 1 <=
997 cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
998 max_before = i - 1;
999 break;
1000 }
1001 accu_coeff0 *=
1002 AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
1003 }
1004 if (max_before >= 1) {
1005 accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
1006 }
1007
1008 // Adjust number of filtering frames based on quantization factor. When the
1009 // quantization factor is small enough (lossless compression), we will not
1010 // change the number of frames for key frame filtering, which is to avoid
1011 // visual quality drop.
1012 int adjust_num = 6;
1013 if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering.
1014 adjust_num = 0;
1015 } else if ((update_type == KF_UPDATE) && q <= 10) {
1016 adjust_num = 0;
1017 }
1018 num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
1019
1020 if (frame_type == KEY_FRAME) {
1021 num_before = is_forward_keyframe ? num_frames / 2 : 0;
1022 num_after = AOMMIN(num_frames - 1, max_after);
1023 } else {
1024 num_frames = AOMMIN(num_frames, cpi->ppi->p_rc.gfu_boost / 150);
1025 num_frames += !(num_frames & 1); // Make the number odd.
1026 // Only use 2 neighbours for the second ARF.
1027 if (is_second_arf) num_frames = AOMMIN(num_frames, 3);
1028 if (AOMMIN(max_after, max_before) >= num_frames / 2) {
1029 // just use half half
1030 num_before = num_frames / 2;
1031 num_after = num_frames / 2;
1032 } else {
1033 if (max_after < num_frames / 2) {
1034 num_after = max_after;
1035 num_before = AOMMIN(num_frames - 1 - num_after, max_before);
1036 } else {
1037 num_before = max_before;
1038 num_after = AOMMIN(num_frames - 1 - num_before, max_after);
1039 }
1040 // Adjust insymmetry based on frame-level correlation
1041 if (max_after > 0 && max_before > 0) {
1042 if (num_after < num_before) {
1043 const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
1044 num_before = AOMMIN(num_before, num_after + insym);
1045 } else {
1046 const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
1047 num_after = AOMMIN(num_after, num_before + insym);
1048 }
1049 }
1050 }
1051 }
1052 num_frames = num_before + 1 + num_after;
1053
1054 // Setup the frame buffer.
1055 for (int frame = 0; frame < num_frames; ++frame) {
1056 const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
1057 struct lookahead_entry *buf = av1_lookahead_peek(
1058 cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
1059 assert(buf != NULL);
1060 frames[frame] = &buf->img;
1061 }
1062 tf_ctx->num_frames = num_frames;
1063 tf_ctx->filter_frame_idx = num_before;
1064 assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
1065
1066 av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
1067 cpi->common.seq_params->sb_size);
1068 av1_setup_block_planes(&cpi->td.mb.e_mbd,
1069 cpi->common.seq_params->subsampling_x,
1070 cpi->common.seq_params->subsampling_y, num_planes);
1071 }
1072
1073 /*!\cond */
1074
1075 // A constant number, sqrt(pi / 2), used for noise estimation.
1076 static const double SQRT_PI_BY_2 = 1.25331413732;
1077
av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG * frame,const int plane,const int bit_depth)1078 double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
1079 const int plane,
1080 const int bit_depth) {
1081 const int is_y_plane = (plane == 0);
1082 const int height = frame->crop_heights[is_y_plane ? 0 : 1];
1083 const int width = frame->crop_widths[is_y_plane ? 0 : 1];
1084 const int stride = frame->strides[is_y_plane ? 0 : 1];
1085 const uint8_t *src = frame->buffers[plane];
1086 const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
1087 const int is_high_bitdepth = is_frame_high_bitdepth(frame);
1088
1089 int64_t accum = 0;
1090 int count = 0;
1091 for (int i = 1; i < height - 1; ++i) {
1092 for (int j = 1; j < width - 1; ++j) {
1093 // Setup a small 3x3 matrix.
1094 const int center_idx = i * stride + j;
1095 int mat[3][3];
1096 for (int ii = -1; ii <= 1; ++ii) {
1097 for (int jj = -1; jj <= 1; ++jj) {
1098 const int idx = center_idx + ii * stride + jj;
1099 mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
1100 }
1101 }
1102 // Compute sobel gradients.
1103 const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
1104 2 * (mat[1][0] - mat[1][2]);
1105 const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
1106 2 * (mat[0][1] - mat[2][1]);
1107 const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
1108 // Accumulate Laplacian.
1109 if (Ga < NOISE_ESTIMATION_EDGE_THRESHOLD) { // Only count smooth pixels.
1110 const int v = 4 * mat[1][1] -
1111 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
1112 (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
1113 accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
1114 ++count;
1115 }
1116 }
1117 }
1118
1119 // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
1120 return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
1121 }
1122
1123 // Initializes the members of TemporalFilterCtx
1124 // Inputs:
1125 // cpi: Top level encoder instance structure
1126 // filter_frame_lookahead_idx: The index of the frame to be filtered in the
1127 // lookahead buffer cpi->lookahead.
1128 // is_second_arf: Flag indiacting whether second ARF filtering is required.
1129 // Returns:
1130 // Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
init_tf_ctx(AV1_COMP * cpi,int filter_frame_lookahead_idx,int is_second_arf,FRAME_UPDATE_TYPE update_type,int is_forward_keyframe)1131 static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
1132 int is_second_arf, FRAME_UPDATE_TYPE update_type,
1133 int is_forward_keyframe) {
1134 TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
1135 // Setup frame buffer for filtering.
1136 YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
1137 tf_ctx->num_frames = 0;
1138 tf_ctx->filter_frame_idx = -1;
1139 tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
1140 update_type, is_forward_keyframe);
1141 assert(tf_ctx->num_frames > 0);
1142 assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames);
1143
1144 // Check show existing condition for non-keyframes. For KFs, only check when
1145 // KF overlay is enabled.
1146 tf_ctx->check_show_existing =
1147 !(is_forward_keyframe && update_type == KF_UPDATE) ||
1148 cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
1149
1150 // Setup scaling factors. Scaling on each of the arnr frames is not
1151 // supported.
1152 // ARF is produced at the native frame size and resized when coded.
1153 struct scale_factors *sf = &tf_ctx->sf;
1154 av1_setup_scale_factors_for_frame(
1155 sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
1156 frames[0]->y_crop_width, frames[0]->y_crop_height);
1157
1158 // Initialize temporal filter parameters.
1159 MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
1160 const int filter_frame_idx = tf_ctx->filter_frame_idx;
1161 const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
1162 const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
1163 const int frame_height = frame_to_filter->y_crop_height;
1164 const int frame_width = frame_to_filter->y_crop_width;
1165 const int mb_width = block_size_wide[block_size];
1166 const int mb_height = block_size_high[block_size];
1167 const int mb_rows = get_num_blocks(frame_height, mb_height);
1168 const int mb_cols = get_num_blocks(frame_width, mb_width);
1169 const int mb_pels = mb_width * mb_height;
1170 const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
1171 const int num_planes = av1_num_planes(&cpi->common);
1172 int num_pels = 0;
1173 for (int i = 0; i < num_planes; i++) {
1174 const int subsampling_x = mbd->plane[i].subsampling_x;
1175 const int subsampling_y = mbd->plane[i].subsampling_y;
1176 num_pels += mb_pels >> (subsampling_x + subsampling_y);
1177 }
1178 tf_ctx->num_pels = num_pels;
1179 tf_ctx->mb_rows = mb_rows;
1180 tf_ctx->mb_cols = mb_cols;
1181 tf_ctx->is_highbitdepth = is_highbitdepth;
1182 tf_ctx->q_factor = av1_get_q(cpi);
1183 }
1184
// Performs temporal filtering for the frame at `filter_frame_lookahead_idx`
// in the lookahead buffer.
//
// Returns 0 without filtering for an internal ARF that is not treated as the
// second ARF. Also returns 0 when the second ARF was filtered but the result
// differs too much from the source. Returns 1 otherwise. When the
// show-existing check is enabled and the frame is not the second ARF,
// `*show_existing_arf` (if the pointer is non-NULL) is set to 1 when the
// filtered frame is close enough to the source and to 0 otherwise.
int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
                        FRAME_UPDATE_TYPE update_type, int is_forward_keyframe,
                        int *show_existing_arf) {
  MultiThreadInfo *const mt_info = &cpi->mt_info;
  // Basic information of the current frame.
  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
  const uint8_t group_idx = cpi->gf_frame_index;
  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
  TemporalFilterData *tf_data = &cpi->td.tf_data;

  // An internal ARF counts as the "second ARF" when its lookahead index
  // reaches TF_LOOKAHEAD_IDX_THR and the speed feature allows it.
  // This frame is ALWAYS a show existing frame.
  const int is_internal_arf = (update_type == INTNL_ARF_UPDATE);
  const int is_second_arf =
      is_internal_arf &&
      (filter_frame_lookahead_idx >= TF_LOOKAHEAD_IDX_THR) &&
      cpi->sf.hl_sf.second_alt_ref_filtering;

  // TODO(anyone): Currently, we enforce the filtering strength on internal
  // ARFs except the second ARF to be zero. We should investigate in which case
  // it is more beneficial to use non-zero strength filtering.
  if (is_internal_arf && !is_second_arf) return 0;

#if CONFIG_FRAME_PARALLEL_ENCODE
  // Only parallel level 0 frames go through temporal filtering.
  assert(gf_group->frame_parallel_level[group_idx] == 0);
#endif  // CONFIG_FRAME_PARALLEL_ENCODE

  // Initialize temporal filter context structure.
  init_tf_ctx(cpi, filter_frame_lookahead_idx, is_second_arf, update_type,
              is_forward_keyframe);

  // Set showable frame.
  if (is_forward_keyframe == 0 && update_type != KF_UPDATE) {
    cpi->common.showable_frame = tf_ctx->num_frames == 1 || is_second_arf ||
                                 (cpi->oxcf.algo_cfg.enable_overlay == 0);
  }

  // Allocate the working buffers, filter (multi-threaded when more than one
  // worker is available), then release the buffers.
  const int is_highbitdepth = tf_ctx->is_highbitdepth;
  tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth);
  if (mt_info->num_workers > 1) {
    av1_tf_do_filtering_mt(cpi);
  } else {
    tf_do_filtering(cpi);
  }
  tf_dealloc_data(tf_data, is_highbitdepth);

  // Nothing more to decide when the show-existing check is disabled, or when
  // there is neither an output flag to fill in nor a second ARF to judge.
  if (!tf_ctx->check_show_existing) return 1;
  if (show_existing_arf == NULL && !is_second_arf) return 1;

  // Mean and standard deviation of the per-block difference between the
  // source frame and the filtered frame.
  const YV12_BUFFER_CONFIG *const filtered_src =
      tf_ctx->frames[tf_ctx->filter_frame_idx];
  const FRAME_DIFF *const diff = &tf_data->diff;
  const int mb_rows = get_num_blocks(filtered_src->y_crop_height,
                                     block_size_high[TF_BLOCK_SIZE]);
  const int mb_cols = get_num_blocks(filtered_src->y_crop_width,
                                     block_size_wide[TF_BLOCK_SIZE]);
  const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
  const float mean = (float)diff->sum / num_mbs;
  const float std_dev = (float)sqrt((float)diff->sse / num_mbs - mean * mean);

  // The threshold is derived from the AC quantizer of the q picked for this
  // frame.
  // TODO(yunqing): This can be combined with TPL q calculation later.
  cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
  av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
  int top_index = 0;
  int bottom_index = 0;
  const int q = av1_rc_pick_q_and_bounds(
      cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
      group_idx, &bottom_index, &top_index);
  const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params->bit_depth);
  const float threshold = 0.7f * ac_q * ac_q;

  const int is_close_to_source = (mean < threshold && std_dev < mean * 1.2);

  if (is_second_arf) {
    // Use source frame if the filtered frame becomes very different.
    return is_close_to_source;
  }

  *show_existing_arf = is_close_to_source;
  cpi->common.showable_frame |= *show_existing_arf;
  return 1;
}
1278 /*!\endcond */
1279