1 // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
2 // Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
3 //
4 // This source code is subject to the terms of the BSD 2 Clause License and
5 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 // was not distributed with this source code in the LICENSE file, you can
7 // obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 // Media Patent License 1.0 was not distributed with this source code in the
9 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
11 #![allow(non_camel_case_types)]
12
13 use crate::api::*;
14 use crate::cdef::*;
15 use crate::context::*;
16 use crate::cpu_features::CpuFeatureLevel;
17 use crate::deblock::*;
18 use crate::dist::*;
19 use crate::ec::{Writer, WriterCounter, OD_BITRES};
20 use crate::encode_block_with_modes;
21 use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE};
22 use crate::frame::Frame;
23 use crate::frame::*;
24 use crate::header::ReferenceMode;
25 use crate::lrf::*;
26 use crate::luma_ac;
27 use crate::mc::MotionVector;
28 use crate::me::*;
29 use crate::motion_compensate;
30 use crate::partition::RefType::*;
31 use crate::partition::*;
32 use crate::predict::{
33 AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode,
34 RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES,
35 };
36 use crate::rdo_tables::*;
37 use crate::tiling::*;
38 use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES};
39 use crate::util::{init_slice_repeat_mut, Aligned, CastFromPrimitive, Pixel};
40 use crate::write_tx_blocks;
41 use crate::write_tx_tree;
42 use crate::Tune;
43 use crate::{encode_block_post_cdef, encode_block_pre_cdef};
44
45 use crate::partition::PartitionType::*;
46 use arrayvec::*;
47 use itertools::izip;
48 use std::fmt;
49 use std::mem::MaybeUninit;
50
/// Selects how distortion and rate are measured during RDO.
///
/// Each variant pairs a distortion domain (pixel or transform) with a rate
/// source (exact entropy-coder rate or a rate derived from the tx-domain
/// distortion).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RDOType {
  /// Pixel-domain distortion and exact ec rate.
  PixelDistRealRate,
  /// Tx-domain distortion and exact ec rate.
  TxDistRealRate,
  /// Tx-domain distortion and txdist-based rate.
  TxDistEstRate,
}
57
58 impl RDOType {
59 #[inline]
needs_tx_dist(self) -> bool60 pub fn needs_tx_dist(self) -> bool {
61 match self {
62 // Pixel-domain distortion and exact ec rate
63 RDOType::PixelDistRealRate => false,
64 // Tx-domain distortion and exact ec rate
65 RDOType::TxDistRealRate => true,
66 // Tx-domain distortion and txdist-based rate
67 RDOType::TxDistEstRate => true,
68 }
69 }
70 #[inline]
needs_coeff_rate(self) -> bool71 pub fn needs_coeff_rate(self) -> bool {
72 match self {
73 RDOType::PixelDistRealRate => true,
74 RDOType::TxDistRealRate => true,
75 RDOType::TxDistEstRate => false,
76 }
77 }
78 }
79
/// RD cost, partition type and per-partition parameters of one partitioning
/// candidate.
#[derive(Clone)]
pub struct PartitionGroupParameters {
  /// RD cost associated with this partitioning.
  pub rd_cost: f64,
  /// Partition type the parameters belong to.
  pub part_type: PartitionType,
  /// Parameters of the individual partitions; holds at most four entries.
  // NOTE(review): presumably one entry per sub-block of `part_type` — confirm
  // against the code that fills this in.
  pub part_modes: ArrayVec<PartitionParameters, 4>,
}
86
/// Coding parameters selected for a single block, together with the RD cost
/// they were selected at.
#[derive(Clone, Debug)]
pub struct PartitionParameters {
  /// RD cost of this parameter set; the `Default` value is `f64::MAX`.
  pub rd_cost: f64,
  /// Block offset of the block within its tile.
  pub bo: TileBlockOffset,
  /// Size of the block.
  pub bsize: BlockSize,
  /// Luma prediction mode.
  pub pred_mode_luma: PredictionMode,
  /// Chroma prediction mode.
  pub pred_mode_chroma: PredictionMode,
  /// Chroma-from-luma parameters.
  pub pred_cfl_params: CFLParams,
  /// Angle deltas that accompany the prediction modes.
  pub angle_delta: AngleDelta,
  /// Reference frames; the `Default` value is `[INTRA_FRAME, NONE_FRAME]`.
  pub ref_frames: [RefType; 2],
  /// Motion vectors, one per entry of `ref_frames`.
  pub mvs: [MotionVector; 2],
  /// Whether the block was evaluated with skip set.
  pub skip: bool,
  /// Coefficient flag as reported by `encode_block_post_cdef`.
  pub has_coeff: bool,
  /// Transform size.
  pub tx_size: TxSize,
  /// Transform type.
  pub tx_type: TxType,
  /// Segmentation index.
  pub sidx: u8,
}
104
105 impl Default for PartitionParameters {
default() -> Self106 fn default() -> Self {
107 PartitionParameters {
108 rd_cost: std::f64::MAX,
109 bo: TileBlockOffset::default(),
110 bsize: BlockSize::BLOCK_INVALID,
111 pred_mode_luma: PredictionMode::default(),
112 pred_mode_chroma: PredictionMode::default(),
113 pred_cfl_params: CFLParams::default(),
114 angle_delta: AngleDelta::default(),
115 ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME],
116 mvs: [MotionVector::default(); 2],
117 skip: false,
118 has_coeff: true,
119 tx_size: TxSize::TX_4X4,
120 tx_type: TxType::DCT_DCT,
121 sidx: 0,
122 }
123 }
124 }
125
estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64126 pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
127 let bs_index = ts as usize;
128 let q_bin_idx = (qindex as usize) / RDO_QUANT_DIV;
129 let bin_idx_down =
130 ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64);
131 let bin_idx_up = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64);
132 let x0 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64;
133 let x1 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64;
134 let y0 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64;
135 let y1 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64;
136 let slope = ((y1 - y0) << 8) / (x1 - x0);
137 (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64
138 }
139
140 // The microbenchmarks perform better with inlining turned off
141 #[inline(never)]
cdef_dist_wxh_8x8<T: Pixel>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, bit_depth: usize, ) -> RawDistortion142 fn cdef_dist_wxh_8x8<T: Pixel>(
143 src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, bit_depth: usize,
144 ) -> RawDistortion {
145 debug_assert!(src1.plane_cfg.xdec == 0);
146 debug_assert!(src1.plane_cfg.ydec == 0);
147 debug_assert!(src2.plane_cfg.xdec == 0);
148 debug_assert!(src2.plane_cfg.ydec == 0);
149
150 // Sum into columns to improve auto-vectorization
151 let mut sum_s_cols: [u16; 8] = [0; 8];
152 let mut sum_d_cols: [u16; 8] = [0; 8];
153 let mut sum_s2_cols: [u32; 8] = [0; 8];
154 let mut sum_d2_cols: [u32; 8] = [0; 8];
155 let mut sum_sd_cols: [u32; 8] = [0; 8];
156
157 // Check upfront that 8 rows are available.
158 let _row1 = &src1[7];
159 let _row2 = &src2[7];
160
161 for j in 0..8 {
162 let row1 = &src1[j][0..8];
163 let row2 = &src2[j][0..8];
164 for (sum_s, sum_d, sum_s2, sum_d2, sum_sd, s, d) in izip!(
165 &mut sum_s_cols,
166 &mut sum_d_cols,
167 &mut sum_s2_cols,
168 &mut sum_d2_cols,
169 &mut sum_sd_cols,
170 row1,
171 row2
172 ) {
173 // Don't convert directly to u32 to allow better vectorization
174 let s: u16 = u16::cast_from(*s);
175 let d: u16 = u16::cast_from(*d);
176 *sum_s += s;
177 *sum_d += d;
178
179 // Convert to u32 to avoid overflows when multiplying
180 let s: u32 = s as u32;
181 let d: u32 = d as u32;
182
183 *sum_s2 += s * s;
184 *sum_d2 += d * d;
185 *sum_sd += s * d;
186 }
187 }
188
189 // Sum together the sum of columns
190 let sum_s: i64 =
191 sum_s_cols.iter().map(|&a| u32::cast_from(a)).sum::<u32>() as i64;
192 let sum_d: i64 =
193 sum_d_cols.iter().map(|&a| u32::cast_from(a)).sum::<u32>() as i64;
194 let sum_s2: i64 = sum_s2_cols.iter().sum::<u32>() as i64;
195 let sum_d2: i64 = sum_d2_cols.iter().sum::<u32>() as i64;
196 let sum_sd: i64 = sum_sd_cols.iter().sum::<u32>() as i64;
197
198 // Use sums to calculate distortion
199 let svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
200 let dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
201 let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as u64;
202 RawDistortion::new(ssim_boost(svar, dvar, bit_depth).mul_u64(sse))
203 }
204
205 #[inline(always)]
ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale206 pub fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale {
207 let coeff_shift = bit_depth - 8;
208
209 //The two constants were tuned for CDEF, but can probably be better tuned for use in general RDO
210 DistortionScale::new(
211 (4033_f64 / 16_384_f64)
212 * (svar + dvar + (16_384 << (2 * coeff_shift))) as f64
213 / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64),
214 )
215 }
216
217 #[allow(unused)]
cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, compute_bias: F, ) -> Distortion218 pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
219 src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
220 bit_depth: usize, compute_bias: F,
221 ) -> Distortion {
222 assert!(w & 0x7 == 0);
223 assert!(h & 0x7 == 0);
224 debug_assert!(src1.plane_cfg.xdec == 0);
225 debug_assert!(src1.plane_cfg.ydec == 0);
226 debug_assert!(src2.plane_cfg.xdec == 0);
227 debug_assert!(src2.plane_cfg.ydec == 0);
228
229 let mut sum = Distortion::zero();
230 for j in 0isize..h as isize / 8 {
231 for i in 0isize..w as isize / 8 {
232 let area = Area::StartingAt { x: i * 8, y: j * 8 };
233 let value = cdef_dist_wxh_8x8(
234 &src1.subregion(area),
235 &src2.subregion(area),
236 bit_depth,
237 );
238
239 // cdef is always called on non-subsampled planes, so BLOCK_8X8 is
240 // correct here.
241 sum += value * compute_bias(area, BlockSize::BLOCK_8X8);
242 }
243 }
244 sum
245 }
246
247 /// Sum of Squared Error for a wxh block
248 /// Currently limited to w and h of valid blocks
sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, ) -> Distortion249 pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
250 src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
251 compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
252 ) -> Distortion {
253 // See get_weighted_sse in src/dist.rs.
254 // Provide a scale to get_weighted_sse for each square region of this size.
255 const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1;
256
257 // To bias the distortion correctly, compute it in blocks up to the size
258 // importance block size in a non-subsampled plane.
259 let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec;
260 let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec;
261
262 let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h);
263
264 let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE;
265 let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE;
266
267 // TODO: Copying biases into a buffer is slow. It would be best if biases were
268 // passed directly. To do this, we would need different versions of the
269 // weighted sse function for decimated/subsampled data. Also requires
270 // eliminating use of unbiased sse.
271 // It should also be noted that the current copy code does not auto-vectorize.
272
273 // Copy biases into a buffer.
274 let mut buf_storage = Aligned::new(
275 [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE],
276 );
277 let buf_stride = n_imp_blocks_w.next_power_of_two();
278 let buf = init_slice_repeat_mut(
279 &mut buf_storage.data[..buf_stride * n_imp_blocks_h],
280 0,
281 );
282
283 for block_y in 0..n_imp_blocks_h {
284 for block_x in 0..n_imp_blocks_w {
285 let block = Area::StartingAt {
286 x: (block_x * CHUNK_SIZE) as isize,
287 y: (block_y * CHUNK_SIZE) as isize,
288 };
289 buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0;
290 }
291 }
292
293 Distortion(get_weighted_sse(
294 src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
295 ))
296 }
297
clip_visible_bsize( frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize, ) -> (usize, usize)298 pub fn clip_visible_bsize(
299 frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize,
300 ) -> (usize, usize) {
301 let blk_w = bsize.width();
302 let blk_h = bsize.height();
303
304 let visible_w: usize = if x + blk_w <= frame_w {
305 blk_w
306 } else if x >= frame_w {
307 0
308 } else {
309 frame_w - x
310 };
311
312 let visible_h: usize = if y + blk_h <= frame_h {
313 blk_h
314 } else if y >= frame_h {
315 0
316 } else {
317 frame_h - y
318 };
319
320 (visible_w, visible_h)
321 }
322
323 // Compute the pixel-domain distortion for an encode
compute_distortion<T: Pixel>( fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool, ) -> ScaledDistortion324 fn compute_distortion<T: Pixel>(
325 fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
326 is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
327 ) -> ScaledDistortion {
328 let area = Area::BlockStartingAt { bo: tile_bo.0 };
329 let input_region = ts.input_tile.planes[0].subregion(area);
330 let rec_region = ts.rec.planes[0].subregion(area);
331
332 // clip a block to have visible pixles only
333 let frame_bo = ts.to_frame_block_offset(tile_bo);
334 let (visible_w, visible_h) = clip_visible_bsize(
335 fi.width,
336 fi.height,
337 bsize,
338 frame_bo.0.x << MI_SIZE_LOG2,
339 frame_bo.0.y << MI_SIZE_LOG2,
340 );
341
342 if visible_w == 0 || visible_h == 0 {
343 return ScaledDistortion::zero();
344 }
345
346 let mut distortion = match fi.config.tune {
347 Tune::Psychovisual if bsize.width() >= 8 && bsize.height() >= 8 => {
348 let w8 = visible_w & !7;
349 let h8 = visible_h & !7;
350 let mut sum = Distortion(0);
351 if w8 > 0 && h8 > 0 {
352 sum += cdef_dist_wxh(
353 &input_region,
354 &rec_region,
355 w8,
356 h8,
357 fi.sequence.bit_depth,
358 |bias_area, bsize| {
359 distortion_scale(
360 fi,
361 input_region.subregion(bias_area).frame_block_offset(),
362 bsize,
363 )
364 },
365 );
366 }
367 if visible_w > w8 && h8 > 0 {
368 let area = Area::StartingAt { x: w8 as isize, y: 0 };
369 sum += sse_wxh(
370 &input_region.subregion(area),
371 &rec_region.subregion(area),
372 visible_w - w8,
373 h8,
374 |bias_area, bsize| {
375 spatiotemporal_scale(
376 fi,
377 input_region
378 .subregion(area)
379 .subregion(bias_area)
380 .frame_block_offset(),
381 bsize,
382 )
383 },
384 fi.sequence.bit_depth,
385 fi.cpu_feature_level,
386 );
387 }
388 if visible_h > h8 && visible_w > 0 {
389 let area = Area::StartingAt { x: 0, y: h8 as isize };
390 sum += sse_wxh(
391 &input_region.subregion(area),
392 &rec_region.subregion(area),
393 visible_w,
394 visible_h - h8,
395 |bias_area, bsize| {
396 spatiotemporal_scale(
397 fi,
398 input_region
399 .subregion(area)
400 .subregion(bias_area)
401 .frame_block_offset(),
402 bsize,
403 )
404 },
405 fi.sequence.bit_depth,
406 fi.cpu_feature_level,
407 );
408 }
409 sum
410 }
411 Tune::Psnr | Tune::Psychovisual => sse_wxh(
412 &input_region,
413 &rec_region,
414 visible_w,
415 visible_h,
416 |bias_area, bsize| {
417 distortion_scale(
418 fi,
419 input_region.subregion(bias_area).frame_block_offset(),
420 bsize,
421 )
422 },
423 fi.sequence.bit_depth,
424 fi.cpu_feature_level,
425 ),
426 } * fi.dist_scale[0];
427
428 if is_chroma_block
429 && !luma_only
430 && fi.sequence.chroma_sampling != ChromaSampling::Cs400
431 {
432 let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
433 let chroma_w = if bsize.width() >= 8 || xdec == 0 {
434 (visible_w + xdec) >> xdec
435 } else {
436 (4 + visible_w + xdec) >> xdec
437 };
438 let chroma_h = if bsize.height() >= 8 || ydec == 0 {
439 (visible_h + ydec) >> ydec
440 } else {
441 (4 + visible_h + ydec) >> ydec
442 };
443
444 for p in 1..3 {
445 let input_region = ts.input_tile.planes[p].subregion(area);
446 let rec_region = ts.rec.planes[p].subregion(area);
447 distortion += sse_wxh(
448 &input_region,
449 &rec_region,
450 chroma_w,
451 chroma_h,
452 |bias_area, bsize| {
453 distortion_scale(
454 fi,
455 input_region.subregion(bias_area).frame_block_offset(),
456 bsize,
457 )
458 },
459 fi.sequence.bit_depth,
460 fi.cpu_feature_level,
461 ) * fi.dist_scale[p];
462 }
463 }
464 distortion
465 }
466
467 // Compute the transform-domain distortion for an encode
compute_tx_distortion<T: Pixel>( fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, skip: bool, luma_only: bool, ) -> ScaledDistortion468 fn compute_tx_distortion<T: Pixel>(
469 fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
470 is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
471 skip: bool, luma_only: bool,
472 ) -> ScaledDistortion {
473 assert!(fi.config.tune == Tune::Psnr);
474 let area = Area::BlockStartingAt { bo: tile_bo.0 };
475 let input_region = ts.input_tile.planes[0].subregion(area);
476 let rec_region = ts.rec.planes[0].subregion(area);
477
478 let (visible_w, visible_h) = if !skip {
479 (bsize.width(), bsize.height())
480 } else {
481 let frame_bo = ts.to_frame_block_offset(tile_bo);
482 clip_visible_bsize(
483 fi.width,
484 fi.height,
485 bsize,
486 frame_bo.0.x << MI_SIZE_LOG2,
487 frame_bo.0.y << MI_SIZE_LOG2,
488 )
489 };
490
491 if visible_w == 0 || visible_h == 0 {
492 return ScaledDistortion::zero();
493 }
494
495 let mut distortion = if skip {
496 sse_wxh(
497 &input_region,
498 &rec_region,
499 visible_w,
500 visible_h,
501 |bias_area, bsize| {
502 distortion_scale(
503 fi,
504 input_region.subregion(bias_area).frame_block_offset(),
505 bsize,
506 )
507 },
508 fi.sequence.bit_depth,
509 fi.cpu_feature_level,
510 ) * fi.dist_scale[0]
511 } else {
512 tx_dist
513 };
514
515 if is_chroma_block
516 && !luma_only
517 && skip
518 && fi.sequence.chroma_sampling != ChromaSampling::Cs400
519 {
520 let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
521 let chroma_w = if bsize.width() >= 8 || xdec == 0 {
522 (visible_w + xdec) >> xdec
523 } else {
524 (4 + visible_w + xdec) >> xdec
525 };
526 let chroma_h = if bsize.height() >= 8 || ydec == 0 {
527 (visible_h + ydec) >> ydec
528 } else {
529 (4 + visible_h + ydec) >> ydec
530 };
531
532 for p in 1..3 {
533 let input_region = ts.input_tile.planes[p].subregion(area);
534 let rec_region = ts.rec.planes[p].subregion(area);
535 distortion += sse_wxh(
536 &input_region,
537 &rec_region,
538 chroma_w,
539 chroma_h,
540 |bias_area, bsize| {
541 distortion_scale(
542 fi,
543 input_region.subregion(bias_area).frame_block_offset(),
544 bsize,
545 )
546 },
547 fi.sequence.bit_depth,
548 fi.cpu_feature_level,
549 ) * fi.dist_scale[p];
550 }
551 }
552 distortion
553 }
554
555 /// Compute a scaling factor to multiply the distortion of a block by,
556 /// this factor is determined using temporal RDO.
distortion_scale<T: Pixel>( fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale557 pub fn distortion_scale<T: Pixel>(
558 fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
559 ) -> DistortionScale {
560 if !fi.config.temporal_rdo() {
561 return DistortionScale::default();
562 }
563 // EncoderConfig::temporal_rdo() should always return false in situations
564 // where distortion is computed on > 8x8 blocks, so we should never hit this
565 // assert.
566 assert!(bsize <= BlockSize::BLOCK_8X8);
567
568 let x = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
569 let y = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
570
571 fi.distortion_scales[y * fi.w_in_imp_b + x]
572 }
573
spatiotemporal_scale<T: Pixel>( fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale574 pub fn spatiotemporal_scale<T: Pixel>(
575 fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
576 ) -> DistortionScale {
577 if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual {
578 return DistortionScale::default();
579 }
580
581 let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
582 let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
583 let x1 = (x0 + bsize.width_imp_b()).min(fi.w_in_imp_b);
584 let y1 = (y0 + bsize.height_imp_b()).min(fi.h_in_imp_b);
585 let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT;
586
587 let mut sum = 0;
588 for y in y0..y1 {
589 sum += fi.distortion_scales[y * fi.w_in_imp_b..][x0..x1]
590 .iter()
591 .zip(fi.activity_scales[y * fi.w_in_imp_b..][x0..x1].iter())
592 .take(MAX_SB_IN_IMP_B)
593 .map(|(d, a)| d.0 as u64 * a.0 as u64)
594 .sum::<u64>();
595 }
596 DistortionScale(((sum + (den >> 1)) / den) as u32)
597 }
598
distortion_scale_for( propagate_cost: f64, intra_cost: f64, ) -> DistortionScale599 pub fn distortion_scale_for(
600 propagate_cost: f64, intra_cost: f64,
601 ) -> DistortionScale {
602 // The mbtree paper \cite{mbtree} uses the following formula:
603 //
604 // QP_delta = -strength * log2(1 + (propagate_cost / intra_cost))
605 //
606 // Since this is H.264, this corresponds to the following quantizer:
607 //
608 // Q' = Q * 2^(QP_delta/6)
609 //
610 // Since lambda is proportial to Q^2, this means we want to minimize:
611 //
612 // D + lambda' * R
613 // = D + 2^(QP_delta / 3) * lambda * R
614 //
615 // If we want to keep lambda fixed, we can instead scale distortion and
616 // minimize:
617 //
618 // D * scale + lambda * R
619 //
620 // where:
621 //
622 // scale = 2^(QP_delta / -3)
623 // = (1 + (propagate_cost / intra_cost))^(strength / 3)
624 //
625 // The original paper empirically chooses strength = 2.0, but strength = 1.0
626 // seems to work best in rav1e currently, this may have something to do with
627 // the fact that they use 16x16 blocks whereas our "importance blocks" are
628 // 8x8, but everything should be scale invariant here so that's weird.
629 //
630 // @article{mbtree,
631 // title={A novel macroblock-tree algorithm for high-performance
632 // optimization of dependent video coding in H.264/AVC},
633 // author={Garrett-Glaser, Jason},
634 // journal={Tech. Rep.},
635 // year={2009},
636 // url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf}
637 // }
638
639 if intra_cost == 0. {
640 return DistortionScale::default(); // no scaling
641 }
642
643 let strength = 1.0; // empirical, see comment above
644 let frac = (intra_cost + propagate_cost) / intra_cost;
645 DistortionScale::new(frac.powf(strength / 3.0))
646 }
647
/// Fixed point arithmetic version of distortion scale
///
/// The wrapped `u32` holds the scale in fixed point, with
/// `DistortionScale::SHIFT` bits past the radix point.
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct DistortionScale(pub u32);
652
/// Distortion value to which no `DistortionScale` has been applied yet.
/// Multiplying it by a `DistortionScale` yields a `Distortion`.
#[repr(transparent)]
pub struct RawDistortion(u64);
655
/// Distortion value that can be accumulated with `+=`. Multiplying it by an
/// `f64` factor yields a `ScaledDistortion`.
#[repr(transparent)]
pub struct Distortion(pub u64);
658
/// Distortion value in the form `compute_rd_cost` combines with the
/// lambda-weighted rate.
#[repr(transparent)]
pub struct ScaledDistortion(u64);
661
662 impl DistortionScale {
663 /// Bits past the radix point
664 const SHIFT: u32 = 12;
665 /// Number of bits used. Determines the max value.
666 /// 24 bits is likely excessive.
667 const BITS: u32 = 24;
668
669 #[inline]
new(scale: f64) -> Self670 pub fn new(scale: f64) -> Self {
671 Self(
672 (scale * (1 << Self::SHIFT) as f64 + 0.5)
673 .min(((1 << Self::BITS as u64) - 1) as f64) as u32,
674 )
675 }
676
677 /// Multiply, round and shift
678 /// Internal implementation, so don't use multiply trait.
679 #[inline]
mul_u64(self, dist: u64) -> u64680 pub fn mul_u64(self, dist: u64) -> u64 {
681 (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT
682 }
683 }
684
685 // Default value for DistortionScale is a fixed point 1
686 impl Default for DistortionScale {
687 #[inline]
default() -> Self688 fn default() -> Self {
689 Self(1 << Self::SHIFT)
690 }
691 }
692
693 impl fmt::Debug for DistortionScale {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result694 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
695 write!(f, "{}", f64::from(*self))
696 }
697 }
698
699 impl From<DistortionScale> for f64 {
700 #[inline]
from(scale: DistortionScale) -> Self701 fn from(scale: DistortionScale) -> Self {
702 scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64
703 }
704 }
705
706 impl RawDistortion {
707 #[inline]
new(dist: u64) -> Self708 pub const fn new(dist: u64) -> Self {
709 Self(dist)
710 }
711 }
712
713 impl std::ops::Mul<DistortionScale> for RawDistortion {
714 type Output = Distortion;
715 #[inline]
mul(self, rhs: DistortionScale) -> Distortion716 fn mul(self, rhs: DistortionScale) -> Distortion {
717 Distortion(rhs.mul_u64(self.0))
718 }
719 }
720
721 impl Distortion {
722 #[inline]
zero() -> Self723 pub const fn zero() -> Self {
724 Self(0)
725 }
726 }
727
728 impl std::ops::Mul<f64> for Distortion {
729 type Output = ScaledDistortion;
730 #[inline]
mul(self, rhs: f64) -> ScaledDistortion731 fn mul(self, rhs: f64) -> ScaledDistortion {
732 ScaledDistortion((self.0 as f64 * rhs) as u64)
733 }
734 }
735
736 impl std::ops::AddAssign for Distortion {
737 #[inline]
add_assign(&mut self, other: Self)738 fn add_assign(&mut self, other: Self) {
739 self.0 += other.0;
740 }
741 }
742
743 impl ScaledDistortion {
744 #[inline]
zero() -> Self745 pub const fn zero() -> Self {
746 Self(0)
747 }
748 }
749
750 impl std::ops::AddAssign for ScaledDistortion {
751 #[inline]
add_assign(&mut self, other: Self)752 fn add_assign(&mut self, other: Self) {
753 self.0 += other.0;
754 }
755 }
756
compute_rd_cost<T: Pixel>( fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion, ) -> f64757 pub fn compute_rd_cost<T: Pixel>(
758 fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion,
759 ) -> f64 {
760 let rate_in_bits = (rate as f64) / ((1 << OD_BITRES) as f64);
761 distortion.0 as f64 + fi.lambda * rate_in_bits
762 }
763
rdo_tx_size_type<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], skip: bool, ) -> (TxSize, TxType)764 pub fn rdo_tx_size_type<T: Pixel>(
765 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
766 cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
767 luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
768 skip: bool,
769 ) -> (TxSize, TxType) {
770 let is_inter = !luma_mode.is_intra();
771 let mut tx_size = max_txsize_rect_lookup[bsize as usize];
772
773 if fi.enable_inter_txfm_split && is_inter && !skip {
774 tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size
775 }
776
777 let mut best_tx_type = TxType::DCT_DCT;
778 let mut best_tx_size = tx_size;
779 let mut best_rd = std::f64::MAX;
780
781 let do_rdo_tx_size =
782 fi.tx_mode_select && fi.config.speed_settings.rdo_tx_decision && !is_inter;
783 let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 };
784 let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None;
785
786 for _ in 0..=rdo_tx_depth {
787 let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set);
788
789 let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY
790 && fi.config.speed_settings.rdo_tx_decision
791 && !is_inter
792 && !skip;
793
794 if !do_rdo_tx_size && !do_rdo_tx_type {
795 return (best_tx_size, best_tx_type);
796 };
797
798 let tx_types =
799 if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };
800
801 // Luma plane transform type decision
802 let (tx_type, rd_cost) = rdo_tx_type_decision(
803 fi,
804 ts,
805 cw,
806 &mut cw_checkpoint,
807 luma_mode,
808 ref_frames,
809 mvs,
810 bsize,
811 tile_bo,
812 tx_size,
813 tx_set,
814 tx_types,
815 );
816
817 if rd_cost < best_rd {
818 best_tx_size = tx_size;
819 best_tx_type = tx_type;
820 best_rd = rd_cost;
821 }
822
823 debug_assert!(tx_size.width_log2() <= bsize.width_log2());
824 debug_assert!(tx_size.height_log2() <= bsize.height_log2());
825 debug_assert!(
826 tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT
827 );
828
829 let next_tx_size = sub_tx_size_map[tx_size as usize];
830
831 if next_tx_size == tx_size {
832 break;
833 } else {
834 tx_size = next_tx_size;
835 };
836 }
837
838 (best_tx_size, best_tx_type)
839 }
840
841 #[inline]
dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool842 fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
843 let diff_row = mv.row as i32 - ref_mv.row as i32;
844 let diff_col = mv.col as i32 - ref_mv.col as i32;
845 diff_row >= MV_LOW
846 && diff_row <= MV_UPP
847 && diff_col >= MV_LOW
848 && diff_col <= MV_UPP
849 }
850
/// Evaluates `luma_mode` together with every chroma mode in
/// `mode_set_chroma` and stores the cheapest combination in `best` whenever
/// it beats `best.rd_cost`.
///
/// For non-intra luma modes, returns without evaluating anything when a
/// `NEW*` mode's motion vector differs from its reference motion vector
/// (first entry of `mv_stack`, or zero when the stack is empty) by more than
/// `dmv_in_range` allows.
///
/// Non-intra modes are evaluated with skip first; the non-skip evaluation is
/// omitted when the skip pass selected a candidate with zero distortion.
/// Intra modes are only evaluated without skip.
///
/// `cw` is rolled back to `cw_checkpoint` after every candidate.
#[inline]
fn luma_chroma_mode_rdo<T: Pixel>(
  luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
  tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
  cw: &mut ContextWriter, rdo_type: RDOType,
  cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters,
  mvs: [MotionVector; 2], ref_frames: [RefType; 2],
  mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool,
  mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>,
  angle_delta: AngleDelta,
) {
  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;

  let is_chroma_block =
    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);

  if !luma_mode_is_intra {
    // Reference motion vectors come from the top of the candidate stack.
    let ref_mvs = if mv_stack.is_empty() {
      [MotionVector::default(); 2]
    } else {
      [mv_stack[0].this_mv, mv_stack[0].comp_mv]
    };

    // Modes that code a new first motion vector: its delta must be in range.
    if (luma_mode == PredictionMode::NEWMV
      || luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEW_NEARESTMV)
      && !dmv_in_range(mvs[0], ref_mvs[0])
    {
      return;
    }

    // Modes that code a new second motion vector: same check for `mvs[1]`.
    if (luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEAREST_NEWMV)
      && !dmv_in_range(mvs[1], ref_mvs[1])
    {
      return;
    }
  }

  // Find the best chroma prediction mode for the current luma prediction mode
  //
  // Returns whether the candidate that last updated `best` during this call
  // had zero distortion (false when `best` was never updated).
  let mut chroma_rdo = |skip: bool| -> bool {
    use crate::segmentation::select_segment;

    let mut zero_distortion = false;

    for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
      cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);

      // The transform size/type is chosen once per segment and shared by all
      // chroma modes.
      let (tx_size, tx_type) = rdo_tx_size_type(
        fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
      );
      for &chroma_mode in mode_set_chroma.iter() {
        // Count bits with a fresh writer; `tell` is its starting position.
        let wr = &mut WriterCounter::new();
        let tell = wr.tell_frac();

        if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() {
          cw.write_partition(
            wr,
            tile_bo,
            PartitionType::PARTITION_NONE,
            bsize,
          );
        }

        // TODO(yushin): luma and chroma would have different decision based on chroma format
        let need_recon_pixel =
          luma_mode_is_intra && tx_size.block_size() != bsize;

        encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
        let (has_coeff, tx_dist) = encode_block_post_cdef(
          fi,
          ts,
          cw,
          wr,
          luma_mode,
          chroma_mode,
          angle_delta,
          ref_frames,
          mvs,
          bsize,
          tile_bo,
          skip,
          CFLParams::default(),
          tx_size,
          tx_type,
          mode_context,
          mv_stack,
          rdo_type,
          need_recon_pixel,
          false,
        );

        // Rate is what the trial encode wrote since `tell`.
        let rate = wr.tell_frac() - tell;
        // Tx-domain distortion is only used when the frame enables it and
        // the reconstruction was not needed.
        let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
          compute_tx_distortion(
            fi,
            ts,
            bsize,
            is_chroma_block,
            tile_bo,
            tx_dist,
            skip,
            false,
          )
        } else {
          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
        };
        let is_zero_dist = distortion.0 == 0;
        let rd = compute_rd_cost(fi, rate, distortion);
        if rd < best.rd_cost {
          best.rd_cost = rd;
          best.pred_mode_luma = luma_mode;
          best.pred_mode_chroma = chroma_mode;
          best.angle_delta = angle_delta;
          best.ref_frames = ref_frames;
          best.mvs = mvs;
          best.skip = skip;
          best.has_coeff = has_coeff;
          best.tx_size = tx_size;
          best.tx_type = tx_type;
          best.sidx = sidx;
          zero_distortion = is_zero_dist;
        }

        // Undo the trial encode before trying the next candidate.
        cw.rollback(cw_checkpoint);
      }
    }

    zero_distortion
  };

  // Don't skip when using intra modes
  let zero_distortion =
    if !luma_mode_is_intra { chroma_rdo(true) } else { false };
  // early skip
  if !zero_distortion {
    chroma_rdo(false);
  }
}
991
992 // RDO-based mode decision
rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, ) -> PartitionParameters993 pub fn rdo_mode_decision<T: Pixel>(
994 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
995 cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
996 inter_cfg: &InterConfig,
997 ) -> PartitionParameters {
998 let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
999 let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1000
1001 let rdo_type = if fi.use_tx_domain_rate {
1002 RDOType::TxDistEstRate
1003 } else if fi.use_tx_domain_distortion {
1004 RDOType::TxDistRealRate
1005 } else {
1006 RDOType::PixelDistRealRate
1007 };
1008
1009 let mut best = if fi.frame_type.has_inter() {
1010 assert!(fi.frame_type != FrameType::KEY);
1011
1012 inter_frame_rdo_mode_decision(
1013 fi,
1014 ts,
1015 cw,
1016 bsize,
1017 tile_bo,
1018 inter_cfg,
1019 &cw_checkpoint,
1020 rdo_type,
1021 )
1022 } else {
1023 PartitionParameters::default()
1024 };
1025
1026 let is_chroma_block =
1027 has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1028
1029 if !best.skip {
1030 best = intra_frame_rdo_mode_decision(
1031 fi,
1032 ts,
1033 cw,
1034 bsize,
1035 tile_bo,
1036 &cw_checkpoint,
1037 rdo_type,
1038 best,
1039 is_chroma_block,
1040 );
1041 }
1042
1043 if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() {
1044 cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx);
1045
1046 let chroma_mode = PredictionMode::UV_CFL_PRED;
1047 let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1048 let mut wr = WriterCounter::new();
1049 let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
1050
1051 write_tx_blocks(
1052 fi,
1053 ts,
1054 cw,
1055 &mut wr,
1056 best.pred_mode_luma,
1057 best.pred_mode_luma,
1058 angle_delta,
1059 tile_bo,
1060 bsize,
1061 best.tx_size,
1062 best.tx_type,
1063 false,
1064 CFLParams::default(),
1065 true,
1066 rdo_type,
1067 true,
1068 );
1069 cw.rollback(&cw_checkpoint);
1070 if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
1071 if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
1072 let mut wr = WriterCounter::new();
1073 let tell = wr.tell_frac();
1074
1075 encode_block_pre_cdef(
1076 &fi.sequence,
1077 ts,
1078 cw,
1079 &mut wr,
1080 bsize,
1081 tile_bo,
1082 best.skip,
1083 );
1084 let (has_coeff, _) = encode_block_post_cdef(
1085 fi,
1086 ts,
1087 cw,
1088 &mut wr,
1089 best.pred_mode_luma,
1090 chroma_mode,
1091 angle_delta,
1092 best.ref_frames,
1093 best.mvs,
1094 bsize,
1095 tile_bo,
1096 best.skip,
1097 cfl,
1098 best.tx_size,
1099 best.tx_type,
1100 0,
1101 &[],
1102 rdo_type,
1103 true, // For CFL, luma should be always reconstructed.
1104 false,
1105 );
1106
1107 let rate = wr.tell_frac() - tell;
1108
1109 // For CFL, tx-domain distortion is not an option.
1110 let distortion =
1111 compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
1112 let rd = compute_rd_cost(fi, rate, distortion);
1113 if rd < best.rd_cost {
1114 best.rd_cost = rd;
1115 best.pred_mode_chroma = chroma_mode;
1116 best.angle_delta = angle_delta;
1117 best.has_coeff = has_coeff;
1118 best.pred_cfl_params = cfl;
1119 }
1120
1121 cw.rollback(&cw_checkpoint);
1122 }
1123 }
1124 }
1125
1126 cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma);
1127 cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames);
1128 cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs);
1129
1130 assert!(best.rd_cost >= 0_f64);
1131
1132 PartitionParameters {
1133 bo: tile_bo,
1134 bsize,
1135 pred_mode_luma: best.pred_mode_luma,
1136 pred_mode_chroma: best.pred_mode_chroma,
1137 pred_cfl_params: best.pred_cfl_params,
1138 angle_delta: best.angle_delta,
1139 ref_frames: best.ref_frames,
1140 mvs: best.mvs,
1141 rd_cost: best.rd_cost,
1142 skip: best.skip,
1143 has_coeff: best.has_coeff,
1144 tx_size: best.tx_size,
1145 tx_type: best.tx_type,
1146 sidx: best.sidx,
1147 }
1148 }
1149
inter_frame_rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, ) -> PartitionParameters1150 fn inter_frame_rdo_mode_decision<T: Pixel>(
1151 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1152 cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1153 inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
1154 rdo_type: RDOType,
1155 ) -> PartitionParameters {
1156 let mut best = PartitionParameters::default();
1157
1158 // we can never have more than 7 reference frame sets
1159 let mut ref_frames_set = ArrayVec::<_, 7>::new();
1160 // again, max of 7 ref slots
1161 let mut ref_slot_set = ArrayVec::<_, 7>::new();
1162 // our implementation never returns more than 3 at the moment
1163 let mut mvs_from_me = ArrayVec::<_, 3>::new();
1164 let mut fwdref = None;
1165 let mut bwdref = None;
1166
1167 for i in inter_cfg.allowed_ref_frames().iter().copied() {
1168 // Don't search LAST3 since it's used only for probs
1169 if i == LAST3_FRAME {
1170 continue;
1171 }
1172
1173 if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) {
1174 if fwdref == None && i.is_fwd_ref() {
1175 fwdref = Some(ref_frames_set.len());
1176 }
1177 if bwdref == None && i.is_bwd_ref() {
1178 bwdref = Some(ref_frames_set.len());
1179 }
1180 ref_frames_set.push([i, NONE_FRAME]);
1181 let slot_idx = fi.ref_frames[i.to_index()];
1182 ref_slot_set.push(slot_idx);
1183 }
1184 }
1185 assert!(!ref_frames_set.is_empty());
1186
1187 let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new();
1188 let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new();
1189 let mut satds = ArrayVec::<u32, 20>::new();
1190 let mut mv_stacks = ArrayVec::<_, 20>::new();
1191 let mut mode_contexts = ArrayVec::<_, 7>::new();
1192
1193 for (i, &ref_frames) in ref_frames_set.iter().enumerate() {
1194 let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1195 mode_contexts.push(cw.find_mvrefs(
1196 tile_bo,
1197 ref_frames,
1198 &mut mv_stack,
1199 bsize,
1200 fi,
1201 false,
1202 ));
1203
1204 let mut pmv = [MotionVector::default(); 2];
1205 if !mv_stack.is_empty() {
1206 pmv[0] = mv_stack[0].this_mv;
1207 }
1208 if mv_stack.len() > 1 {
1209 pmv[1] = mv_stack[1].this_mv;
1210 }
1211
1212 let res = motion_estimation(fi, ts, bsize, tile_bo, ref_frames[0], pmv);
1213 let b_me = res.0;
1214
1215 mvs_from_me.push([b_me, MotionVector::default()]);
1216
1217 for &x in RAV1E_INTER_MODES_MINIMAL {
1218 inter_mode_set.push((x, i));
1219 }
1220 if !mv_stack.is_empty() {
1221 inter_mode_set.push((PredictionMode::NEAR0MV, i));
1222 }
1223 if mv_stack.len() >= 2 {
1224 inter_mode_set.push((PredictionMode::GLOBALMV, i));
1225 }
1226 let include_near_mvs = fi.config.speed_settings.include_near_mvs;
1227 if include_near_mvs {
1228 if mv_stack.len() >= 3 {
1229 inter_mode_set.push((PredictionMode::NEAR1MV, i));
1230 }
1231 if mv_stack.len() >= 4 {
1232 inter_mode_set.push((PredictionMode::NEAR2MV, i));
1233 }
1234 }
1235 let same_row_col = |x: &CandidateMV| {
1236 x.this_mv.row == mvs_from_me[i][0].row
1237 && x.this_mv.col == mvs_from_me[i][0].col
1238 };
1239 if !mv_stack
1240 .iter()
1241 .take(if include_near_mvs { 4 } else { 2 })
1242 .any(same_row_col)
1243 && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0)
1244 {
1245 inter_mode_set.push((PredictionMode::NEWMV, i));
1246 }
1247
1248 mv_stacks.push(mv_stack);
1249 }
1250
1251 let sz = bsize.width_mi().min(bsize.height_mi());
1252
1253 // To use non single reference modes, block width and height must be greater than 4.
1254 if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 {
1255 // Adding compound candidate
1256 if let Some(r0) = fwdref {
1257 if let Some(r1) = bwdref {
1258 let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]];
1259 ref_frames_set.push(ref_frames);
1260 let mv0 = mvs_from_me[r0][0];
1261 let mv1 = mvs_from_me[r1][0];
1262 mvs_from_me.push([mv0, mv1]);
1263 let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1264 mode_contexts.push(cw.find_mvrefs(
1265 tile_bo,
1266 ref_frames,
1267 &mut mv_stack,
1268 bsize,
1269 fi,
1270 true,
1271 ));
1272 for &x in RAV1E_INTER_COMPOUND_MODES {
1273 // exclude any NEAR mode based on speed setting
1274 if fi.config.speed_settings.include_near_mvs || !x.has_nearmv() {
1275 let mv_stack_idx = ref_frames_set.len() - 1;
1276 // exclude NEAR modes if the mv_stack is too short
1277 if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) {
1278 inter_mode_set.push((x, mv_stack_idx));
1279 }
1280 }
1281 }
1282 mv_stacks.push(mv_stack);
1283 }
1284 }
1285 }
1286
1287 let num_modes_rdo = if fi.config.speed_settings.prediction_modes
1288 >= PredictionModesSetting::ComplexAll
1289 {
1290 inter_mode_set.len()
1291 } else {
1292 9 // This number is determined by AWCY test
1293 };
1294
1295 inter_mode_set.iter().for_each(|&(luma_mode, i)| {
1296 let mvs = match luma_mode {
1297 PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i],
1298 PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => {
1299 if !mv_stacks[i].is_empty() {
1300 [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv]
1301 } else {
1302 [MotionVector::default(); 2]
1303 }
1304 }
1305 PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => {
1306 if mv_stacks[i].len() > 1 {
1307 [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv]
1308 } else {
1309 [MotionVector::default(); 2]
1310 }
1311 }
1312 PredictionMode::NEAR1MV
1313 | PredictionMode::NEAR2MV
1314 | PredictionMode::NEAR_NEAR1MV
1315 | PredictionMode::NEAR_NEAR2MV => [
1316 mv_stacks[i][luma_mode.ref_mv_idx()].this_mv,
1317 mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv,
1318 ],
1319 PredictionMode::NEAREST_NEWMV => {
1320 [mv_stacks[i][0].this_mv, mvs_from_me[i][1]]
1321 }
1322 PredictionMode::NEW_NEARESTMV => {
1323 [mvs_from_me[i][0], mv_stacks[i][0].comp_mv]
1324 }
1325 PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => {
1326 [MotionVector::default(); 2]
1327 }
1328 _ => {
1329 unimplemented!();
1330 }
1331 };
1332 mvs_set.push(mvs);
1333
1334 // Calculate SATD for each mode
1335 if num_modes_rdo != inter_mode_set.len() {
1336 let tile_rect = ts.tile_rect();
1337 let rec = &mut ts.rec.planes[0];
1338 let po = tile_bo.plane_offset(rec.plane_cfg);
1339 let mut rec_region =
1340 rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1341
1342 luma_mode.predict_inter(
1343 fi,
1344 tile_rect,
1345 0,
1346 po,
1347 &mut rec_region,
1348 bsize.width(),
1349 bsize.height(),
1350 ref_frames_set[i],
1351 mvs,
1352 &mut ts.inter_compound_buffers,
1353 );
1354
1355 let plane_org = ts.input_tile.planes[0]
1356 .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1357 let plane_ref = rec_region.as_const();
1358
1359 let satd = get_satd(
1360 &plane_org,
1361 &plane_ref,
1362 bsize,
1363 fi.sequence.bit_depth,
1364 fi.cpu_feature_level,
1365 );
1366 satds.push(satd);
1367 } else {
1368 satds.push(0);
1369 }
1370 });
1371
1372 let mut sorted =
1373 izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<_, 20>>();
1374 if num_modes_rdo != sorted.len() {
1375 sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd);
1376 }
1377
1378 sorted.iter().take(num_modes_rdo).for_each(
1379 |&((luma_mode, i), mvs, _satd)| {
1380 let mode_set_chroma = ArrayVec::from([luma_mode]);
1381
1382 luma_chroma_mode_rdo(
1383 luma_mode,
1384 fi,
1385 bsize,
1386 tile_bo,
1387 ts,
1388 cw,
1389 rdo_type,
1390 cw_checkpoint,
1391 &mut best,
1392 mvs,
1393 ref_frames_set[i],
1394 &mode_set_chroma,
1395 false,
1396 mode_contexts[i],
1397 &mv_stacks[i],
1398 AngleDelta::default(),
1399 );
1400 },
1401 );
1402
1403 best
1404 }
1405
intra_frame_rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, mut best: PartitionParameters, is_chroma_block: bool, ) -> PartitionParameters1406 fn intra_frame_rdo_mode_decision<T: Pixel>(
1407 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1408 cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1409 cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
1410 mut best: PartitionParameters, is_chroma_block: bool,
1411 ) -> PartitionParameters {
1412 let num_modes_rdo: usize;
1413 let mut modes = ArrayVec::<_, INTRA_MODES>::new();
1414
1415 // Reduce number of prediction modes at higher speed levels
1416 num_modes_rdo = if (fi.frame_type == FrameType::KEY
1417 && fi.config.speed_settings.prediction_modes
1418 >= PredictionModesSetting::ComplexKeyframes)
1419 || (fi.frame_type.has_inter()
1420 && fi.config.speed_settings.prediction_modes
1421 >= PredictionModesSetting::ComplexAll)
1422 {
1423 7
1424 } else {
1425 3
1426 };
1427
1428 let intra_mode_set = RAV1E_INTRA_MODES;
1429
1430 // Find mode with lowest rate cost
1431 {
1432 use crate::ec::cdf_to_pdf;
1433
1434 let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() {
1435 cw.get_cdf_intra_mode(bsize)
1436 } else {
1437 cw.get_cdf_intra_mode_kf(tile_bo)
1438 });
1439
1440 modes.try_extend_from_slice(intra_mode_set).unwrap();
1441 modes.sort_by_key(|&a| !probs_all[a as usize]);
1442 }
1443
1444 // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening
1445 // may be improved by emulating prediction for each tx block.
1446 {
1447 let satds = {
1448 // FIXME: If tx partition is used, this whole sads block should be fixed
1449 let tx_size = bsize.tx_size();
1450 let edge_buf = {
1451 let rec = &ts.rec.planes[0].as_const();
1452 let po = tile_bo.plane_offset(rec.plane_cfg);
1453 // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
1454 get_intra_edges(
1455 rec,
1456 tile_bo,
1457 0,
1458 0,
1459 bsize,
1460 po,
1461 tx_size,
1462 fi.sequence.bit_depth,
1463 None,
1464 fi.sequence.enable_intra_edge_filter,
1465 IntraParam::None,
1466 )
1467 };
1468
1469 let ief_params = if fi.sequence.enable_intra_edge_filter {
1470 let above_block_info = ts.above_block_info(tile_bo, 0, 0);
1471 let left_block_info = ts.left_block_info(tile_bo, 0, 0);
1472 Some(IntraEdgeFilterParameters::new(
1473 0,
1474 above_block_info,
1475 left_block_info,
1476 ))
1477 } else {
1478 None
1479 };
1480
1481 let mut satds_all = [0; INTRA_MODES];
1482 for &luma_mode in modes.iter().skip(num_modes_rdo / 2) {
1483 let tile_rect = ts.tile_rect();
1484 let rec = &mut ts.rec.planes[0];
1485 let mut rec_region =
1486 rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1487 // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
1488 luma_mode.predict_intra(
1489 tile_rect,
1490 &mut rec_region,
1491 tx_size,
1492 fi.sequence.bit_depth,
1493 &[0i16; 2],
1494 IntraParam::None,
1495 if luma_mode.is_directional() { ief_params } else { None },
1496 &edge_buf,
1497 fi.cpu_feature_level,
1498 );
1499
1500 let plane_org = ts.input_tile.planes[0]
1501 .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1502 let plane_ref = rec_region.as_const();
1503
1504 satds_all[luma_mode as usize] = get_satd(
1505 &plane_org,
1506 &plane_ref,
1507 tx_size.block_size(),
1508 fi.sequence.bit_depth,
1509 fi.cpu_feature_level,
1510 );
1511 }
1512 satds_all
1513 };
1514
1515 modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]);
1516 }
1517
1518 debug_assert!(num_modes_rdo >= 1);
1519
1520 modes.iter().take(num_modes_rdo).for_each(|&luma_mode| {
1521 let mvs = [MotionVector::default(); 2];
1522 let ref_frames = [INTRA_FRAME, NONE_FRAME];
1523 let mut mode_set_chroma = ArrayVec::<_, 2>::new();
1524 mode_set_chroma.push(luma_mode);
1525 if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
1526 mode_set_chroma.push(PredictionMode::DC_PRED);
1527 }
1528 luma_chroma_mode_rdo(
1529 luma_mode,
1530 fi,
1531 bsize,
1532 tile_bo,
1533 ts,
1534 cw,
1535 rdo_type,
1536 cw_checkpoint,
1537 &mut best,
1538 mvs,
1539 ref_frames,
1540 &mode_set_chroma,
1541 true,
1542 0,
1543 &ArrayVec::<CandidateMV, 9>::new(),
1544 AngleDelta::default(),
1545 );
1546 });
1547
1548 if fi.config.speed_settings.fine_directional_intra
1549 && bsize >= BlockSize::BLOCK_8X8
1550 {
1551 // Find the best angle delta for the current best prediction mode
1552 let luma_deltas = best.pred_mode_luma.angle_delta_count();
1553 let chroma_deltas = best.pred_mode_chroma.angle_delta_count();
1554
1555 let mvs = [MotionVector::default(); 2];
1556 let ref_frames = [INTRA_FRAME, NONE_FRAME];
1557 let mode_set_chroma = [best.pred_mode_chroma];
1558 let mv_stack = ArrayVec::<_, 9>::new();
1559 let mut best_angle_delta = best.angle_delta;
1560 let mut angle_delta_rdo = |y, uv| -> AngleDelta {
1561 if best.angle_delta.y != y || best.angle_delta.uv != uv {
1562 luma_chroma_mode_rdo(
1563 best.pred_mode_luma,
1564 fi,
1565 bsize,
1566 tile_bo,
1567 ts,
1568 cw,
1569 rdo_type,
1570 cw_checkpoint,
1571 &mut best,
1572 mvs,
1573 ref_frames,
1574 &mode_set_chroma,
1575 true,
1576 0,
1577 &mv_stack,
1578 AngleDelta { y, uv },
1579 );
1580 }
1581 best.angle_delta
1582 };
1583
1584 for i in 0..luma_deltas {
1585 let angle_delta_y =
1586 if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 };
1587 best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv);
1588 }
1589 for j in 0..chroma_deltas {
1590 let angle_delta_uv =
1591 if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 };
1592 best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv);
1593 }
1594 }
1595
1596 best
1597 }
1598
rdo_cfl_alpha<T: Pixel>( ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, luma_tx_size: TxSize, fi: &FrameInvariants<T>, ) -> Option<CFLParams>1599 pub fn rdo_cfl_alpha<T: Pixel>(
1600 ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
1601 luma_tx_size: TxSize, fi: &FrameInvariants<T>,
1602 ) -> Option<CFLParams> {
1603 let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1604 let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec);
1605 debug_assert!(bsize.subsampled_size(xdec, ydec) == uv_tx_size.block_size());
1606
1607 let frame_bo = ts.to_frame_block_offset(tile_bo);
1608 let (visible_tx_w, visible_tx_h) = clip_visible_bsize(
1609 (fi.width + xdec) >> xdec,
1610 (fi.height + ydec) >> ydec,
1611 uv_tx_size.block_size(),
1612 (frame_bo.0.x << MI_SIZE_LOG2) >> xdec,
1613 (frame_bo.0.y << MI_SIZE_LOG2) >> ydec,
1614 );
1615
1616 if visible_tx_w == 0 || visible_tx_h == 0 {
1617 return None;
1618 };
1619 let mut ac: Aligned<[i16; 32 * 32]> = Aligned::uninitialized();
1620 luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
1621 let best_alpha: ArrayVec<i16, 2> = (1..3)
1622 .map(|p| {
1623 let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg;
1624 let tile_rect = ts.tile_rect().decimated(xdec, ydec);
1625 let rec = &mut ts.rec.planes[p];
1626 let input = &ts.input_tile.planes[p];
1627 let po = tile_bo.plane_offset(rec.plane_cfg);
1628 let edge_buf = get_intra_edges(
1629 &rec.as_const(),
1630 tile_bo,
1631 0,
1632 0,
1633 bsize,
1634 po,
1635 uv_tx_size,
1636 fi.sequence.bit_depth,
1637 Some(PredictionMode::UV_CFL_PRED),
1638 fi.sequence.enable_intra_edge_filter,
1639 IntraParam::None,
1640 );
1641 let mut alpha_cost = |alpha: i16| -> u64 {
1642 let mut rec_region =
1643 rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1644 PredictionMode::UV_CFL_PRED.predict_intra(
1645 tile_rect,
1646 &mut rec_region,
1647 uv_tx_size,
1648 fi.sequence.bit_depth,
1649 &ac.data,
1650 IntraParam::Alpha(alpha),
1651 None,
1652 &edge_buf,
1653 fi.cpu_feature_level,
1654 );
1655 sse_wxh(
1656 &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
1657 &rec_region.as_const(),
1658 visible_tx_w,
1659 visible_tx_h,
1660 |_, _| DistortionScale::default(), // We're not doing RDO here.
1661 fi.sequence.bit_depth,
1662 fi.cpu_feature_level,
1663 )
1664 .0
1665 };
1666 let mut best = (alpha_cost(0), 0);
1667 let mut count = 2;
1668 for alpha in 1i16..=16i16 {
1669 let cost = (alpha_cost(alpha), alpha_cost(-alpha));
1670 if cost.0 < best.0 {
1671 best = (cost.0, alpha);
1672 count += 2;
1673 }
1674 if cost.1 < best.0 {
1675 best = (cost.1, -alpha);
1676 count += 2;
1677 }
1678 if count < alpha {
1679 break;
1680 }
1681 }
1682 best.1
1683 })
1684 .collect();
1685
1686 if best_alpha[0] == 0 && best_alpha[1] == 0 {
1687 None
1688 } else {
1689 Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1]))
1690 }
1691 }
1692
1693 /// RDO-based transform type decision
1694 /// If cw_checkpoint is None, a checkpoint for cw's (ContextWriter) current
1695 /// state is created and stored for later use.
rdo_tx_type_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>, mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet, tx_types: &[TxType], ) -> (TxType, f64)1696 pub fn rdo_tx_type_decision<T: Pixel>(
1697 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1698 cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
1699 mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
1700 bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet,
1701 tx_types: &[TxType],
1702 ) -> (TxType, f64) {
1703 let mut best_type = TxType::DCT_DCT;
1704 let mut best_rd = std::f64::MAX;
1705
1706 let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1707 let is_chroma_block =
1708 has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1709
1710 let is_inter = !mode.is_intra();
1711
1712 if cw_checkpoint.is_none() {
1713 // Only run the first call
1714 // Prevents creating multiple checkpoints for own version of cw
1715 *cw_checkpoint =
1716 Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling));
1717 }
1718
1719 let rdo_type = if fi.use_tx_domain_distortion {
1720 RDOType::TxDistRealRate
1721 } else {
1722 RDOType::PixelDistRealRate
1723 };
1724 let need_recon_pixel = tx_size.block_size() != bsize && !is_inter;
1725
1726 for &tx_type in tx_types {
1727 // Skip unsupported transform types
1728 if av1_tx_used[tx_set as usize][tx_type as usize] == 0 {
1729 continue;
1730 }
1731
1732 if is_inter {
1733 motion_compensate(
1734 fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
1735 );
1736 }
1737
1738 let mut wr = WriterCounter::new();
1739 let tell = wr.tell_frac();
1740 let (_, tx_dist) = if is_inter {
1741 write_tx_tree(
1742 fi,
1743 ts,
1744 cw,
1745 &mut wr,
1746 mode,
1747 0,
1748 tile_bo,
1749 bsize,
1750 tx_size,
1751 tx_type,
1752 false,
1753 true,
1754 rdo_type,
1755 need_recon_pixel,
1756 )
1757 } else {
1758 write_tx_blocks(
1759 fi,
1760 ts,
1761 cw,
1762 &mut wr,
1763 mode,
1764 mode,
1765 AngleDelta::default(),
1766 tile_bo,
1767 bsize,
1768 tx_size,
1769 tx_type,
1770 false,
1771 CFLParams::default(), // Unused.
1772 true,
1773 rdo_type,
1774 need_recon_pixel,
1775 )
1776 };
1777
1778 let rate = wr.tell_frac() - tell;
1779 let distortion = if fi.use_tx_domain_distortion {
1780 compute_tx_distortion(
1781 fi,
1782 ts,
1783 bsize,
1784 is_chroma_block,
1785 tile_bo,
1786 tx_dist,
1787 false,
1788 true,
1789 )
1790 } else {
1791 compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
1792 };
1793 let rd = compute_rd_cost(fi, rate, distortion);
1794 if rd < best_rd {
1795 best_rd = rd;
1796 best_type = tx_type;
1797 }
1798
1799 cw.rollback(cw_checkpoint.as_ref().unwrap());
1800 }
1801
1802 assert!(best_rd >= 0_f64);
1803
1804 (best_type, best_rd)
1805 }
1806
get_sub_partitions( four_partitions: &[TileBlockOffset; 4], partition: PartitionType, ) -> ArrayVec<TileBlockOffset, 4>1807 pub fn get_sub_partitions(
1808 four_partitions: &[TileBlockOffset; 4], partition: PartitionType,
1809 ) -> ArrayVec<TileBlockOffset, 4> {
1810 let mut partition_offsets = ArrayVec::<TileBlockOffset, 4>::new();
1811
1812 partition_offsets.push(four_partitions[0]);
1813
1814 if partition == PARTITION_NONE {
1815 return partition_offsets;
1816 }
1817 if partition == PARTITION_VERT || partition == PARTITION_SPLIT {
1818 partition_offsets.push(four_partitions[1]);
1819 };
1820 if partition == PARTITION_HORZ || partition == PARTITION_SPLIT {
1821 partition_offsets.push(four_partitions[2]);
1822 };
1823 if partition == PARTITION_SPLIT {
1824 partition_offsets.push(four_partitions[3]);
1825 };
1826
1827 partition_offsets
1828 }
1829
1830 #[inline(always)]
rdo_partition_none<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>, ) -> f641831 fn rdo_partition_none<T: Pixel>(
1832 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1833 cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1834 inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>,
1835 ) -> f64 {
1836 debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1837
1838 let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
1839 let cost = mode.rd_cost;
1840
1841 child_modes.push(mode);
1842
1843 cost
1844 }
1845
1846 // VERTICAL, HORIZONTAL or simple SPLIT
1847 #[inline(always)]
rdo_partition_simple<T: Pixel, W: Writer>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, partition: PartitionType, rdo_type: RDOType, best_rd: f64, child_modes: &mut ArrayVec<PartitionParameters, 4>, ) -> Option<f64>1848 fn rdo_partition_simple<T: Pixel, W: Writer>(
1849 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1850 cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1851 bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
1852 partition: PartitionType, rdo_type: RDOType, best_rd: f64,
1853 child_modes: &mut ArrayVec<PartitionParameters, 4>,
1854 ) -> Option<f64> {
1855 debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1856 let subsize = bsize.subsize(partition);
1857
1858 debug_assert!(subsize != BlockSize::BLOCK_INVALID);
1859
1860 let cost = if bsize >= BlockSize::BLOCK_8X8 {
1861 let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1862 let tell = w.tell_frac();
1863 cw.write_partition(w, tile_bo, partition, bsize);
1864 compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero())
1865 } else {
1866 0.0
1867 };
1868
1869 let hbsw = subsize.width_mi(); // Half the block size width in blocks
1870 let hbsh = subsize.height_mi(); // Half the block size height in blocks
1871 let four_partitions = [
1872 tile_bo,
1873 TileBlockOffset(BlockOffset {
1874 x: tile_bo.0.x + hbsw as usize,
1875 y: tile_bo.0.y,
1876 }),
1877 TileBlockOffset(BlockOffset {
1878 x: tile_bo.0.x,
1879 y: tile_bo.0.y + hbsh as usize,
1880 }),
1881 TileBlockOffset(BlockOffset {
1882 x: tile_bo.0.x + hbsw as usize,
1883 y: tile_bo.0.y + hbsh as usize,
1884 }),
1885 ];
1886
1887 let partitions = get_sub_partitions(&four_partitions, partition);
1888
1889 let mut rd_cost_sum = 0.0;
1890
1891 for offset in partitions {
1892 let hbs = subsize.width_mi() >> 1;
1893 let has_cols = offset.0.x + hbs < ts.mi_width;
1894 let has_rows = offset.0.y + hbs < ts.mi_height;
1895
1896 if has_cols && has_rows {
1897 let mode_decision =
1898 rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
1899
1900 rd_cost_sum += mode_decision.rd_cost;
1901
1902 if fi.enable_early_exit && rd_cost_sum > best_rd {
1903 return None;
1904 }
1905 if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() {
1906 let w: &mut W =
1907 if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1908 cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
1909 }
1910 encode_block_with_modes(
1911 fi,
1912 ts,
1913 cw,
1914 w_pre_cdef,
1915 w_post_cdef,
1916 subsize,
1917 offset,
1918 &mode_decision,
1919 rdo_type,
1920 false,
1921 );
1922 child_modes.push(mode_decision);
1923 } else {
1924 //rd_cost_sum += std::f64::MAX;
1925 return None;
1926 }
1927 }
1928
1929 Some(cost + rd_cost_sum)
1930 }
1931
1932 // RDO-based single level partitioning decision
rdo_partition_decision<T: Pixel, W: Writer>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, cached_block: &PartitionGroupParameters, partition_types: &[PartitionType], rdo_type: RDOType, inter_cfg: &InterConfig, ) -> PartitionGroupParameters1933 pub fn rdo_partition_decision<T: Pixel, W: Writer>(
1934 fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1935 cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1936 bsize: BlockSize, tile_bo: TileBlockOffset,
1937 cached_block: &PartitionGroupParameters, partition_types: &[PartitionType],
1938 rdo_type: RDOType, inter_cfg: &InterConfig,
1939 ) -> PartitionGroupParameters {
1940 let mut best_partition = cached_block.part_type;
1941 let mut best_rd = cached_block.rd_cost;
1942 let mut best_pred_modes = cached_block.part_modes.clone();
1943
1944 let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1945 let w_pre_checkpoint = w_pre_cdef.checkpoint();
1946 let w_post_checkpoint = w_post_cdef.checkpoint();
1947
1948 for &partition in partition_types {
1949 // Do not re-encode results we already have
1950 if partition == cached_block.part_type {
1951 continue;
1952 }
1953
1954 let mut child_modes = ArrayVec::<_, 4>::new();
1955
1956 let cost = match partition {
1957 PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
1958 Some(rdo_partition_none(
1959 fi,
1960 ts,
1961 cw,
1962 bsize,
1963 tile_bo,
1964 inter_cfg,
1965 &mut child_modes,
1966 ))
1967 }
1968 PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
1969 rdo_partition_simple(
1970 fi,
1971 ts,
1972 cw,
1973 w_pre_cdef,
1974 w_post_cdef,
1975 bsize,
1976 tile_bo,
1977 inter_cfg,
1978 partition,
1979 rdo_type,
1980 best_rd,
1981 &mut child_modes,
1982 )
1983 }
1984 _ => {
1985 unreachable!();
1986 }
1987 };
1988
1989 if let Some(rd) = cost {
1990 if rd < best_rd {
1991 best_rd = rd;
1992 best_partition = partition;
1993 best_pred_modes = child_modes.clone();
1994 }
1995 }
1996 cw.rollback(&cw_checkpoint);
1997 w_pre_cdef.rollback(&w_pre_checkpoint);
1998 w_post_cdef.rollback(&w_post_checkpoint);
1999 }
2000
2001 assert!(best_rd >= 0_f64);
2002
2003 PartitionGroupParameters {
2004 rd_cost: best_rd,
2005 part_type: best_partition,
2006 part_modes: best_pred_modes,
2007 }
2008 }
2009
rdo_loop_plane_error<T: Pixel>( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize, ) -> ScaledDistortion2010 fn rdo_loop_plane_error<T: Pixel>(
2011 base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
2012 sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
2013 blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
2014 ) -> ScaledDistortion {
2015 let sb_w_blocks =
2016 if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w;
2017 let sb_h_blocks =
2018 if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h;
2019 // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma
2020 // accumulating in-frame and unpadded
2021 let mut err = Distortion::zero();
2022 for by in 0..sb_h_blocks {
2023 for bx in 0..sb_w_blocks {
2024 let loop_bo = offset_sbo.block_offset(bx << 1, by << 1);
2025 if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() {
2026 let src_plane = &src.planes[pli];
2027 let test_plane = &test.planes[pli];
2028 let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg;
2029 debug_assert_eq!(xdec, test_plane.cfg.xdec);
2030 debug_assert_eq!(ydec, test_plane.cfg.ydec);
2031
2032 // Unfortunately, our distortion biases are only available via
2033 // Frame-absolute addressing, so we need a block offset
2034 // relative to the full frame origin (not the tile or analysis
2035 // area)
2036 let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1);
2037 let bias = distortion_scale(
2038 fi,
2039 ts.to_frame_block_offset(frame_bo),
2040 BlockSize::BLOCK_8X8,
2041 );
2042
2043 let src_region =
2044 src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 });
2045 let test_region =
2046 test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 });
2047
2048 err += if pli == 0 {
2049 // For loop filters, We intentionally use cdef_dist even with
2050 // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
2051 // significant negative impact on other metrics and visual quality.
2052 cdef_dist_wxh_8x8(&src_region, &test_region, fi.sequence.bit_depth)
2053 * bias
2054 } else {
2055 sse_wxh(
2056 &src_region,
2057 &test_region,
2058 8 >> xdec,
2059 8 >> ydec,
2060 |_, _| bias,
2061 fi.sequence.bit_depth,
2062 fi.cpu_feature_level,
2063 )
2064 };
2065 }
2066 }
2067 }
2068 err * fi.dist_scale[pli]
2069 }
2070
/// Iteratively searches CDEF indices and loop restoration (self-guided)
/// filters for one analysis area and records the winners.
///
/// Passed in a superblock offset representing the upper left corner of
/// the LRU area we're optimizing.  This area covers the largest LRU in
/// any of the present planes, but may consist of a number of
/// superblocks and full, smaller LRUs in the other planes.
///
/// * `base_sbo` - upper left superblock of the analysis area.
/// * `fi` - frame invariants; at least one of `enable_cdef` and
///   `enable_restoration` must be set (asserted).
/// * `ts` - tile state; the chosen filter is written into its restoration
///   units and `ts.integral_buffer` is used as scratch space.
/// * `cw` - context writer; the chosen CDEF index is stored into the block
///   map `cw.bc.blocks`, and `cw.fc` is used for rate counting.
/// * `w` - writer handed to `count_lrf_switchable` for rate estimation.
/// * `deblock_p` - when true, deblocking levels are estimated and applied
///   to the scratch copy of the reconstruction before the search.
///
/// All filtering is done on scratch copies; `ts.rec` itself is only read.
pub fn rdo_loop_decision<T: Pixel, W: Writer>(
  base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
  ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
  deblock_p: bool,
) {
  // Monochrome (4:0:0) content only has the luma plane.
  let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 {
    1
  } else {
    MAX_PLANES
  };
  assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration);
  // Determine area of optimization: Which plane has the largest LRUs?
  // How many LRUs for each?
  let mut sb_w = 1; // how many superblocks wide the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut sb_h = 1; // how many superblocks tall the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing
  let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing
  // First pass: the analysis area is as large as the largest LRU.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    if sb_w < (1 << sb_h_shift) {
      sb_w = 1 << sb_h_shift;
    }
    if sb_h < (1 << sb_v_shift) {
      sb_h = 1 << sb_v_shift;
    }
  }
  // Second pass: how many LRUs of each plane fit into that area.  Note that
  // this uses the untrimmed sb_w/sb_h computed above.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    lru_w[pli] = sb_w / (1 << sb_h_shift);
    lru_h[pli] = sb_h / (1 << sb_v_shift);
  }

  // The superblock width/height determinations may be calling for us
  // to compute over superblocks that do not actually exist in the
  // frame (off the right or lower edge).  Trim sb width/height down
  // to actual superblocks.  Note that these last superblocks on the
  // right/bottom may themselves still span the edge of the frame, but
  // they do hold at least some visible pixels.
  sb_w = sb_w.min(ts.sb_width - base_sbo.0.x);
  sb_h = sb_h.min(ts.sb_height - base_sbo.0.y);

  // We have need to know the Y visible pixel limits as well (the
  // sb_w/sb_h figures above can be used to determine how many
  // allocated pixels, possibly beyond the visible frame, exist).
  let crop_w =
    fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT);
  let crop_h =
    fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT);

  // Based on `RestorationState::new`
  const MAX_SB_SHIFT: usize = 4;
  const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT;
  const MAX_LRU_SIZE: usize = MAX_SB_SIZE;

  // Static allocation relies on the "minimal LRU area for all N planes" invariant.
  // -1 marks "no CDEF index chosen yet" for a superblock.
  let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE];
  let mut best_lrf =
    [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // due to imprecision in the reconstruction parameter solver, we
  // need to make sure we don't fall into a limit cycle.  Track our
  // best cost at LRF so that we can break if we get a solution that doesn't
  // improve at the reconstruction stage.
  // A negative cost marks "no cost recorded yet".
  let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // Loop filter RDO is an iterative process and we need temporary
  // scratch data to hold the results of deblocking, cdef, and the
  // loop reconstruction filter so that each can be partially updated
  // without recomputing the entire stack.  Construct
  // largest-LRU-sized frames for each, accounting for padding
  // required by deblocking, cdef and [optionally] LR.
  // The width/height are rounded up to a multiple of 8 pixels.
  let mut rec_subset = ts
    .rec
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .scratch_copy();

  // sub-setted region of the TileBlocks for our working frame area.
  // Note that the size of this subset is what signals CDEF as to the
  // actual coded size.
  let mut tileblocks_subset = cw.bc.blocks.subregion_mut(
    base_sbo.block_offset(0, 0).0.x,
    base_sbo.block_offset(0, 0).0.y,
    sb_w << SUPERBLOCK_TO_BLOCK_SHIFT,
    sb_h << SUPERBLOCK_TO_BLOCK_SHIFT,
  );

  // const, no need to copy, just need the subregion (but do zero the
  // origin to match the other copies/new backing frames).
  let src_subset = ts
    .input_tile
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .home();

  if deblock_p {
    // Find a good deblocking filter solution for the passed in area.
    // This is not RDO of deblocking itself, merely a solution to get
    // better results from CDEF/LRF RDO.
    let deblock_levels = deblock_filter_optimize(
      fi,
      &rec_subset.as_tile(),
      &src_subset,
      &tileblocks_subset.as_const(),
      crop_w,
      crop_h,
    );

    // Deblock the contents of our reconstruction copy.
    if deblock_levels[0] != 0 || deblock_levels[1] != 0 {
      // copy ts.deblock because we need to set some of our own values here
      let mut deblock_copy = *ts.deblock;
      deblock_copy.levels = deblock_levels;

      // finally, deblock the temp frame
      deblock_filter_frame(
        &deblock_copy,
        &mut rec_subset.as_tile_mut(),
        &tileblocks_subset.as_const(),
        crop_w,
        crop_h,
        fi.sequence.bit_depth,
        planes,
      );
    }
  }

  // Scratch output for CDEF: starts as a copy of the [optionally deblocked]
  // reconstruction.  Only present when CDEF is enabled.
  let mut cdef_work =
    if fi.sequence.enable_cdef { Some(rec_subset.clone()) } else { None };
  // Scratch output for loop restoration: fresh planes with the same
  // dimensions and decimation as the reconstruction copy.  Only present
  // when restoration is enabled.
  let mut lrf_work = if fi.sequence.enable_restoration {
    Some(Frame {
      planes: {
        let new_plane = |pli: usize| {
          let PlaneConfig { xdec, ydec, width, height, .. } =
            rec_subset.planes[pli].cfg;
          Plane::new(width, height, xdec, ydec, 0, 0)
        };
        [new_plane(0), new_plane(1), new_plane(2)]
      },
    })
  } else {
    None
  };

  // Precompute directional analysis for CDEF
  let cdef_data = {
    if cdef_work.is_some() {
      Some((
        &rec_subset,
        cdef_analyze_superblock_range(
          fi,
          &rec_subset,
          &tileblocks_subset.as_const(),
          sb_w,
          sb_h,
        ),
      ))
    } else {
      None
    }
  };

  // CDEF/LRF decision iteration
  // Start with a default of CDEF 0 and RestorationFilter::None
  // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it.
  // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it.
  // If LRF choice changed for any plane, repeat until no changes
  // Limit iterations and where we break based on speed setting (in the TODO list ;-)
  let mut cdef_change = true;
  let mut lrf_change = true;
  while cdef_change || lrf_change {
    // search for improved cdef indices, superblock by superblock, if cdef is enabled.
    if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) =
      (&cdef_data, &mut cdef_work.as_mut())
    {
      for sby in 0..sb_h {
        for sbx in 0..sb_w {
          let prev_best_index = best_index[sby * sb_w + sbx];
          // Negative cost/index mean "nothing evaluated yet" for this SB.
          let mut best_cost = -1.;
          let mut best_new_index = -1i8;

          /* offset of the superblock we're currently testing within the larger
          analysis area */
          let loop_sbo =
            TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });

          /* cdef index testing loop */
          for cdef_index in 0..(1 << fi.cdef_bits) {
            let mut err = ScaledDistortion::zero();
            let mut rate = 0;

            // Filter this superblock with the candidate index into the
            // CDEF scratch frame.
            cdef_filter_superblock(
              fi,
              &rec_subset,
              &mut cdef_ref.as_tile_mut(),
              &tileblocks_subset.as_const(),
              loop_sbo,
              cdef_index,
              &cdef_dirs[sby * sb_w + sbx],
            );
            // apply LRF if any
            for pli in 0..planes {
              // We need the cropped-to-visible-frame area of this SB
              let wh =
                if fi.sequence.use_128x128_superblock { 128 } else { 64 };
              let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg;
              let vis_width = (wh >> xdec).min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x
                    as usize,
              );
              let vis_height = (wh >> ydec).min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y
                    as usize,
              );
              // which LRU are we currently testing against?
              if let (Some((lru_x, lru_y)), Some(lrf_ref)) = {
                let rp = &ts.restoration.planes[pli];
                (
                  rp.restoration_unit_offset(base_sbo, loop_sbo, false),
                  &mut lrf_work,
                )
              } {
                // We have a valid LRU, apply LRF, compute error
                match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
                  RestorationFilter::None {} => {
                    // No restoration selected: measure the CDEF output.
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      cdef_ref,
                      &src_subset,
                      pli,
                    );
                    rate += if fi.sequence.enable_restoration {
                      cw.fc.count_lrf_switchable(
                        w,
                        &ts.restoration.as_const(),
                        best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                        pli,
                      )
                    } else {
                      0 // no relative cost differences to different
                        // CDEF params.  If cdef is on, it's a wash.
                    };
                  }
                  RestorationFilter::Sgrproj { set, xqd } => {
                    // only run on this single superblock
                    let loop_po =
                      loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg);
                    // todo: experiment with borrowing border pixels
                    // rather than edge-extending.  Right now this is
                    // hard-clipping to the superblock boundary.
                    setup_integral_image(
                      &mut ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      vis_width,
                      vis_height,
                      vis_width,
                      vis_height,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &cdef_ref.planes[pli].slice(loop_po),
                    );
                    // Apply the currently selected self-guided filter to
                    // the CDEF output, writing into the LRF scratch frame.
                    sgrproj_stripe_filter(
                      set,
                      xqd,
                      fi,
                      &ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                        x: loop_po.x,
                        y: loop_po.y,
                        width: vis_width,
                        height: vis_height,
                      }),
                    );
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      lrf_ref,
                      &src_subset,
                      pli,
                    );
                    rate += cw.fc.count_lrf_switchable(
                      w,
                      &ts.restoration.as_const(),
                      best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                      pli,
                    );
                  }
                  RestorationFilter::Wiener { .. } => unreachable!(), // coming soon
                }
              } else {
                // No actual LRU here, compute error directly from CDEF output.
                err += rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  1,
                  1,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  cdef_ref,
                  &src_subset,
                  pli,
                );
                // no relative cost differences to different
                // CDEF params.  If cdef is on, it's a wash.
                // rate += 0;
              }
            }

            // The first candidate always wins (best_cost starts negative);
            // later ones only if strictly cheaper.
            let cost = compute_rd_cost(fi, rate, err);
            if best_cost < 0. || cost < best_cost {
              best_cost = cost;
              best_new_index = cdef_index as i8;
            }
          }

          // Did we change any preexisting choices?
          if best_new_index != prev_best_index {
            cdef_change = true;
            best_index[sby * sb_w + sbx] = best_new_index;
            // Record the winner in the block map.
            tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8);
          }

          // Tile view covering the whole CDEF scratch frame.
          let mut cdef_ref_tm = TileMut::new(
            cdef_ref,
            TileRect {
              x: 0,
              y: 0,
              width: cdef_ref.planes[0].cfg.width,
              height: cdef_ref.planes[0].cfg.height,
            },
          );

          // Keep cdef output up to date; we need it for restoration
          // both below and above (padding)
          cdef_filter_superblock(
            fi,
            rec_copy,
            &mut cdef_ref_tm,
            &tileblocks_subset.as_const(),
            loop_sbo,
            best_index[sby * sb_w + sbx] as u8,
            &cdef_dirs[sby * sb_w + sbx],
          );
        }
      }
    }

    // Stop once a CDEF pass changed nothing.  cdef_change starts out true,
    // so the restoration search below runs at least once even when CDEF is
    // disabled.
    if !cdef_change {
      break;
    }
    cdef_change = false;
    lrf_change = false;

    // search for improved restoration filter parameters if restoration is enabled
    if let Some(lrf_ref) = &mut lrf_work.as_mut() {
      let lrf_input = if cdef_work.is_some() {
        // When CDEF is enabled, we pull from the CDEF output
        &cdef_work.as_ref().unwrap()
      } else {
        // When CDEF is disabled, we pull from the [optionally
        // deblocked] reconstruction
        &rec_subset
      };
      for pli in 0..planes {
        // Nominal size of LRU in pixels before clipping to visible frame
        let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size;
        // width, in sb, of an LRU in this plane
        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
        // height, in sb, of an LRU in this plane
        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
        let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg;
        for lru_y in 0..lru_h[pli] {
          // number of LRUs vertically
          for lru_x in 0..lru_w[pli] {
            // number of LRUs horizontally
            // Superblock offset of this LRU within the analysis area.
            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_x * lru_sb_w,
              y: lru_y * lru_sb_h,
            });
            if ts.restoration.has_restoration_unit(
              base_sbo + loop_sbo,
              pli,
              false,
            ) {
              let src_plane = &src_subset.planes[pli]; // uncompressed input for reference
              let lrf_in_plane = &lrf_input.planes[pli];
              let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg);
              // Start from the choice and cost kept from the previous
              // iteration (None / negative cost on the first one).
              let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli];
              let mut best_cost =
                best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli];

              // Check the no filter option
              {
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_input,
                  &src_subset,
                  pli,
                );
                // NOTE(review): the rate is counted for `best_new_lrf`,
                // i.e. the currently selected filter, which may no longer
                // be `None` after the first iteration -- confirm this is
                // intended rather than `RestorationFilter::None`.
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  best_new_lrf,
                  pli,
                );

                let cost = compute_rd_cost(fi, rate, err);
                // Was this choice actually an improvement?
                if best_cost < 0. || cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = RestorationFilter::None;
                }
              }

              // Look for a self guided filter
              // We need the cropped-to-visible-frame computation area of this LRU
              let vis_width = unit_size.min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize,
              );
              let vis_height = unit_size.min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize,
              );

              // todo: experiment with borrowing border pixels
              // rather than edge-extending.  Right now this is
              // hard-clipping to the superblock boundary.
              setup_integral_image(
                &mut ts.integral_buffer,
                SOLVE_IMAGE_STRIDE,
                vis_width,
                vis_height,
                vis_width,
                vis_height,
                &lrf_in_plane.slice(lrf_po),
                &lrf_in_plane.slice(lrf_po),
              );

              // Try every parameter set allowed at this speed setting:
              // solve for its weights, apply it, and measure the result.
              for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
              {
                let (xqd0, xqd1) = sgrproj_solve(
                  set,
                  fi,
                  &ts.integral_buffer,
                  &src_plane
                    .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }),
                  &lrf_in_plane.slice(lrf_po),
                  vis_width,
                  vis_height,
                );
                let current_lrf =
                  RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
                // Always matches: current_lrf was just built as Sgrproj.
                if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
                  sgrproj_stripe_filter(
                    set,
                    xqd,
                    fi,
                    &ts.integral_buffer,
                    SOLVE_IMAGE_STRIDE,
                    &lrf_in_plane.slice(lrf_po),
                    &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                      x: lrf_po.x,
                      y: lrf_po.y,
                      width: vis_width,
                      height: vis_height,
                    }),
                  );
                }
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_ref,
                  &src_subset,
                  pli,
                );
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  current_lrf,
                  pli,
                );
                let cost = compute_rd_cost(fi, rate, err);
                // Only a strict improvement replaces the current best.
                if cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = current_lrf;
                }
              }

              // Commit a changed choice, both locally and in the tile's
              // restoration unit, and request another iteration.
              if best_lrf[lru_y * lru_w[pli] + lru_x][pli]
                .notequal(best_new_lrf)
              {
                best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf;
                lrf_change = true;
                if let Some(ru) = ts.restoration.planes[pli]
                  .restoration_unit_mut(base_sbo + loop_sbo)
                {
                  ru.filter = best_new_lrf;
                }
              }
            }
          }
        }
      }
    }
  }
}
2622
// Checks that `estimate_rate(0, TX_4X4, 0)` equals the first entry of
// `RDO_RATE_TABLE`.
#[test]
fn estimate_rate_test() {
  let expected = RDO_RATE_TABLE[0][0][0];
  let actual = estimate_rate(0, TxSize::TX_4X4, 0);
  assert_eq!(actual, expected);
}
2627