1 // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
2 // Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
3 //
4 // This source code is subject to the terms of the BSD 2 Clause License and
5 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 // was not distributed with this source code in the LICENSE file, you can
7 // obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 // Media Patent License 1.0 was not distributed with this source code in the
9 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 
11 #![allow(non_camel_case_types)]
12 
13 use crate::api::*;
14 use crate::cdef::*;
15 use crate::context::*;
16 use crate::cpu_features::CpuFeatureLevel;
17 use crate::deblock::*;
18 use crate::dist::*;
19 use crate::ec::{Writer, WriterCounter, OD_BITRES};
20 use crate::encode_block_with_modes;
21 use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE};
22 use crate::frame::Frame;
23 use crate::frame::*;
24 use crate::header::ReferenceMode;
25 use crate::lrf::*;
26 use crate::luma_ac;
27 use crate::mc::MotionVector;
28 use crate::me::*;
29 use crate::motion_compensate;
30 use crate::partition::RefType::*;
31 use crate::partition::*;
32 use crate::predict::{
33   AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode,
34   RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES,
35 };
36 use crate::rdo_tables::*;
37 use crate::tiling::*;
38 use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES};
39 use crate::util::{init_slice_repeat_mut, Aligned, CastFromPrimitive, Pixel};
40 use crate::write_tx_blocks;
41 use crate::write_tx_tree;
42 use crate::Tune;
43 use crate::{encode_block_post_cdef, encode_block_pre_cdef};
44 
45 use crate::partition::PartitionType::*;
46 use arrayvec::*;
47 use itertools::izip;
48 use std::fmt;
49 use std::mem::MaybeUninit;
50 
/// Selects how distortion and rate are measured during rate-distortion
/// optimization.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RDOType {
  /// Pixel-domain distortion and exact ec rate.
  PixelDistRealRate,
  /// Tx-domain distortion and exact ec rate.
  TxDistRealRate,
  /// Tx-domain distortion and txdist-based rate.
  TxDistEstRate,
}

impl RDOType {
  /// Returns `true` for the variants that measure distortion in the
  /// transform domain.
  #[inline]
  pub fn needs_tx_dist(self) -> bool {
    match self {
      RDOType::PixelDistRealRate => false,
      RDOType::TxDistRealRate | RDOType::TxDistEstRate => true,
    }
  }

  /// Returns `true` for the variants that use the exact ec rate, and `false`
  /// for the variant whose rate is derived from the tx-domain distortion.
  #[inline]
  pub fn needs_coeff_rate(self) -> bool {
    match self {
      RDOType::PixelDistRealRate | RDOType::TxDistRealRate => true,
      RDOType::TxDistEstRate => false,
    }
  }
}
79 
/// Groups a partition type with an RD cost and the parameters of up to four
/// blocks.
#[derive(Clone)]
pub struct PartitionGroupParameters {
  /// RD cost recorded for this partition choice.
  // NOTE(review): presumably the combined cost of `part_modes`; confirm at
  // the code that fills this struct in.
  pub rd_cost: f64,
  /// The partition type these parameters belong to.
  pub part_type: PartitionType,
  /// Per-block parameters; the `ArrayVec` capacity limits this to 4 entries.
  pub part_modes: ArrayVec<PartitionParameters, 4>,
}
86 
/// Coding decisions recorded for a single block, together with an RD cost.
///
/// The `Default` value acts as a placeholder: its `rd_cost` is the largest
/// finite `f64` and its `bsize` is `BlockSize::BLOCK_INVALID`.
#[derive(Clone, Debug)]
pub struct PartitionParameters {
  /// RD cost recorded for this set of decisions.
  pub rd_cost: f64,
  /// Block offset, expressed relative to the tile.
  pub bo: TileBlockOffset,
  /// Size of the block.
  pub bsize: BlockSize,
  /// Prediction mode for the luma plane.
  pub pred_mode_luma: PredictionMode,
  /// Prediction mode for the chroma planes.
  pub pred_mode_chroma: PredictionMode,
  // NOTE(review): presumably only meaningful when the chroma mode is
  // chroma-from-luma; confirm against the encoder.
  pub pred_cfl_params: CFLParams,
  pub angle_delta: AngleDelta,
  /// Two reference-frame slots; the default is
  /// `[INTRA_FRAME, NONE_FRAME]`.
  pub ref_frames: [RefType; 2],
  /// One motion vector per `ref_frames` slot.
  pub mvs: [MotionVector; 2],
  pub skip: bool,
  pub has_coeff: bool,
  /// Transform size selected for the block.
  pub tx_size: TxSize,
  /// Transform type selected for the block.
  pub tx_type: TxType,
  // NOTE(review): looks like the segmentation index of the block; confirm.
  pub sidx: u8,
}
104 
105 impl Default for PartitionParameters {
default() -> Self106   fn default() -> Self {
107     PartitionParameters {
108       rd_cost: std::f64::MAX,
109       bo: TileBlockOffset::default(),
110       bsize: BlockSize::BLOCK_INVALID,
111       pred_mode_luma: PredictionMode::default(),
112       pred_mode_chroma: PredictionMode::default(),
113       pred_cfl_params: CFLParams::default(),
114       angle_delta: AngleDelta::default(),
115       ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME],
116       mvs: [MotionVector::default(); 2],
117       skip: false,
118       has_coeff: true,
119       tx_size: TxSize::TX_4X4,
120       tx_type: TxType::DCT_DCT,
121       sidx: 0,
122     }
123   }
124 }
125 
estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64126 pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
127   let bs_index = ts as usize;
128   let q_bin_idx = (qindex as usize) / RDO_QUANT_DIV;
129   let bin_idx_down =
130     ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64);
131   let bin_idx_up = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64);
132   let x0 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64;
133   let x1 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64;
134   let y0 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64;
135   let y1 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64;
136   let slope = ((y1 - y0) << 8) / (x1 - x0);
137   (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64
138 }
139 
// Computes the distortion of a single 8x8 block: the sum of squared
// differences between `src1` and `src2`, multiplied by the `ssim_boost`
// factor derived from the two blocks' variance terms.
//
// Both regions must belong to non-subsampled planes (checked in debug
// builds) and must provide at least 8 rows of at least 8 pixels.
//
// The microbenchmarks perform better with inlining turned off
#[inline(never)]
fn cdef_dist_wxh_8x8<T: Pixel>(
  src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, bit_depth: usize,
) -> RawDistortion {
  debug_assert!(src1.plane_cfg.xdec == 0);
  debug_assert!(src1.plane_cfg.ydec == 0);
  debug_assert!(src2.plane_cfg.xdec == 0);
  debug_assert!(src2.plane_cfg.ydec == 0);

  // Sum into columns to improve auto-vectorization
  // NOTE(review): the u16 column sums hold 8 samples each, which assumes
  // pixel values stay well below 2^13 — confirm against supported bit depths.
  let mut sum_s_cols: [u16; 8] = [0; 8];
  let mut sum_d_cols: [u16; 8] = [0; 8];
  let mut sum_s2_cols: [u32; 8] = [0; 8];
  let mut sum_d2_cols: [u32; 8] = [0; 8];
  let mut sum_sd_cols: [u32; 8] = [0; 8];

  // Check upfront that 8 rows are available.
  let _row1 = &src1[7];
  let _row2 = &src2[7];

  for j in 0..8 {
    let row1 = &src1[j][0..8];
    let row2 = &src2[j][0..8];
    for (sum_s, sum_d, sum_s2, sum_d2, sum_sd, s, d) in izip!(
      &mut sum_s_cols,
      &mut sum_d_cols,
      &mut sum_s2_cols,
      &mut sum_d2_cols,
      &mut sum_sd_cols,
      row1,
      row2
    ) {
      // Don't convert directly to u32 to allow better vectorization
      let s: u16 = u16::cast_from(*s);
      let d: u16 = u16::cast_from(*d);
      *sum_s += s;
      *sum_d += d;

      // Convert to u32 to avoid overflows when multiplying
      let s: u32 = s as u32;
      let d: u32 = d as u32;

      *sum_s2 += s * s;
      *sum_d2 += d * d;
      *sum_sd += s * d;
    }
  }

  // Sum together the sum of columns
  let sum_s: i64 =
    sum_s_cols.iter().map(|&a| u32::cast_from(a)).sum::<u32>() as i64;
  let sum_d: i64 =
    sum_d_cols.iter().map(|&a| u32::cast_from(a)).sum::<u32>() as i64;
  let sum_s2: i64 = sum_s2_cols.iter().sum::<u32>() as i64;
  let sum_d2: i64 = sum_d2_cols.iter().sum::<u32>() as i64;
  let sum_sd: i64 = sum_sd_cols.iter().sum::<u32>() as i64;

  // Use sums to calculate distortion
  // `svar`/`dvar` are the sums of squares minus the rounded
  // (sum^2 / 64) term, i.e. 64 times the variance of the 64 samples.
  let svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
  let dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
  // Sum of (s - d)^2 expanded as sum(d^2) + sum(s^2) - 2 * sum(s * d).
  let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as u64;
  RawDistortion::new(ssim_boost(svar, dvar, bit_depth).mul_u64(sse))
}
204 
205 #[inline(always)]
ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale206 pub fn ssim_boost(svar: i64, dvar: i64, bit_depth: usize) -> DistortionScale {
207   let coeff_shift = bit_depth - 8;
208 
209   //The two constants were tuned for CDEF, but can probably be better tuned for use in general RDO
210   DistortionScale::new(
211     (4033_f64 / 16_384_f64)
212       * (svar + dvar + (16_384 << (2 * coeff_shift))) as f64
213       / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64),
214   )
215 }
216 
217 #[allow(unused)]
cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, compute_bias: F, ) -> Distortion218 pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
219   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
220   bit_depth: usize, compute_bias: F,
221 ) -> Distortion {
222   assert!(w & 0x7 == 0);
223   assert!(h & 0x7 == 0);
224   debug_assert!(src1.plane_cfg.xdec == 0);
225   debug_assert!(src1.plane_cfg.ydec == 0);
226   debug_assert!(src2.plane_cfg.xdec == 0);
227   debug_assert!(src2.plane_cfg.ydec == 0);
228 
229   let mut sum = Distortion::zero();
230   for j in 0isize..h as isize / 8 {
231     for i in 0isize..w as isize / 8 {
232       let area = Area::StartingAt { x: i * 8, y: j * 8 };
233       let value = cdef_dist_wxh_8x8(
234         &src1.subregion(area),
235         &src2.subregion(area),
236         bit_depth,
237       );
238 
239       // cdef is always called on non-subsampled planes, so BLOCK_8X8 is
240       // correct here.
241       sum += value * compute_bias(area, BlockSize::BLOCK_8X8);
242     }
243   }
244   sum
245 }
246 
247 /// Sum of Squared Error for a wxh block
248 /// Currently limited to w and h of valid blocks
sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, ) -> Distortion249 pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
250   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
251   compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
252 ) -> Distortion {
253   // See get_weighted_sse in src/dist.rs.
254   // Provide a scale to get_weighted_sse for each square region of this size.
255   const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1;
256 
257   // To bias the distortion correctly, compute it in blocks up to the size
258   // importance block size in a non-subsampled plane.
259   let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec;
260   let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec;
261 
262   let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h);
263 
264   let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE;
265   let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE;
266 
267   // TODO: Copying biases into a buffer is slow. It would be best if biases were
268   // passed directly. To do this, we would need different versions of the
269   // weighted sse function for decimated/subsampled data. Also requires
270   // eliminating use of unbiased sse.
271   // It should also be noted that the current copy code does not auto-vectorize.
272 
273   // Copy biases into a buffer.
274   let mut buf_storage = Aligned::new(
275     [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE],
276   );
277   let buf_stride = n_imp_blocks_w.next_power_of_two();
278   let buf = init_slice_repeat_mut(
279     &mut buf_storage.data[..buf_stride * n_imp_blocks_h],
280     0,
281   );
282 
283   for block_y in 0..n_imp_blocks_h {
284     for block_x in 0..n_imp_blocks_w {
285       let block = Area::StartingAt {
286         x: (block_x * CHUNK_SIZE) as isize,
287         y: (block_y * CHUNK_SIZE) as isize,
288       };
289       buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0;
290     }
291   }
292 
293   Distortion(get_weighted_sse(
294     src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
295   ))
296 }
297 
clip_visible_bsize( frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize, ) -> (usize, usize)298 pub fn clip_visible_bsize(
299   frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize,
300 ) -> (usize, usize) {
301   let blk_w = bsize.width();
302   let blk_h = bsize.height();
303 
304   let visible_w: usize = if x + blk_w <= frame_w {
305     blk_w
306   } else if x >= frame_w {
307     0
308   } else {
309     frame_w - x
310   };
311 
312   let visible_h: usize = if y + blk_h <= frame_h {
313     blk_h
314   } else if y >= frame_h {
315     0
316   } else {
317     frame_h - y
318   };
319 
320   (visible_w, visible_h)
321 }
322 
// Compute the pixel-domain distortion for an encode
//
// Compares the tile's input with its reconstruction for the block at
// `tile_bo`, restricted to the part of the block inside the
// `fi.width` x `fi.height` frame. Luma always contributes; the two chroma
// planes are added when `is_chroma_block` is set, `luma_only` is clear and
// the chroma sampling is not `Cs400`. Each plane's distortion is multiplied
// by its `fi.dist_scale` entry before being summed.
fn compute_distortion<T: Pixel>(
  fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
  is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
) -> ScaledDistortion {
  let area = Area::BlockStartingAt { bo: tile_bo.0 };
  let input_region = ts.input_tile.planes[0].subregion(area);
  let rec_region = ts.rec.planes[0].subregion(area);

  // clip a block to have visible pixels only
  let frame_bo = ts.to_frame_block_offset(tile_bo);
  let (visible_w, visible_h) = clip_visible_bsize(
    fi.width,
    fi.height,
    bsize,
    frame_bo.0.x << MI_SIZE_LOG2,
    frame_bo.0.y << MI_SIZE_LOG2,
  );

  // Nothing of the block is inside the frame.
  if visible_w == 0 || visible_h == 0 {
    return ScaledDistortion::zero();
  }

  let mut distortion = match fi.config.tune {
    // Psychovisual tuning on blocks of at least 8x8: the part covered by
    // whole 8x8 cells goes through cdef_dist_wxh, the leftover right and
    // bottom strips through sse_wxh.
    Tune::Psychovisual if bsize.width() >= 8 && bsize.height() >= 8 => {
      // Visible size rounded down to a multiple of 8.
      let w8 = visible_w & !7;
      let h8 = visible_h & !7;
      let mut sum = Distortion(0);
      if w8 > 0 && h8 > 0 {
        sum += cdef_dist_wxh(
          &input_region,
          &rec_region,
          w8,
          h8,
          fi.sequence.bit_depth,
          |bias_area, bsize| {
            distortion_scale(
              fi,
              input_region.subregion(bias_area).frame_block_offset(),
              bsize,
            )
          },
        );
      }
      // Strip to the right of the 8-aligned area, rows 0..h8.
      if visible_w > w8 && h8 > 0 {
        let area = Area::StartingAt { x: w8 as isize, y: 0 };
        sum += sse_wxh(
          &input_region.subregion(area),
          &rec_region.subregion(area),
          visible_w - w8,
          h8,
          |bias_area, bsize| {
            spatiotemporal_scale(
              fi,
              input_region
                .subregion(area)
                .subregion(bias_area)
                .frame_block_offset(),
              bsize,
            )
          },
          fi.sequence.bit_depth,
          fi.cpu_feature_level,
        );
      }
      // Strip below the 8-aligned area, spanning the full visible width.
      if visible_h > h8 && visible_w > 0 {
        let area = Area::StartingAt { x: 0, y: h8 as isize };
        sum += sse_wxh(
          &input_region.subregion(area),
          &rec_region.subregion(area),
          visible_w,
          visible_h - h8,
          |bias_area, bsize| {
            spatiotemporal_scale(
              fi,
              input_region
                .subregion(area)
                .subregion(bias_area)
                .frame_block_offset(),
              bsize,
            )
          },
          fi.sequence.bit_depth,
          fi.cpu_feature_level,
        );
      }
      sum
    }
    // Psnr tuning, and Psychovisual tuning for blocks with a dimension
    // below 8: plain weighted SSE over the visible area.
    Tune::Psnr | Tune::Psychovisual => sse_wxh(
      &input_region,
      &rec_region,
      visible_w,
      visible_h,
      |bias_area, bsize| {
        distortion_scale(
          fi,
          input_region.subregion(bias_area).frame_block_offset(),
          bsize,
        )
      },
      fi.sequence.bit_depth,
      fi.cpu_feature_level,
    ),
  } * fi.dist_scale[0];

  if is_chroma_block
    && !luma_only
    && fi.sequence.chroma_sampling != ChromaSampling::Cs400
  {
    let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
    // Chroma size: the visible luma size shifted by the decimation, rounded
    // up. For blocks below 8 on a subsampled axis, 4 extra luma pixels are
    // included.
    // NOTE(review): presumably because the chroma block then also covers
    // the neighbouring 4-pixel luma block — confirm against has_chroma().
    let chroma_w = if bsize.width() >= 8 || xdec == 0 {
      (visible_w + xdec) >> xdec
    } else {
      (4 + visible_w + xdec) >> xdec
    };
    let chroma_h = if bsize.height() >= 8 || ydec == 0 {
      (visible_h + ydec) >> ydec
    } else {
      (4 + visible_h + ydec) >> ydec
    };

    // Planes 1 and 2 are the two chroma planes.
    for p in 1..3 {
      let input_region = ts.input_tile.planes[p].subregion(area);
      let rec_region = ts.rec.planes[p].subregion(area);
      distortion += sse_wxh(
        &input_region,
        &rec_region,
        chroma_w,
        chroma_h,
        |bias_area, bsize| {
          distortion_scale(
            fi,
            input_region.subregion(bias_area).frame_block_offset(),
            bsize,
          )
        },
        fi.sequence.bit_depth,
        fi.cpu_feature_level,
      ) * fi.dist_scale[p];
    }
  }
  distortion
}
466 
467 // Compute the transform-domain distortion for an encode
compute_tx_distortion<T: Pixel>( fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, skip: bool, luma_only: bool, ) -> ScaledDistortion468 fn compute_tx_distortion<T: Pixel>(
469   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
470   is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
471   skip: bool, luma_only: bool,
472 ) -> ScaledDistortion {
473   assert!(fi.config.tune == Tune::Psnr);
474   let area = Area::BlockStartingAt { bo: tile_bo.0 };
475   let input_region = ts.input_tile.planes[0].subregion(area);
476   let rec_region = ts.rec.planes[0].subregion(area);
477 
478   let (visible_w, visible_h) = if !skip {
479     (bsize.width(), bsize.height())
480   } else {
481     let frame_bo = ts.to_frame_block_offset(tile_bo);
482     clip_visible_bsize(
483       fi.width,
484       fi.height,
485       bsize,
486       frame_bo.0.x << MI_SIZE_LOG2,
487       frame_bo.0.y << MI_SIZE_LOG2,
488     )
489   };
490 
491   if visible_w == 0 || visible_h == 0 {
492     return ScaledDistortion::zero();
493   }
494 
495   let mut distortion = if skip {
496     sse_wxh(
497       &input_region,
498       &rec_region,
499       visible_w,
500       visible_h,
501       |bias_area, bsize| {
502         distortion_scale(
503           fi,
504           input_region.subregion(bias_area).frame_block_offset(),
505           bsize,
506         )
507       },
508       fi.sequence.bit_depth,
509       fi.cpu_feature_level,
510     ) * fi.dist_scale[0]
511   } else {
512     tx_dist
513   };
514 
515   if is_chroma_block
516     && !luma_only
517     && skip
518     && fi.sequence.chroma_sampling != ChromaSampling::Cs400
519   {
520     let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
521     let chroma_w = if bsize.width() >= 8 || xdec == 0 {
522       (visible_w + xdec) >> xdec
523     } else {
524       (4 + visible_w + xdec) >> xdec
525     };
526     let chroma_h = if bsize.height() >= 8 || ydec == 0 {
527       (visible_h + ydec) >> ydec
528     } else {
529       (4 + visible_h + ydec) >> ydec
530     };
531 
532     for p in 1..3 {
533       let input_region = ts.input_tile.planes[p].subregion(area);
534       let rec_region = ts.rec.planes[p].subregion(area);
535       distortion += sse_wxh(
536         &input_region,
537         &rec_region,
538         chroma_w,
539         chroma_h,
540         |bias_area, bsize| {
541           distortion_scale(
542             fi,
543             input_region.subregion(bias_area).frame_block_offset(),
544             bsize,
545           )
546         },
547         fi.sequence.bit_depth,
548         fi.cpu_feature_level,
549       ) * fi.dist_scale[p];
550     }
551   }
552   distortion
553 }
554 
555 /// Compute a scaling factor to multiply the distortion of a block by,
556 /// this factor is determined using temporal RDO.
distortion_scale<T: Pixel>( fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale557 pub fn distortion_scale<T: Pixel>(
558   fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
559 ) -> DistortionScale {
560   if !fi.config.temporal_rdo() {
561     return DistortionScale::default();
562   }
563   // EncoderConfig::temporal_rdo() should always return false in situations
564   // where distortion is computed on > 8x8 blocks, so we should never hit this
565   // assert.
566   assert!(bsize <= BlockSize::BLOCK_8X8);
567 
568   let x = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
569   let y = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
570 
571   fi.distortion_scales[y * fi.w_in_imp_b + x]
572 }
573 
spatiotemporal_scale<T: Pixel>( fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale574 pub fn spatiotemporal_scale<T: Pixel>(
575   fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
576 ) -> DistortionScale {
577   if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual {
578     return DistortionScale::default();
579   }
580 
581   let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
582   let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
583   let x1 = (x0 + bsize.width_imp_b()).min(fi.w_in_imp_b);
584   let y1 = (y0 + bsize.height_imp_b()).min(fi.h_in_imp_b);
585   let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT;
586 
587   let mut sum = 0;
588   for y in y0..y1 {
589     sum += fi.distortion_scales[y * fi.w_in_imp_b..][x0..x1]
590       .iter()
591       .zip(fi.activity_scales[y * fi.w_in_imp_b..][x0..x1].iter())
592       .take(MAX_SB_IN_IMP_B)
593       .map(|(d, a)| d.0 as u64 * a.0 as u64)
594       .sum::<u64>();
595   }
596   DistortionScale(((sum + (den >> 1)) / den) as u32)
597 }
598 
distortion_scale_for( propagate_cost: f64, intra_cost: f64, ) -> DistortionScale599 pub fn distortion_scale_for(
600   propagate_cost: f64, intra_cost: f64,
601 ) -> DistortionScale {
602   // The mbtree paper \cite{mbtree} uses the following formula:
603   //
604   //     QP_delta = -strength * log2(1 + (propagate_cost / intra_cost))
605   //
606   // Since this is H.264, this corresponds to the following quantizer:
607   //
608   //     Q' = Q * 2^(QP_delta/6)
609   //
610   // Since lambda is proportial to Q^2, this means we want to minimize:
611   //
612   //     D + lambda' * R
613   //   = D + 2^(QP_delta / 3) * lambda * R
614   //
615   // If we want to keep lambda fixed, we can instead scale distortion and
616   // minimize:
617   //
618   //     D * scale + lambda * R
619   //
620   // where:
621   //
622   //     scale = 2^(QP_delta / -3)
623   //           = (1 + (propagate_cost / intra_cost))^(strength / 3)
624   //
625   //  The original paper empirically chooses strength = 2.0, but strength = 1.0
626   //  seems to work best in rav1e currently, this may have something to do with
627   //  the fact that they use 16x16 blocks whereas our "importance blocks" are
628   //  8x8, but everything should be scale invariant here so that's weird.
629   //
630   // @article{mbtree,
631   //   title={A novel macroblock-tree algorithm for high-performance
632   //    optimization of dependent video coding in H.264/AVC},
633   //   author={Garrett-Glaser, Jason},
634   //   journal={Tech. Rep.},
635   //   year={2009},
636   //   url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf}
637   // }
638 
639   if intra_cost == 0. {
640     return DistortionScale::default(); // no scaling
641   }
642 
643   let strength = 1.0; // empirical, see comment above
644   let frac = (intra_cost + propagate_cost) / intra_cost;
645   DistortionScale::new(frac.powf(strength / 3.0))
646 }
647 
/// Fixed point arithmetic version of distortion scale
///
/// The wrapped `u32` is the scale multiplied by `1 << SHIFT`, where `SHIFT`
/// is the number of bits past the radix point; the default value is a
/// fixed-point 1.
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct DistortionScale(pub u32);
652 
/// Distortion value that has not yet been multiplied by a bias;
/// `RawDistortion * DistortionScale` yields a [`Distortion`].
#[repr(transparent)]
pub struct RawDistortion(u64);
655 
/// Distortion value that can be accumulated with `+=`;
/// `Distortion * f64` yields a [`ScaledDistortion`].
#[repr(transparent)]
pub struct Distortion(pub u64);
658 
/// Distortion after multiplication by an `f64` factor; this is the form
/// that `compute_rd_cost` combines with the rate.
#[repr(transparent)]
pub struct ScaledDistortion(u64);
661 
662 impl DistortionScale {
663   /// Bits past the radix point
664   const SHIFT: u32 = 12;
665   /// Number of bits used. Determines the max value.
666   /// 24 bits is likely excessive.
667   const BITS: u32 = 24;
668 
669   #[inline]
new(scale: f64) -> Self670   pub fn new(scale: f64) -> Self {
671     Self(
672       (scale * (1 << Self::SHIFT) as f64 + 0.5)
673         .min(((1 << Self::BITS as u64) - 1) as f64) as u32,
674     )
675   }
676 
677   /// Multiply, round and shift
678   /// Internal implementation, so don't use multiply trait.
679   #[inline]
mul_u64(self, dist: u64) -> u64680   pub fn mul_u64(self, dist: u64) -> u64 {
681     (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT
682   }
683 }
684 
685 // Default value for DistortionScale is a fixed point 1
686 impl Default for DistortionScale {
687   #[inline]
default() -> Self688   fn default() -> Self {
689     Self(1 << Self::SHIFT)
690   }
691 }
692 
693 impl fmt::Debug for DistortionScale {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result694   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
695     write!(f, "{}", f64::from(*self))
696   }
697 }
698 
699 impl From<DistortionScale> for f64 {
700   #[inline]
from(scale: DistortionScale) -> Self701   fn from(scale: DistortionScale) -> Self {
702     scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64
703   }
704 }
705 
706 impl RawDistortion {
707   #[inline]
new(dist: u64) -> Self708   pub const fn new(dist: u64) -> Self {
709     Self(dist)
710   }
711 }
712 
713 impl std::ops::Mul<DistortionScale> for RawDistortion {
714   type Output = Distortion;
715   #[inline]
mul(self, rhs: DistortionScale) -> Distortion716   fn mul(self, rhs: DistortionScale) -> Distortion {
717     Distortion(rhs.mul_u64(self.0))
718   }
719 }
720 
721 impl Distortion {
722   #[inline]
zero() -> Self723   pub const fn zero() -> Self {
724     Self(0)
725   }
726 }
727 
728 impl std::ops::Mul<f64> for Distortion {
729   type Output = ScaledDistortion;
730   #[inline]
mul(self, rhs: f64) -> ScaledDistortion731   fn mul(self, rhs: f64) -> ScaledDistortion {
732     ScaledDistortion((self.0 as f64 * rhs) as u64)
733   }
734 }
735 
736 impl std::ops::AddAssign for Distortion {
737   #[inline]
add_assign(&mut self, other: Self)738   fn add_assign(&mut self, other: Self) {
739     self.0 += other.0;
740   }
741 }
742 
743 impl ScaledDistortion {
744   #[inline]
zero() -> Self745   pub const fn zero() -> Self {
746     Self(0)
747   }
748 }
749 
750 impl std::ops::AddAssign for ScaledDistortion {
751   #[inline]
add_assign(&mut self, other: Self)752   fn add_assign(&mut self, other: Self) {
753     self.0 += other.0;
754   }
755 }
756 
compute_rd_cost<T: Pixel>( fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion, ) -> f64757 pub fn compute_rd_cost<T: Pixel>(
758   fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion,
759 ) -> f64 {
760   let rate_in_bits = (rate as f64) / ((1 << OD_BITRES) as f64);
761   distortion.0 as f64 + fi.lambda * rate_in_bits
762 }
763 
/// Chooses the transform size and transform type for a block.
///
/// The search starts from the largest rectangular transform for `bsize`
/// (one split level smaller for non-skip inter blocks when
/// `fi.enable_inter_txfm_split` is set). When neither a size search nor a
/// type search applies, `(starting size, DCT_DCT)` is returned immediately.
/// Otherwise each candidate size is evaluated with `rdo_tx_type_decision`
/// and the pair with the lowest RD cost is returned.
pub fn rdo_tx_size_type<T: Pixel>(
  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
  luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
  skip: bool,
) -> (TxSize, TxType) {
  let is_inter = !luma_mode.is_intra();
  let mut tx_size = max_txsize_rect_lookup[bsize as usize];

  if fi.enable_inter_txfm_split && is_inter && !skip {
    tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size
  }

  let mut best_tx_type = TxType::DCT_DCT;
  let mut best_tx_size = tx_size;
  let mut best_rd = std::f64::MAX;

  // The size search is only done for intra blocks, and only when the frame
  // allows per-block transform sizes and the speed settings enable it.
  let do_rdo_tx_size =
    fi.tx_mode_select && fi.config.speed_settings.rdo_tx_decision && !is_inter;
  // Number of additional, smaller sizes to try after the starting size.
  let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 };
  // Shared across iterations; handed to rdo_tx_type_decision by mutable
  // reference.
  let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None;

  for _ in 0..=rdo_tx_depth {
    let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set);

    // The type search needs a transform set with more than DCT, and is only
    // done for non-skip intra blocks.
    let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY
      && fi.config.speed_settings.rdo_tx_decision
      && !is_inter
      && !skip;

    // Nothing to search: keep the current best. This can only trigger on
    // the first iteration, since further iterations require do_rdo_tx_size.
    if !do_rdo_tx_size && !do_rdo_tx_type {
      return (best_tx_size, best_tx_type);
    };

    let tx_types =
      if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };

    // Luma plane transform type decision
    let (tx_type, rd_cost) = rdo_tx_type_decision(
      fi,
      ts,
      cw,
      &mut cw_checkpoint,
      luma_mode,
      ref_frames,
      mvs,
      bsize,
      tile_bo,
      tx_size,
      tx_set,
      tx_types,
    );

    if rd_cost < best_rd {
      best_tx_size = tx_size;
      best_tx_type = tx_type;
      best_rd = rd_cost;
    }

    debug_assert!(tx_size.width_log2() <= bsize.width_log2());
    debug_assert!(tx_size.height_log2() <= bsize.height_log2());
    debug_assert!(
      tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT
    );

    // Step down to the next smaller size; stop once the map returns the
    // same size again.
    let next_tx_size = sub_tx_size_map[tx_size as usize];

    if next_tx_size == tx_size {
      break;
    } else {
      tx_size = next_tx_size;
    };
  }

  (best_tx_size, best_tx_type)
}
840 
841 #[inline]
dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool842 fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
843   let diff_row = mv.row as i32 - ref_mv.row as i32;
844   let diff_col = mv.col as i32 - ref_mv.col as i32;
845   diff_row >= MV_LOW
846     && diff_row <= MV_UPP
847     && diff_col >= MV_LOW
848     && diff_col <= MV_UPP
849 }
850 
851 #[inline]
luma_chroma_mode_rdo<T: Pixel>( luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize, tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, rdo_type: RDOType, cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters, mvs: [MotionVector; 2], ref_frames: [RefType; 2], mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool, mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>, angle_delta: AngleDelta, )852 fn luma_chroma_mode_rdo<T: Pixel>(
853   luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
854   tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
855   cw: &mut ContextWriter, rdo_type: RDOType,
856   cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters,
857   mvs: [MotionVector; 2], ref_frames: [RefType; 2],
858   mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool,
859   mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>,
860   angle_delta: AngleDelta,
861 ) {
862   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
863 
864   let is_chroma_block =
865     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
866 
867   if !luma_mode_is_intra {
868     let ref_mvs = if mv_stack.is_empty() {
869       [MotionVector::default(); 2]
870     } else {
871       [mv_stack[0].this_mv, mv_stack[0].comp_mv]
872     };
873 
874     if (luma_mode == PredictionMode::NEWMV
875       || luma_mode == PredictionMode::NEW_NEWMV
876       || luma_mode == PredictionMode::NEW_NEARESTMV)
877       && !dmv_in_range(mvs[0], ref_mvs[0])
878     {
879       return;
880     }
881 
882     if (luma_mode == PredictionMode::NEW_NEWMV
883       || luma_mode == PredictionMode::NEAREST_NEWMV)
884       && !dmv_in_range(mvs[1], ref_mvs[1])
885     {
886       return;
887     }
888   }
889 
890   // Find the best chroma prediction mode for the current luma prediction mode
891   let mut chroma_rdo = |skip: bool| -> bool {
892     use crate::segmentation::select_segment;
893 
894     let mut zero_distortion = false;
895 
896     for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
897       cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);
898 
899       let (tx_size, tx_type) = rdo_tx_size_type(
900         fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
901       );
902       for &chroma_mode in mode_set_chroma.iter() {
903         let wr = &mut WriterCounter::new();
904         let tell = wr.tell_frac();
905 
906         if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() {
907           cw.write_partition(
908             wr,
909             tile_bo,
910             PartitionType::PARTITION_NONE,
911             bsize,
912           );
913         }
914 
915         // TODO(yushin): luma and chroma would have different decision based on chroma format
916         let need_recon_pixel =
917           luma_mode_is_intra && tx_size.block_size() != bsize;
918 
919         encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
920         let (has_coeff, tx_dist) = encode_block_post_cdef(
921           fi,
922           ts,
923           cw,
924           wr,
925           luma_mode,
926           chroma_mode,
927           angle_delta,
928           ref_frames,
929           mvs,
930           bsize,
931           tile_bo,
932           skip,
933           CFLParams::default(),
934           tx_size,
935           tx_type,
936           mode_context,
937           mv_stack,
938           rdo_type,
939           need_recon_pixel,
940           false,
941         );
942 
943         let rate = wr.tell_frac() - tell;
944         let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
945           compute_tx_distortion(
946             fi,
947             ts,
948             bsize,
949             is_chroma_block,
950             tile_bo,
951             tx_dist,
952             skip,
953             false,
954           )
955         } else {
956           compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
957         };
958         let is_zero_dist = distortion.0 == 0;
959         let rd = compute_rd_cost(fi, rate, distortion);
960         if rd < best.rd_cost {
961           //if rd < best.rd_cost || luma_mode == PredictionMode::NEW_NEWMV {
962           best.rd_cost = rd;
963           best.pred_mode_luma = luma_mode;
964           best.pred_mode_chroma = chroma_mode;
965           best.angle_delta = angle_delta;
966           best.ref_frames = ref_frames;
967           best.mvs = mvs;
968           best.skip = skip;
969           best.has_coeff = has_coeff;
970           best.tx_size = tx_size;
971           best.tx_type = tx_type;
972           best.sidx = sidx;
973           zero_distortion = is_zero_dist;
974         }
975 
976         cw.rollback(cw_checkpoint);
977       }
978     }
979 
980     zero_distortion
981   };
982 
983   // Don't skip when using intra modes
984   let zero_distortion =
985     if !luma_mode_is_intra { chroma_rdo(true) } else { false };
986   // early skip
987   if !zero_distortion {
988     chroma_rdo(false);
989   }
990 }
991 
992 // RDO-based mode decision
rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, ) -> PartitionParameters993 pub fn rdo_mode_decision<T: Pixel>(
994   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
995   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
996   inter_cfg: &InterConfig,
997 ) -> PartitionParameters {
998   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
999   let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1000 
1001   let rdo_type = if fi.use_tx_domain_rate {
1002     RDOType::TxDistEstRate
1003   } else if fi.use_tx_domain_distortion {
1004     RDOType::TxDistRealRate
1005   } else {
1006     RDOType::PixelDistRealRate
1007   };
1008 
1009   let mut best = if fi.frame_type.has_inter() {
1010     assert!(fi.frame_type != FrameType::KEY);
1011 
1012     inter_frame_rdo_mode_decision(
1013       fi,
1014       ts,
1015       cw,
1016       bsize,
1017       tile_bo,
1018       inter_cfg,
1019       &cw_checkpoint,
1020       rdo_type,
1021     )
1022   } else {
1023     PartitionParameters::default()
1024   };
1025 
1026   let is_chroma_block =
1027     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1028 
1029   if !best.skip {
1030     best = intra_frame_rdo_mode_decision(
1031       fi,
1032       ts,
1033       cw,
1034       bsize,
1035       tile_bo,
1036       &cw_checkpoint,
1037       rdo_type,
1038       best,
1039       is_chroma_block,
1040     );
1041   }
1042 
1043   if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() {
1044     cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx);
1045 
1046     let chroma_mode = PredictionMode::UV_CFL_PRED;
1047     let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1048     let mut wr = WriterCounter::new();
1049     let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
1050 
1051     write_tx_blocks(
1052       fi,
1053       ts,
1054       cw,
1055       &mut wr,
1056       best.pred_mode_luma,
1057       best.pred_mode_luma,
1058       angle_delta,
1059       tile_bo,
1060       bsize,
1061       best.tx_size,
1062       best.tx_type,
1063       false,
1064       CFLParams::default(),
1065       true,
1066       rdo_type,
1067       true,
1068     );
1069     cw.rollback(&cw_checkpoint);
1070     if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
1071       if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
1072         let mut wr = WriterCounter::new();
1073         let tell = wr.tell_frac();
1074 
1075         encode_block_pre_cdef(
1076           &fi.sequence,
1077           ts,
1078           cw,
1079           &mut wr,
1080           bsize,
1081           tile_bo,
1082           best.skip,
1083         );
1084         let (has_coeff, _) = encode_block_post_cdef(
1085           fi,
1086           ts,
1087           cw,
1088           &mut wr,
1089           best.pred_mode_luma,
1090           chroma_mode,
1091           angle_delta,
1092           best.ref_frames,
1093           best.mvs,
1094           bsize,
1095           tile_bo,
1096           best.skip,
1097           cfl,
1098           best.tx_size,
1099           best.tx_type,
1100           0,
1101           &[],
1102           rdo_type,
1103           true, // For CFL, luma should be always reconstructed.
1104           false,
1105         );
1106 
1107         let rate = wr.tell_frac() - tell;
1108 
1109         // For CFL, tx-domain distortion is not an option.
1110         let distortion =
1111           compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
1112         let rd = compute_rd_cost(fi, rate, distortion);
1113         if rd < best.rd_cost {
1114           best.rd_cost = rd;
1115           best.pred_mode_chroma = chroma_mode;
1116           best.angle_delta = angle_delta;
1117           best.has_coeff = has_coeff;
1118           best.pred_cfl_params = cfl;
1119         }
1120 
1121         cw.rollback(&cw_checkpoint);
1122       }
1123     }
1124   }
1125 
1126   cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma);
1127   cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames);
1128   cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs);
1129 
1130   assert!(best.rd_cost >= 0_f64);
1131 
1132   PartitionParameters {
1133     bo: tile_bo,
1134     bsize,
1135     pred_mode_luma: best.pred_mode_luma,
1136     pred_mode_chroma: best.pred_mode_chroma,
1137     pred_cfl_params: best.pred_cfl_params,
1138     angle_delta: best.angle_delta,
1139     ref_frames: best.ref_frames,
1140     mvs: best.mvs,
1141     rd_cost: best.rd_cost,
1142     skip: best.skip,
1143     has_coeff: best.has_coeff,
1144     tx_size: best.tx_size,
1145     tx_type: best.tx_type,
1146     sidx: best.sidx,
1147   }
1148 }
1149 
inter_frame_rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, ) -> PartitionParameters1150 fn inter_frame_rdo_mode_decision<T: Pixel>(
1151   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1152   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1153   inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
1154   rdo_type: RDOType,
1155 ) -> PartitionParameters {
1156   let mut best = PartitionParameters::default();
1157 
1158   // we can never have more than 7 reference frame sets
1159   let mut ref_frames_set = ArrayVec::<_, 7>::new();
1160   // again, max of 7 ref slots
1161   let mut ref_slot_set = ArrayVec::<_, 7>::new();
1162   // our implementation never returns more than 3 at the moment
1163   let mut mvs_from_me = ArrayVec::<_, 3>::new();
1164   let mut fwdref = None;
1165   let mut bwdref = None;
1166 
1167   for i in inter_cfg.allowed_ref_frames().iter().copied() {
1168     // Don't search LAST3 since it's used only for probs
1169     if i == LAST3_FRAME {
1170       continue;
1171     }
1172 
1173     if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) {
1174       if fwdref == None && i.is_fwd_ref() {
1175         fwdref = Some(ref_frames_set.len());
1176       }
1177       if bwdref == None && i.is_bwd_ref() {
1178         bwdref = Some(ref_frames_set.len());
1179       }
1180       ref_frames_set.push([i, NONE_FRAME]);
1181       let slot_idx = fi.ref_frames[i.to_index()];
1182       ref_slot_set.push(slot_idx);
1183     }
1184   }
1185   assert!(!ref_frames_set.is_empty());
1186 
1187   let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new();
1188   let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new();
1189   let mut satds = ArrayVec::<u32, 20>::new();
1190   let mut mv_stacks = ArrayVec::<_, 20>::new();
1191   let mut mode_contexts = ArrayVec::<_, 7>::new();
1192 
1193   for (i, &ref_frames) in ref_frames_set.iter().enumerate() {
1194     let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1195     mode_contexts.push(cw.find_mvrefs(
1196       tile_bo,
1197       ref_frames,
1198       &mut mv_stack,
1199       bsize,
1200       fi,
1201       false,
1202     ));
1203 
1204     let mut pmv = [MotionVector::default(); 2];
1205     if !mv_stack.is_empty() {
1206       pmv[0] = mv_stack[0].this_mv;
1207     }
1208     if mv_stack.len() > 1 {
1209       pmv[1] = mv_stack[1].this_mv;
1210     }
1211 
1212     let res = motion_estimation(fi, ts, bsize, tile_bo, ref_frames[0], pmv);
1213     let b_me = res.0;
1214 
1215     mvs_from_me.push([b_me, MotionVector::default()]);
1216 
1217     for &x in RAV1E_INTER_MODES_MINIMAL {
1218       inter_mode_set.push((x, i));
1219     }
1220     if !mv_stack.is_empty() {
1221       inter_mode_set.push((PredictionMode::NEAR0MV, i));
1222     }
1223     if mv_stack.len() >= 2 {
1224       inter_mode_set.push((PredictionMode::GLOBALMV, i));
1225     }
1226     let include_near_mvs = fi.config.speed_settings.include_near_mvs;
1227     if include_near_mvs {
1228       if mv_stack.len() >= 3 {
1229         inter_mode_set.push((PredictionMode::NEAR1MV, i));
1230       }
1231       if mv_stack.len() >= 4 {
1232         inter_mode_set.push((PredictionMode::NEAR2MV, i));
1233       }
1234     }
1235     let same_row_col = |x: &CandidateMV| {
1236       x.this_mv.row == mvs_from_me[i][0].row
1237         && x.this_mv.col == mvs_from_me[i][0].col
1238     };
1239     if !mv_stack
1240       .iter()
1241       .take(if include_near_mvs { 4 } else { 2 })
1242       .any(same_row_col)
1243       && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0)
1244     {
1245       inter_mode_set.push((PredictionMode::NEWMV, i));
1246     }
1247 
1248     mv_stacks.push(mv_stack);
1249   }
1250 
1251   let sz = bsize.width_mi().min(bsize.height_mi());
1252 
1253   // To use non single reference modes, block width and height must be greater than 4.
1254   if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 {
1255     // Adding compound candidate
1256     if let Some(r0) = fwdref {
1257       if let Some(r1) = bwdref {
1258         let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]];
1259         ref_frames_set.push(ref_frames);
1260         let mv0 = mvs_from_me[r0][0];
1261         let mv1 = mvs_from_me[r1][0];
1262         mvs_from_me.push([mv0, mv1]);
1263         let mut mv_stack = ArrayVec::<CandidateMV, 9>::new();
1264         mode_contexts.push(cw.find_mvrefs(
1265           tile_bo,
1266           ref_frames,
1267           &mut mv_stack,
1268           bsize,
1269           fi,
1270           true,
1271         ));
1272         for &x in RAV1E_INTER_COMPOUND_MODES {
1273           // exclude any NEAR mode based on speed setting
1274           if fi.config.speed_settings.include_near_mvs || !x.has_nearmv() {
1275             let mv_stack_idx = ref_frames_set.len() - 1;
1276             // exclude NEAR modes if the mv_stack is too short
1277             if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) {
1278               inter_mode_set.push((x, mv_stack_idx));
1279             }
1280           }
1281         }
1282         mv_stacks.push(mv_stack);
1283       }
1284     }
1285   }
1286 
1287   let num_modes_rdo = if fi.config.speed_settings.prediction_modes
1288     >= PredictionModesSetting::ComplexAll
1289   {
1290     inter_mode_set.len()
1291   } else {
1292     9 // This number is determined by AWCY test
1293   };
1294 
1295   inter_mode_set.iter().for_each(|&(luma_mode, i)| {
1296     let mvs = match luma_mode {
1297       PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i],
1298       PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => {
1299         if !mv_stacks[i].is_empty() {
1300           [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv]
1301         } else {
1302           [MotionVector::default(); 2]
1303         }
1304       }
1305       PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => {
1306         if mv_stacks[i].len() > 1 {
1307           [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv]
1308         } else {
1309           [MotionVector::default(); 2]
1310         }
1311       }
1312       PredictionMode::NEAR1MV
1313       | PredictionMode::NEAR2MV
1314       | PredictionMode::NEAR_NEAR1MV
1315       | PredictionMode::NEAR_NEAR2MV => [
1316         mv_stacks[i][luma_mode.ref_mv_idx()].this_mv,
1317         mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv,
1318       ],
1319       PredictionMode::NEAREST_NEWMV => {
1320         [mv_stacks[i][0].this_mv, mvs_from_me[i][1]]
1321       }
1322       PredictionMode::NEW_NEARESTMV => {
1323         [mvs_from_me[i][0], mv_stacks[i][0].comp_mv]
1324       }
1325       PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => {
1326         [MotionVector::default(); 2]
1327       }
1328       _ => {
1329         unimplemented!();
1330       }
1331     };
1332     mvs_set.push(mvs);
1333 
1334     // Calculate SATD for each mode
1335     if num_modes_rdo != inter_mode_set.len() {
1336       let tile_rect = ts.tile_rect();
1337       let rec = &mut ts.rec.planes[0];
1338       let po = tile_bo.plane_offset(rec.plane_cfg);
1339       let mut rec_region =
1340         rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1341 
1342       luma_mode.predict_inter(
1343         fi,
1344         tile_rect,
1345         0,
1346         po,
1347         &mut rec_region,
1348         bsize.width(),
1349         bsize.height(),
1350         ref_frames_set[i],
1351         mvs,
1352         &mut ts.inter_compound_buffers,
1353       );
1354 
1355       let plane_org = ts.input_tile.planes[0]
1356         .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1357       let plane_ref = rec_region.as_const();
1358 
1359       let satd = get_satd(
1360         &plane_org,
1361         &plane_ref,
1362         bsize,
1363         fi.sequence.bit_depth,
1364         fi.cpu_feature_level,
1365       );
1366       satds.push(satd);
1367     } else {
1368       satds.push(0);
1369     }
1370   });
1371 
1372   let mut sorted =
1373     izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<_, 20>>();
1374   if num_modes_rdo != sorted.len() {
1375     sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd);
1376   }
1377 
1378   sorted.iter().take(num_modes_rdo).for_each(
1379     |&((luma_mode, i), mvs, _satd)| {
1380       let mode_set_chroma = ArrayVec::from([luma_mode]);
1381 
1382       luma_chroma_mode_rdo(
1383         luma_mode,
1384         fi,
1385         bsize,
1386         tile_bo,
1387         ts,
1388         cw,
1389         rdo_type,
1390         cw_checkpoint,
1391         &mut best,
1392         mvs,
1393         ref_frames_set[i],
1394         &mode_set_chroma,
1395         false,
1396         mode_contexts[i],
1397         &mv_stacks[i],
1398         AngleDelta::default(),
1399       );
1400     },
1401   );
1402 
1403   best
1404 }
1405 
intra_frame_rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, mut best: PartitionParameters, is_chroma_block: bool, ) -> PartitionParameters1406 fn intra_frame_rdo_mode_decision<T: Pixel>(
1407   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1408   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1409   cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
1410   mut best: PartitionParameters, is_chroma_block: bool,
1411 ) -> PartitionParameters {
1412   let num_modes_rdo: usize;
1413   let mut modes = ArrayVec::<_, INTRA_MODES>::new();
1414 
1415   // Reduce number of prediction modes at higher speed levels
1416   num_modes_rdo = if (fi.frame_type == FrameType::KEY
1417     && fi.config.speed_settings.prediction_modes
1418       >= PredictionModesSetting::ComplexKeyframes)
1419     || (fi.frame_type.has_inter()
1420       && fi.config.speed_settings.prediction_modes
1421         >= PredictionModesSetting::ComplexAll)
1422   {
1423     7
1424   } else {
1425     3
1426   };
1427 
1428   let intra_mode_set = RAV1E_INTRA_MODES;
1429 
1430   // Find mode with lowest rate cost
1431   {
1432     use crate::ec::cdf_to_pdf;
1433 
1434     let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() {
1435       cw.get_cdf_intra_mode(bsize)
1436     } else {
1437       cw.get_cdf_intra_mode_kf(tile_bo)
1438     });
1439 
1440     modes.try_extend_from_slice(intra_mode_set).unwrap();
1441     modes.sort_by_key(|&a| !probs_all[a as usize]);
1442   }
1443 
1444   // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening
1445   // may be improved by emulating prediction for each tx block.
1446   {
1447     let satds = {
1448       // FIXME: If tx partition is used, this whole sads block should be fixed
1449       let tx_size = bsize.tx_size();
1450       let edge_buf = {
1451         let rec = &ts.rec.planes[0].as_const();
1452         let po = tile_bo.plane_offset(rec.plane_cfg);
1453         // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
1454         get_intra_edges(
1455           rec,
1456           tile_bo,
1457           0,
1458           0,
1459           bsize,
1460           po,
1461           tx_size,
1462           fi.sequence.bit_depth,
1463           None,
1464           fi.sequence.enable_intra_edge_filter,
1465           IntraParam::None,
1466         )
1467       };
1468 
1469       let ief_params = if fi.sequence.enable_intra_edge_filter {
1470         let above_block_info = ts.above_block_info(tile_bo, 0, 0);
1471         let left_block_info = ts.left_block_info(tile_bo, 0, 0);
1472         Some(IntraEdgeFilterParameters::new(
1473           0,
1474           above_block_info,
1475           left_block_info,
1476         ))
1477       } else {
1478         None
1479       };
1480 
1481       let mut satds_all = [0; INTRA_MODES];
1482       for &luma_mode in modes.iter().skip(num_modes_rdo / 2) {
1483         let tile_rect = ts.tile_rect();
1484         let rec = &mut ts.rec.planes[0];
1485         let mut rec_region =
1486           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1487         // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
1488         luma_mode.predict_intra(
1489           tile_rect,
1490           &mut rec_region,
1491           tx_size,
1492           fi.sequence.bit_depth,
1493           &[0i16; 2],
1494           IntraParam::None,
1495           if luma_mode.is_directional() { ief_params } else { None },
1496           &edge_buf,
1497           fi.cpu_feature_level,
1498         );
1499 
1500         let plane_org = ts.input_tile.planes[0]
1501           .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1502         let plane_ref = rec_region.as_const();
1503 
1504         satds_all[luma_mode as usize] = get_satd(
1505           &plane_org,
1506           &plane_ref,
1507           tx_size.block_size(),
1508           fi.sequence.bit_depth,
1509           fi.cpu_feature_level,
1510         );
1511       }
1512       satds_all
1513     };
1514 
1515     modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]);
1516   }
1517 
1518   debug_assert!(num_modes_rdo >= 1);
1519 
1520   modes.iter().take(num_modes_rdo).for_each(|&luma_mode| {
1521     let mvs = [MotionVector::default(); 2];
1522     let ref_frames = [INTRA_FRAME, NONE_FRAME];
1523     let mut mode_set_chroma = ArrayVec::<_, 2>::new();
1524     mode_set_chroma.push(luma_mode);
1525     if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
1526       mode_set_chroma.push(PredictionMode::DC_PRED);
1527     }
1528     luma_chroma_mode_rdo(
1529       luma_mode,
1530       fi,
1531       bsize,
1532       tile_bo,
1533       ts,
1534       cw,
1535       rdo_type,
1536       cw_checkpoint,
1537       &mut best,
1538       mvs,
1539       ref_frames,
1540       &mode_set_chroma,
1541       true,
1542       0,
1543       &ArrayVec::<CandidateMV, 9>::new(),
1544       AngleDelta::default(),
1545     );
1546   });
1547 
1548   if fi.config.speed_settings.fine_directional_intra
1549     && bsize >= BlockSize::BLOCK_8X8
1550   {
1551     // Find the best angle delta for the current best prediction mode
1552     let luma_deltas = best.pred_mode_luma.angle_delta_count();
1553     let chroma_deltas = best.pred_mode_chroma.angle_delta_count();
1554 
1555     let mvs = [MotionVector::default(); 2];
1556     let ref_frames = [INTRA_FRAME, NONE_FRAME];
1557     let mode_set_chroma = [best.pred_mode_chroma];
1558     let mv_stack = ArrayVec::<_, 9>::new();
1559     let mut best_angle_delta = best.angle_delta;
1560     let mut angle_delta_rdo = |y, uv| -> AngleDelta {
1561       if best.angle_delta.y != y || best.angle_delta.uv != uv {
1562         luma_chroma_mode_rdo(
1563           best.pred_mode_luma,
1564           fi,
1565           bsize,
1566           tile_bo,
1567           ts,
1568           cw,
1569           rdo_type,
1570           cw_checkpoint,
1571           &mut best,
1572           mvs,
1573           ref_frames,
1574           &mode_set_chroma,
1575           true,
1576           0,
1577           &mv_stack,
1578           AngleDelta { y, uv },
1579         );
1580       }
1581       best.angle_delta
1582     };
1583 
1584     for i in 0..luma_deltas {
1585       let angle_delta_y =
1586         if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 };
1587       best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv);
1588     }
1589     for j in 0..chroma_deltas {
1590       let angle_delta_uv =
1591         if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 };
1592       best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv);
1593     }
1594   }
1595 
1596   best
1597 }
1598 
rdo_cfl_alpha<T: Pixel>( ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, luma_tx_size: TxSize, fi: &FrameInvariants<T>, ) -> Option<CFLParams>1599 pub fn rdo_cfl_alpha<T: Pixel>(
1600   ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
1601   luma_tx_size: TxSize, fi: &FrameInvariants<T>,
1602 ) -> Option<CFLParams> {
1603   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1604   let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec);
1605   debug_assert!(bsize.subsampled_size(xdec, ydec) == uv_tx_size.block_size());
1606 
1607   let frame_bo = ts.to_frame_block_offset(tile_bo);
1608   let (visible_tx_w, visible_tx_h) = clip_visible_bsize(
1609     (fi.width + xdec) >> xdec,
1610     (fi.height + ydec) >> ydec,
1611     uv_tx_size.block_size(),
1612     (frame_bo.0.x << MI_SIZE_LOG2) >> xdec,
1613     (frame_bo.0.y << MI_SIZE_LOG2) >> ydec,
1614   );
1615 
1616   if visible_tx_w == 0 || visible_tx_h == 0 {
1617     return None;
1618   };
1619   let mut ac: Aligned<[i16; 32 * 32]> = Aligned::uninitialized();
1620   luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
1621   let best_alpha: ArrayVec<i16, 2> = (1..3)
1622     .map(|p| {
1623       let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg;
1624       let tile_rect = ts.tile_rect().decimated(xdec, ydec);
1625       let rec = &mut ts.rec.planes[p];
1626       let input = &ts.input_tile.planes[p];
1627       let po = tile_bo.plane_offset(rec.plane_cfg);
1628       let edge_buf = get_intra_edges(
1629         &rec.as_const(),
1630         tile_bo,
1631         0,
1632         0,
1633         bsize,
1634         po,
1635         uv_tx_size,
1636         fi.sequence.bit_depth,
1637         Some(PredictionMode::UV_CFL_PRED),
1638         fi.sequence.enable_intra_edge_filter,
1639         IntraParam::None,
1640       );
1641       let mut alpha_cost = |alpha: i16| -> u64 {
1642         let mut rec_region =
1643           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1644         PredictionMode::UV_CFL_PRED.predict_intra(
1645           tile_rect,
1646           &mut rec_region,
1647           uv_tx_size,
1648           fi.sequence.bit_depth,
1649           &ac.data,
1650           IntraParam::Alpha(alpha),
1651           None,
1652           &edge_buf,
1653           fi.cpu_feature_level,
1654         );
1655         sse_wxh(
1656           &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
1657           &rec_region.as_const(),
1658           visible_tx_w,
1659           visible_tx_h,
1660           |_, _| DistortionScale::default(), // We're not doing RDO here.
1661           fi.sequence.bit_depth,
1662           fi.cpu_feature_level,
1663         )
1664         .0
1665       };
1666       let mut best = (alpha_cost(0), 0);
1667       let mut count = 2;
1668       for alpha in 1i16..=16i16 {
1669         let cost = (alpha_cost(alpha), alpha_cost(-alpha));
1670         if cost.0 < best.0 {
1671           best = (cost.0, alpha);
1672           count += 2;
1673         }
1674         if cost.1 < best.0 {
1675           best = (cost.1, -alpha);
1676           count += 2;
1677         }
1678         if count < alpha {
1679           break;
1680         }
1681       }
1682       best.1
1683     })
1684     .collect();
1685 
1686   if best_alpha[0] == 0 && best_alpha[1] == 0 {
1687     None
1688   } else {
1689     Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1]))
1690   }
1691 }
1692 
1693 /// RDO-based transform type decision
1694 /// If cw_checkpoint is None, a checkpoint for cw's (ContextWriter) current
1695 /// state is created and stored for later use.
rdo_tx_type_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>, mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet, tx_types: &[TxType], ) -> (TxType, f64)1696 pub fn rdo_tx_type_decision<T: Pixel>(
1697   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1698   cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
1699   mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
1700   bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet,
1701   tx_types: &[TxType],
1702 ) -> (TxType, f64) {
1703   let mut best_type = TxType::DCT_DCT;
1704   let mut best_rd = std::f64::MAX;
1705 
1706   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1707   let is_chroma_block =
1708     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1709 
1710   let is_inter = !mode.is_intra();
1711 
1712   if cw_checkpoint.is_none() {
1713     // Only run the first call
1714     // Prevents creating multiple checkpoints for own version of cw
1715     *cw_checkpoint =
1716       Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling));
1717   }
1718 
1719   let rdo_type = if fi.use_tx_domain_distortion {
1720     RDOType::TxDistRealRate
1721   } else {
1722     RDOType::PixelDistRealRate
1723   };
1724   let need_recon_pixel = tx_size.block_size() != bsize && !is_inter;
1725 
1726   for &tx_type in tx_types {
1727     // Skip unsupported transform types
1728     if av1_tx_used[tx_set as usize][tx_type as usize] == 0 {
1729       continue;
1730     }
1731 
1732     if is_inter {
1733       motion_compensate(
1734         fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
1735       );
1736     }
1737 
1738     let mut wr = WriterCounter::new();
1739     let tell = wr.tell_frac();
1740     let (_, tx_dist) = if is_inter {
1741       write_tx_tree(
1742         fi,
1743         ts,
1744         cw,
1745         &mut wr,
1746         mode,
1747         0,
1748         tile_bo,
1749         bsize,
1750         tx_size,
1751         tx_type,
1752         false,
1753         true,
1754         rdo_type,
1755         need_recon_pixel,
1756       )
1757     } else {
1758       write_tx_blocks(
1759         fi,
1760         ts,
1761         cw,
1762         &mut wr,
1763         mode,
1764         mode,
1765         AngleDelta::default(),
1766         tile_bo,
1767         bsize,
1768         tx_size,
1769         tx_type,
1770         false,
1771         CFLParams::default(), // Unused.
1772         true,
1773         rdo_type,
1774         need_recon_pixel,
1775       )
1776     };
1777 
1778     let rate = wr.tell_frac() - tell;
1779     let distortion = if fi.use_tx_domain_distortion {
1780       compute_tx_distortion(
1781         fi,
1782         ts,
1783         bsize,
1784         is_chroma_block,
1785         tile_bo,
1786         tx_dist,
1787         false,
1788         true,
1789       )
1790     } else {
1791       compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
1792     };
1793     let rd = compute_rd_cost(fi, rate, distortion);
1794     if rd < best_rd {
1795       best_rd = rd;
1796       best_type = tx_type;
1797     }
1798 
1799     cw.rollback(cw_checkpoint.as_ref().unwrap());
1800   }
1801 
1802   assert!(best_rd >= 0_f64);
1803 
1804   (best_type, best_rd)
1805 }
1806 
get_sub_partitions( four_partitions: &[TileBlockOffset; 4], partition: PartitionType, ) -> ArrayVec<TileBlockOffset, 4>1807 pub fn get_sub_partitions(
1808   four_partitions: &[TileBlockOffset; 4], partition: PartitionType,
1809 ) -> ArrayVec<TileBlockOffset, 4> {
1810   let mut partition_offsets = ArrayVec::<TileBlockOffset, 4>::new();
1811 
1812   partition_offsets.push(four_partitions[0]);
1813 
1814   if partition == PARTITION_NONE {
1815     return partition_offsets;
1816   }
1817   if partition == PARTITION_VERT || partition == PARTITION_SPLIT {
1818     partition_offsets.push(four_partitions[1]);
1819   };
1820   if partition == PARTITION_HORZ || partition == PARTITION_SPLIT {
1821     partition_offsets.push(four_partitions[2]);
1822   };
1823   if partition == PARTITION_SPLIT {
1824     partition_offsets.push(four_partitions[3]);
1825   };
1826 
1827   partition_offsets
1828 }
1829 
1830 #[inline(always)]
rdo_partition_none<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>, ) -> f641831 fn rdo_partition_none<T: Pixel>(
1832   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1833   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1834   inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>,
1835 ) -> f64 {
1836   debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1837 
1838   let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
1839   let cost = mode.rd_cost;
1840 
1841   child_modes.push(mode);
1842 
1843   cost
1844 }
1845 
1846 // VERTICAL, HORIZONTAL or simple SPLIT
1847 #[inline(always)]
rdo_partition_simple<T: Pixel, W: Writer>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, partition: PartitionType, rdo_type: RDOType, best_rd: f64, child_modes: &mut ArrayVec<PartitionParameters, 4>, ) -> Option<f64>1848 fn rdo_partition_simple<T: Pixel, W: Writer>(
1849   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1850   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1851   bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
1852   partition: PartitionType, rdo_type: RDOType, best_rd: f64,
1853   child_modes: &mut ArrayVec<PartitionParameters, 4>,
1854 ) -> Option<f64> {
1855   debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1856   let subsize = bsize.subsize(partition);
1857 
1858   debug_assert!(subsize != BlockSize::BLOCK_INVALID);
1859 
1860   let cost = if bsize >= BlockSize::BLOCK_8X8 {
1861     let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1862     let tell = w.tell_frac();
1863     cw.write_partition(w, tile_bo, partition, bsize);
1864     compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero())
1865   } else {
1866     0.0
1867   };
1868 
1869   let hbsw = subsize.width_mi(); // Half the block size width in blocks
1870   let hbsh = subsize.height_mi(); // Half the block size height in blocks
1871   let four_partitions = [
1872     tile_bo,
1873     TileBlockOffset(BlockOffset {
1874       x: tile_bo.0.x + hbsw as usize,
1875       y: tile_bo.0.y,
1876     }),
1877     TileBlockOffset(BlockOffset {
1878       x: tile_bo.0.x,
1879       y: tile_bo.0.y + hbsh as usize,
1880     }),
1881     TileBlockOffset(BlockOffset {
1882       x: tile_bo.0.x + hbsw as usize,
1883       y: tile_bo.0.y + hbsh as usize,
1884     }),
1885   ];
1886 
1887   let partitions = get_sub_partitions(&four_partitions, partition);
1888 
1889   let mut rd_cost_sum = 0.0;
1890 
1891   for offset in partitions {
1892     let hbs = subsize.width_mi() >> 1;
1893     let has_cols = offset.0.x + hbs < ts.mi_width;
1894     let has_rows = offset.0.y + hbs < ts.mi_height;
1895 
1896     if has_cols && has_rows {
1897       let mode_decision =
1898         rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
1899 
1900       rd_cost_sum += mode_decision.rd_cost;
1901 
1902       if fi.enable_early_exit && rd_cost_sum > best_rd {
1903         return None;
1904       }
1905       if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() {
1906         let w: &mut W =
1907           if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1908         cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
1909       }
1910       encode_block_with_modes(
1911         fi,
1912         ts,
1913         cw,
1914         w_pre_cdef,
1915         w_post_cdef,
1916         subsize,
1917         offset,
1918         &mode_decision,
1919         rdo_type,
1920         false,
1921       );
1922       child_modes.push(mode_decision);
1923     } else {
1924       //rd_cost_sum += std::f64::MAX;
1925       return None;
1926     }
1927   }
1928 
1929   Some(cost + rd_cost_sum)
1930 }
1931 
1932 // RDO-based single level partitioning decision
rdo_partition_decision<T: Pixel, W: Writer>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, cached_block: &PartitionGroupParameters, partition_types: &[PartitionType], rdo_type: RDOType, inter_cfg: &InterConfig, ) -> PartitionGroupParameters1933 pub fn rdo_partition_decision<T: Pixel, W: Writer>(
1934   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1935   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1936   bsize: BlockSize, tile_bo: TileBlockOffset,
1937   cached_block: &PartitionGroupParameters, partition_types: &[PartitionType],
1938   rdo_type: RDOType, inter_cfg: &InterConfig,
1939 ) -> PartitionGroupParameters {
1940   let mut best_partition = cached_block.part_type;
1941   let mut best_rd = cached_block.rd_cost;
1942   let mut best_pred_modes = cached_block.part_modes.clone();
1943 
1944   let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling);
1945   let w_pre_checkpoint = w_pre_cdef.checkpoint();
1946   let w_post_checkpoint = w_post_cdef.checkpoint();
1947 
1948   for &partition in partition_types {
1949     // Do not re-encode results we already have
1950     if partition == cached_block.part_type {
1951       continue;
1952     }
1953 
1954     let mut child_modes = ArrayVec::<_, 4>::new();
1955 
1956     let cost = match partition {
1957       PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
1958         Some(rdo_partition_none(
1959           fi,
1960           ts,
1961           cw,
1962           bsize,
1963           tile_bo,
1964           inter_cfg,
1965           &mut child_modes,
1966         ))
1967       }
1968       PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
1969         rdo_partition_simple(
1970           fi,
1971           ts,
1972           cw,
1973           w_pre_cdef,
1974           w_post_cdef,
1975           bsize,
1976           tile_bo,
1977           inter_cfg,
1978           partition,
1979           rdo_type,
1980           best_rd,
1981           &mut child_modes,
1982         )
1983       }
1984       _ => {
1985         unreachable!();
1986       }
1987     };
1988 
1989     if let Some(rd) = cost {
1990       if rd < best_rd {
1991         best_rd = rd;
1992         best_partition = partition;
1993         best_pred_modes = child_modes.clone();
1994       }
1995     }
1996     cw.rollback(&cw_checkpoint);
1997     w_pre_cdef.rollback(&w_pre_checkpoint);
1998     w_post_cdef.rollback(&w_post_checkpoint);
1999   }
2000 
2001   assert!(best_rd >= 0_f64);
2002 
2003   PartitionGroupParameters {
2004     rd_cost: best_rd,
2005     part_type: best_partition,
2006     part_modes: best_pred_modes,
2007   }
2008 }
2009 
rdo_loop_plane_error<T: Pixel>( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize, ) -> ScaledDistortion2010 fn rdo_loop_plane_error<T: Pixel>(
2011   base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
2012   sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
2013   blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
2014 ) -> ScaledDistortion {
2015   let sb_w_blocks =
2016     if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w;
2017   let sb_h_blocks =
2018     if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h;
2019   // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma
2020   // accumulating in-frame and unpadded
2021   let mut err = Distortion::zero();
2022   for by in 0..sb_h_blocks {
2023     for bx in 0..sb_w_blocks {
2024       let loop_bo = offset_sbo.block_offset(bx << 1, by << 1);
2025       if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() {
2026         let src_plane = &src.planes[pli];
2027         let test_plane = &test.planes[pli];
2028         let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg;
2029         debug_assert_eq!(xdec, test_plane.cfg.xdec);
2030         debug_assert_eq!(ydec, test_plane.cfg.ydec);
2031 
2032         // Unfortunately, our distortion biases are only available via
2033         // Frame-absolute addressing, so we need a block offset
2034         // relative to the full frame origin (not the tile or analysis
2035         // area)
2036         let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1);
2037         let bias = distortion_scale(
2038           fi,
2039           ts.to_frame_block_offset(frame_bo),
2040           BlockSize::BLOCK_8X8,
2041         );
2042 
2043         let src_region =
2044           src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 });
2045         let test_region =
2046           test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 });
2047 
2048         err += if pli == 0 {
2049           // For loop filters, We intentionally use cdef_dist even with
2050           // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
2051           // significant negative impact on other metrics and visual quality.
2052           cdef_dist_wxh_8x8(&src_region, &test_region, fi.sequence.bit_depth)
2053             * bias
2054         } else {
2055           sse_wxh(
2056             &src_region,
2057             &test_region,
2058             8 >> xdec,
2059             8 >> ydec,
2060             |_, _| bias,
2061             fi.sequence.bit_depth,
2062             fi.cpu_feature_level,
2063           )
2064         };
2065       }
2066     }
2067   }
2068   err * fi.dist_scale[pli]
2069 }
2070 
// Passed in a superblock offset representing the upper left corner of
// the LRU area we're optimizing.  This area covers the largest LRU in
// any of the present planes, but may consist of a number of
// superblocks and full, smaller LRUs in the other planes
//
// Searches, for that area, a CDEF index per superblock and a loop
// restoration filter per LRU and plane, alternating between the two
// searches until a pass leaves the CDEF indices unchanged.
//
// Results are stored as side effects: the chosen CDEF index is set on the
// blocks of `cw.bc.blocks` covering each superblock, and the chosen
// restoration filter is written to the matching restoration unit in
// `ts.restoration`. `ts.integral_buffer` is used as scratch space.
//
// `w` is handed to `count_lrf_switchable` to obtain the rate of a
// restoration filter choice. When `deblock_p` is set, deblocking levels
// are estimated for the area and applied to a scratch copy of the
// reconstruction before the search starts.
//
// Panics if neither CDEF nor loop restoration is enabled in the sequence.
pub fn rdo_loop_decision<T: Pixel, W: Writer>(
  base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
  ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
  deblock_p: bool,
) {
  // Monochrome (4:0:0) input only has the luma plane to process.
  let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 {
    1
  } else {
    MAX_PLANES
  };
  assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration);
  // Determine area of optimization: Which plane has the largest LRUs?
  // How many LRUs for each?
  let mut sb_w = 1; // how many superblocks wide the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut sb_h = 1; // how many superblocks tall the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing
  let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    if sb_w < (1 << sb_h_shift) {
      sb_w = 1 << sb_h_shift;
    }
    if sb_h < (1 << sb_v_shift) {
      sb_h = 1 << sb_v_shift;
    }
  }
  // Per plane, the number of LRUs that fit into the (untrimmed) area.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    lru_w[pli] = sb_w / (1 << sb_h_shift);
    lru_h[pli] = sb_h / (1 << sb_v_shift);
  }

  // The superblock width/height determinations may be calling for us
  // to compute over superblocks that do not actually exist in the
  // frame (off the right or lower edge).  Trim sb width/height down
  // to actual superblocks.  Note that these last superblocks on the
  // right/bottom may themselves still span the edge of the frame, but
  // they do hold at least some visible pixels.
  sb_w = sb_w.min(ts.sb_width - base_sbo.0.x);
  sb_h = sb_h.min(ts.sb_height - base_sbo.0.y);

  // We have need to know the Y visible pixel limits as well (the
  // sb_w/sb_h figures above can be used to determine how many
  // allocated pixels, possibly beyond the visible frame, exist).
  let crop_w =
    fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT);
  let crop_h =
    fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT);

  // Based on `RestorationState::new`
  const MAX_SB_SHIFT: usize = 4;
  const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT;
  const MAX_LRU_SIZE: usize = MAX_SB_SIZE;

  // Static allocation relies on the "minimal LRU area for all N planes" invariant.
  // `best_index` holds the chosen CDEF index per superblock; -1 is the
  // initial "nothing chosen yet" value.
  let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE];
  let mut best_lrf =
    [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // due to imprecision in the reconstruction parameter solver, we
  // need to make sure we don't fall into a limit cycle.  Track our
  // best cost at LRF so that we can break if we get a solution that doesn't
  // improve at the reconstruction stage.
  // A negative entry means no cost has been recorded for that LRU yet.
  let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // Loop filter RDO is an iterative process and we need temporary
  // scratch data to hold the results of deblocking, cdef, and the
  // loop reconstruction filter so that each can be partially updated
  // without recomputing the entire stack.  Construct
  // largest-LRU-sized frames for each, accounting for padding
  // required by deblocking, cdef and [optionally] LR.
  // Width and height are rounded up to a multiple of 8 pixels.
  let mut rec_subset = ts
    .rec
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .scratch_copy();

  // sub-setted region of the TileBlocks for our working frame area.
  // Note that the size of this subset is what signals CDEF as to the
  // actual coded size.
  let mut tileblocks_subset = cw.bc.blocks.subregion_mut(
    base_sbo.block_offset(0, 0).0.x,
    base_sbo.block_offset(0, 0).0.y,
    sb_w << SUPERBLOCK_TO_BLOCK_SHIFT,
    sb_h << SUPERBLOCK_TO_BLOCK_SHIFT,
  );

  // const, no need to copy, just need the subregion (but do zero the
  // origin to match the other copies/new backing frames).
  let src_subset = ts
    .input_tile
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .home();

  if deblock_p {
    // Find a good deblocking filter solution for the passed in area.
    // This is not RDO of deblocking itself, merely a solution to get
    // better results from CDEF/LRF RDO.
    let deblock_levels = deblock_filter_optimize(
      fi,
      &rec_subset.as_tile(),
      &src_subset,
      &tileblocks_subset.as_const(),
      crop_w,
      crop_h,
    );

    // Deblock the contents of our reconstruction copy.
    if deblock_levels[0] != 0 || deblock_levels[1] != 0 {
      // copy ts.deblock because we need to set some of our own values here
      let mut deblock_copy = *ts.deblock;
      deblock_copy.levels = deblock_levels;

      // finally, deblock the temp frame
      deblock_filter_frame(
        &deblock_copy,
        &mut rec_subset.as_tile_mut(),
        &tileblocks_subset.as_const(),
        crop_w,
        crop_h,
        fi.sequence.bit_depth,
        planes,
      );
    }
  }

  // Scratch frame receiving CDEF output; only present when CDEF is enabled.
  let mut cdef_work =
    if fi.sequence.enable_cdef { Some(rec_subset.clone()) } else { None };
  // Scratch frame receiving restoration filter output; only present when
  // loop restoration is enabled.  Its planes are newly created with the
  // dimensions and decimation of the matching `rec_subset` planes.
  let mut lrf_work = if fi.sequence.enable_restoration {
    Some(Frame {
      planes: {
        let new_plane = |pli: usize| {
          let PlaneConfig { xdec, ydec, width, height, .. } =
            rec_subset.planes[pli].cfg;
          Plane::new(width, height, xdec, ydec, 0, 0)
        };
        [new_plane(0), new_plane(1), new_plane(2)]
      },
    })
  } else {
    None
  };

  // Precompute directional analysis for CDEF
  let cdef_data = {
    if cdef_work.is_some() {
      Some((
        &rec_subset,
        cdef_analyze_superblock_range(
          fi,
          &rec_subset,
          &tileblocks_subset.as_const(),
          sb_w,
          sb_h,
        ),
      ))
    } else {
      None
    }
  };

  // CDEF/LRF decision iteration
  // Start with a default of CDEF 0 and RestorationFilter::None
  // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it.
  // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it.
  // If LRF choice changed for any plane, repeat until no changes
  // Limit iterations and where we break based on speed setting (in the TODO list ;-)
  let mut cdef_change = true;
  let mut lrf_change = true;
  while cdef_change || lrf_change {
    // search for improved cdef indices, superblock by superblock, if cdef is enabled.
    if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) =
      (&cdef_data, &mut cdef_work.as_mut())
    {
      for sby in 0..sb_h {
        for sbx in 0..sb_w {
          let prev_best_index = best_index[sby * sb_w + sbx];
          // A negative cost means no candidate has been evaluated yet.
          let mut best_cost = -1.;
          let mut best_new_index = -1i8;

          /* offset of the superblock we're currently testing within the larger
          analysis area */
          let loop_sbo =
            TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });

          /* cdef index testing loop */
          for cdef_index in 0..(1 << fi.cdef_bits) {
            let mut err = ScaledDistortion::zero();
            let mut rate = 0;

            // Filter this superblock with the candidate index into the
            // CDEF scratch frame.
            cdef_filter_superblock(
              fi,
              &rec_subset,
              &mut cdef_ref.as_tile_mut(),
              &tileblocks_subset.as_const(),
              loop_sbo,
              cdef_index,
              &cdef_dirs[sby * sb_w + sbx],
            );
            // apply LRF if any
            for pli in 0..planes {
              // We need the cropped-to-visible-frame area of this SB
              let wh =
                if fi.sequence.use_128x128_superblock { 128 } else { 64 };
              let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg;
              let vis_width = (wh >> xdec).min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x
                    as usize,
              );
              let vis_height = (wh >> ydec).min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y
                    as usize,
              );
              // which LRU are we currently testing against?
              if let (Some((lru_x, lru_y)), Some(lrf_ref)) = {
                let rp = &ts.restoration.planes[pli];
                (
                  rp.restoration_unit_offset(base_sbo, loop_sbo, false),
                  &mut lrf_work,
                )
              } {
                // We have a valid LRU, apply LRF, compute error
                match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
                  RestorationFilter::None {} => {
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      cdef_ref,
                      &src_subset,
                      pli,
                    );
                    rate += if fi.sequence.enable_restoration {
                      cw.fc.count_lrf_switchable(
                        w,
                        &ts.restoration.as_const(),
                        best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                        pli,
                      )
                    } else {
                      0 // no relative cost differences to different
                        // CDEF params.  If cdef is on, it's a wash.
                    };
                  }
                  RestorationFilter::Sgrproj { set, xqd } => {
                    // only run on this single superblock
                    let loop_po =
                      loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg);
                    // todo: experiment with borrowing border pixels
                    // rather than edge-extending. Right now this is
                    // hard-clipping to the superblock boundary.
                    setup_integral_image(
                      &mut ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      vis_width,
                      vis_height,
                      vis_width,
                      vis_height,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &cdef_ref.planes[pli].slice(loop_po),
                    );
                    sgrproj_stripe_filter(
                      set,
                      xqd,
                      fi,
                      &ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                        x: loop_po.x,
                        y: loop_po.y,
                        width: vis_width,
                        height: vis_height,
                      }),
                    );
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      lrf_ref,
                      &src_subset,
                      pli,
                    );
                    rate += cw.fc.count_lrf_switchable(
                      w,
                      &ts.restoration.as_const(),
                      best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                      pli,
                    );
                  }
                  RestorationFilter::Wiener { .. } => unreachable!(), // coming soon
                }
              } else {
                // No actual LRU here, compute error directly from CDEF output.
                err += rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  1,
                  1,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  cdef_ref,
                  &src_subset,
                  pli,
                );
                // no relative cost differences to different
                // CDEF params.  If cdef is on, it's a wash.
                // rate += 0;
              }
            }

            // Keep the first candidate unconditionally, later ones only
            // if strictly cheaper.
            let cost = compute_rd_cost(fi, rate, err);
            if best_cost < 0. || cost < best_cost {
              best_cost = cost;
              best_new_index = cdef_index as i8;
            }
          }

          // Did we change any preexisting choices?
          if best_new_index != prev_best_index {
            cdef_change = true;
            best_index[sby * sb_w + sbx] = best_new_index;
            tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8);
          }

          let mut cdef_ref_tm = TileMut::new(
            cdef_ref,
            TileRect {
              x: 0,
              y: 0,
              width: cdef_ref.planes[0].cfg.width,
              height: cdef_ref.planes[0].cfg.height,
            },
          );

          // Keep cdef output up to date; we need it for restoration
          // both below and above (padding)
          cdef_filter_superblock(
            fi,
            rec_copy,
            &mut cdef_ref_tm,
            &tileblocks_subset.as_const(),
            loop_sbo,
            best_index[sby * sb_w + sbx] as u8,
            &cdef_dirs[sby * sb_w + sbx],
          );
        }
      }
    }

    // Stop once a pass leaves every CDEF index unchanged.  `cdef_change`
    // starts out true, so the restoration search below always runs at
    // least once.
    if !cdef_change {
      break;
    }
    cdef_change = false;
    lrf_change = false;

    // search for improved restoration filter parameters if restoration is enabled
    if let Some(lrf_ref) = &mut lrf_work.as_mut() {
      let lrf_input = if cdef_work.is_some() {
        // When CDEF is enabled, we pull from the CDEF output
        &cdef_work.as_ref().unwrap()
      } else {
        // When CDEF is disabled, we pull from the [optionally
        // deblocked] reconstruction
        &rec_subset
      };
      for pli in 0..planes {
        // Nominal size of LRU in pixels before clipping to visible frame
        let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size;
        // width, in sb, of an LRU in this plane
        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
        // height, in sb, of an LRU in this plane
        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
        let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg;
        for lru_y in 0..lru_h[pli] {
          // number of LRUs vertically
          for lru_x in 0..lru_w[pli] {
            // number of LRUs horizontally
            // Superblock offset of this LRU within the analysis area.
            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_x * lru_sb_w,
              y: lru_y * lru_sb_h,
            });
            if ts.restoration.has_restoration_unit(
              base_sbo + loop_sbo,
              pli,
              false,
            ) {
              let src_plane = &src_subset.planes[pli]; // uncompressed input for reference
              let lrf_in_plane = &lrf_input.planes[pli];
              let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg);
              // Start from the filter and cost selected by earlier passes.
              let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli];
              let mut best_cost =
                best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli];

              // Check the no filter option
              {
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_input,
                  &src_subset,
                  pli,
                );
                // NOTE(review): the rate is counted for `best_new_lrf`
                // (the currently selected filter), which is only
                // `RestorationFilter::None` while nothing else has been
                // selected -- confirm this is intended.
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  best_new_lrf,
                  pli,
                );

                let cost = compute_rd_cost(fi, rate, err);
                // Was this choice actually an improvement?
                if best_cost < 0. || cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = RestorationFilter::None;
                }
              }

              // Look for a self guided filter
              // We need the cropped-to-visible-frame computation area of this LRU
              let vis_width = unit_size.min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize,
              );
              let vis_height = unit_size.min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize,
              );

              // todo: experiment with borrowing border pixels
              // rather than edge-extending. Right now this is
              // hard-clipping to the superblock boundary.
              setup_integral_image(
                &mut ts.integral_buffer,
                SOLVE_IMAGE_STRIDE,
                vis_width,
                vis_height,
                vis_width,
                vis_height,
                &lrf_in_plane.slice(lrf_po),
                &lrf_in_plane.slice(lrf_po),
              );

              // Solve for and evaluate one candidate per parameter set
              // allowed by the configured complexity.
              for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
              {
                let (xqd0, xqd1) = sgrproj_solve(
                  set,
                  fi,
                  &ts.integral_buffer,
                  &src_plane
                    .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }),
                  &lrf_in_plane.slice(lrf_po),
                  vis_width,
                  vis_height,
                );
                let current_lrf =
                  RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
                if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
                  sgrproj_stripe_filter(
                    set,
                    xqd,
                    fi,
                    &ts.integral_buffer,
                    SOLVE_IMAGE_STRIDE,
                    &lrf_in_plane.slice(lrf_po),
                    &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                      x: lrf_po.x,
                      y: lrf_po.y,
                      width: vis_width,
                      height: vis_height,
                    }),
                  );
                }
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_ref,
                  &src_subset,
                  pli,
                );
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  current_lrf,
                  pli,
                );
                let cost = compute_rd_cost(fi, rate, err);
                if cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = current_lrf;
                }
              }

              // Record a changed choice both locally and in the tile's
              // restoration state, and request another pass.
              if best_lrf[lru_y * lru_w[pli] + lru_x][pli]
                .notequal(best_new_lrf)
              {
                best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf;
                lrf_change = true;
                if let Some(ru) = ts.restoration.planes[pli]
                  .restoration_unit_mut(base_sbo + loop_sbo)
                {
                  ru.filter = best_new_lrf;
                }
              }
            }
          }
        }
      }
    }
  }
}
2622 
// Checks that `estimate_rate`, called with zero for both numeric arguments
// and `TxSize::TX_4X4`, yields the very first entry of `RDO_RATE_TABLE`.
#[test]
fn estimate_rate_test() {
  let actual = estimate_rate(0, TxSize::TX_4X4, 0);
  let expected = RDO_RATE_TABLE[0][0][0];
  assert_eq!(actual, expected);
}
2627