1 // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
2 // Copyright (c) 2017-2020, The rav1e contributors. All rights reserved
3 //
4 // This source code is subject to the terms of the BSD 2 Clause License and
5 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 // was not distributed with this source code in the LICENSE file, you can
7 // obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 // Media Patent License 1.0 was not distributed with this source code in the
9 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 
11 #![allow(non_camel_case_types)]
12 
13 use crate::api::*;
14 use crate::cdef::*;
15 use crate::context::*;
16 use crate::cpu_features::CpuFeatureLevel;
17 use crate::deblock::*;
18 use crate::dist::*;
19 use crate::ec::{Writer, WriterCounter, OD_BITRES};
20 use crate::encode_block_with_modes;
21 use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE};
22 use crate::frame::Frame;
23 use crate::frame::*;
24 use crate::header::ReferenceMode;
25 use crate::lrf::*;
26 use crate::luma_ac;
27 use crate::mc::MotionVector;
28 use crate::me::*;
29 use crate::motion_compensate;
30 use crate::partition::RefType::*;
31 use crate::partition::*;
32 use crate::predict::{
33   AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode,
34   RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES,
35 };
36 use crate::rdo_tables::*;
37 use crate::tiling::*;
38 use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES};
39 use crate::util::{init_slice_repeat_mut, Aligned, CastFromPrimitive, Pixel};
40 use crate::write_tx_blocks;
41 use crate::write_tx_tree;
42 use crate::Tune;
43 use crate::{encode_block_post_cdef, encode_block_pre_cdef};
44 
45 use crate::partition::PartitionType::*;
46 use arrayvec::*;
47 use itertools::izip;
48 use std::fmt;
49 use std::mem::MaybeUninit;
50 
/// Selects which distortion measure and which rate measure are used while
/// searching for the best coding decision.
#[derive(Copy, Clone, PartialEq)]
pub enum RDOType {
  /// Pixel-domain distortion and exact ec rate
  PixelDistRealRate,
  /// Tx-domain distortion and exact ec rate
  TxDistRealRate,
  /// Tx-domain distortion and txdist-based rate
  TxDistEstRate,
}

impl RDOType {
  /// Returns `true` for the variants that measure distortion in the
  /// transform domain.
  #[inline]
  pub fn needs_tx_dist(self) -> bool {
    match self {
      Self::PixelDistRealRate => false,
      Self::TxDistRealRate | Self::TxDistEstRate => true,
    }
  }

  /// Returns `true` for the variants that use the exact ec rate, i.e. every
  /// variant except the one relying on the txdist-based rate.
  #[inline]
  pub fn needs_coeff_rate(self) -> bool {
    match self {
      Self::PixelDistRealRate | Self::TxDistRealRate => true,
      Self::TxDistEstRate => false,
    }
  }
}
79 
/// Groups a partition type with an RD cost and the parameters of up to four
/// blocks.
#[derive(Clone)]
pub struct PartitionGroupParameters {
  /// RD cost stored for this group.
  // NOTE(review): presumably the combined cost of `part_modes` — confirm
  // against the code that fills this struct.
  pub rd_cost: f64,
  /// Partition type this group refers to.
  pub part_type: PartitionType,
  /// Per-block parameters; the fixed capacity allows at most four entries.
  pub part_modes: ArrayVec<[PartitionParameters; 4]>,
}
86 
/// Coding parameters recorded for a single block, together with the RD cost
/// they were evaluated at.
#[derive(Clone, Debug)]
pub struct PartitionParameters {
  /// RD cost of these parameters (`f64::MAX` in the default value).
  pub rd_cost: f64,
  /// Block offset, relative to the tile.
  pub bo: TileBlockOffset,
  /// Block size (`BLOCK_INVALID` in the default value).
  pub bsize: BlockSize,
  /// Prediction mode used for luma.
  pub pred_mode_luma: PredictionMode,
  /// Prediction mode used for chroma.
  pub pred_mode_chroma: PredictionMode,
  /// Chroma-from-luma parameters.
  pub pred_cfl_params: CFLParams,
  /// Angle delta passed along with the prediction modes.
  pub angle_delta: AngleDelta,
  /// Reference frames; the default is `[INTRA_FRAME, NONE_FRAME]`.
  pub ref_frames: [RefType; 2],
  /// Motion vectors, one per entry of `ref_frames`.
  pub mvs: [MotionVector; 2],
  /// Whether the block is coded in skip mode.
  pub skip: bool,
  /// Whether encoding the block reported coefficients.
  pub has_coeff: bool,
  /// Transform size.
  pub tx_size: TxSize,
  /// Transform type.
  pub tx_type: TxType,
  /// Segmentation index.
  pub sidx: u8,
}
104 
105 impl Default for PartitionParameters {
default() -> Self106   fn default() -> Self {
107     PartitionParameters {
108       rd_cost: std::f64::MAX,
109       bo: TileBlockOffset::default(),
110       bsize: BlockSize::BLOCK_INVALID,
111       pred_mode_luma: PredictionMode::default(),
112       pred_mode_chroma: PredictionMode::default(),
113       pred_cfl_params: CFLParams::default(),
114       angle_delta: AngleDelta::default(),
115       ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME],
116       mvs: [MotionVector::default(); 2],
117       skip: false,
118       has_coeff: true,
119       tx_size: TxSize::TX_4X4,
120       tx_type: TxType::DCT_DCT,
121       sidx: 0,
122     }
123   }
124 }
125 
estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64126 pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 {
127   let bs_index = ts as usize;
128   let q_bin_idx = (qindex as usize) / RDO_QUANT_DIV;
129   let bin_idx_down =
130     ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64);
131   let bin_idx_up = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64);
132   let x0 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64;
133   let x1 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64;
134   let y0 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64;
135   let y1 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64;
136   let slope = ((y1 - y0) << 8) / (x1 - x0);
137   (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64
138 }
139 
/// Computes the distortion between two 8x8 blocks as the sum of squared
/// errors multiplied by a factor (`ssim_boost`) derived from the variances of
/// both blocks.
///
/// `src1` and `src2` must belong to non-subsampled planes (checked in debug
/// builds). `bit_depth` is used to rescale the two tuning constants.
// The microbenchmarks perform better with inlining turned off
#[inline(never)]
fn cdef_dist_wxh_8x8<T: Pixel>(
  src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, bit_depth: usize,
) -> RawDistortion {
  debug_assert!(src1.plane_cfg.xdec == 0);
  debug_assert!(src1.plane_cfg.ydec == 0);
  debug_assert!(src2.plane_cfg.xdec == 0);
  debug_assert!(src2.plane_cfg.ydec == 0);

  // Number of bits above 8; assumes bit_depth >= 8 (the subtraction would
  // underflow otherwise).
  let coeff_shift = bit_depth - 8;

  // Sum into columns to improve auto-vectorization
  // (s = samples of src1, d = samples of src2).
  let mut sum_s_cols: [u16; 8] = [0; 8];
  let mut sum_d_cols: [u16; 8] = [0; 8];
  let mut sum_s2_cols: [u32; 8] = [0; 8];
  let mut sum_d2_cols: [u32; 8] = [0; 8];
  let mut sum_sd_cols: [u32; 8] = [0; 8];

  for j in 0..8 {
    let row1 = &src1[j][0..8];
    let row2 = &src2[j][0..8];
    for (sum_s, sum_d, sum_s2, sum_d2, sum_sd, s, d) in izip!(
      &mut sum_s_cols,
      &mut sum_d_cols,
      &mut sum_s2_cols,
      &mut sum_d2_cols,
      &mut sum_sd_cols,
      row1,
      row2
    ) {
      // Don't convert directly to u32 to allow better vectorization
      let s: u16 = u16::cast_from(*s);
      let d: u16 = u16::cast_from(*d);
      *sum_s += s;
      *sum_d += d;

      // Convert to u32 to avoid overflows when multiplying
      let s: u32 = s as u32;
      let d: u32 = d as u32;

      *sum_s2 += s * s;
      *sum_d2 += d * d;
      *sum_sd += s * d;
    }
  }

  // Sum together the sum of columns
  let sum_s: i64 =
    sum_s_cols.iter().map(|&a| u32::cast_from(a)).sum::<u32>() as i64;
  let sum_d: i64 =
    sum_d_cols.iter().map(|&a| u32::cast_from(a)).sum::<u32>() as i64;
  let sum_s2: i64 = sum_s2_cols.iter().sum::<u32>() as i64;
  let sum_d2: i64 = sum_d2_cols.iter().sum::<u32>() as i64;
  let sum_sd: i64 = sum_sd_cols.iter().sum::<u32>() as i64;

  // Use sums to calculate distortion.
  // svar/dvar: sum of squares minus the squared sum divided by 64 (the
  // number of samples), rounded to nearest.
  let svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
  let dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
  // Sum of squared differences, expanded as sum(d^2) + sum(s^2) - 2*sum(s*d).
  let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as f64;
  // The two constants were tuned for CDEF, but can probably be better tuned
  // for use in general RDO
  let ssim_boost = (4033_f64 / 16_384_f64)
    * (svar + dvar + (16_384 << (2 * coeff_shift))) as f64
    / f64::sqrt(((16_265_089i64 << (4 * coeff_shift)) + svar * dvar) as f64);
  // Adding 0.5 before the truncating cast rounds to nearest.
  RawDistortion::new((sse * ssim_boost + 0.5_f64) as u64)
}
206 
207 #[allow(unused)]
cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, compute_bias: F, ) -> Distortion208 pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
209   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
210   bit_depth: usize, compute_bias: F,
211 ) -> Distortion {
212   assert!(w & 0x7 == 0);
213   assert!(h & 0x7 == 0);
214   debug_assert!(src1.plane_cfg.xdec == 0);
215   debug_assert!(src1.plane_cfg.ydec == 0);
216   debug_assert!(src2.plane_cfg.xdec == 0);
217   debug_assert!(src2.plane_cfg.ydec == 0);
218 
219   let mut sum = Distortion::zero();
220   for j in 0isize..h as isize / 8 {
221     for i in 0isize..w as isize / 8 {
222       let area = Area::StartingAt { x: i * 8, y: j * 8 };
223       let value = cdef_dist_wxh_8x8(
224         &src1.subregion(area),
225         &src2.subregion(area),
226         bit_depth,
227       );
228 
229       // cdef is always called on non-subsampled planes, so BLOCK_8X8 is
230       // correct here.
231       sum += value * compute_bias(area, BlockSize::BLOCK_8X8);
232     }
233   }
234   sum
235 }
236 
237 /// Sum of Squared Error for a wxh block
238 /// Currently limited to w and h of valid blocks
sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, ) -> Distortion239 pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>(
240   src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize,
241   compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel,
242 ) -> Distortion {
243   // See get_weighted_sse in src/dist.rs.
244   // Provide a scale to get_weighted_sse for each square region of this size.
245   const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1;
246 
247   // To bias the distortion correctly, compute it in blocks up to the size
248   // importance block size in a non-subsampled plane.
249   let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec;
250   let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec;
251 
252   let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h);
253 
254   let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE;
255   let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE;
256 
257   // TODO: Copying biases into a buffer is slow. It would be best if biases were
258   // passed directly. To do this, we would need different versions of the
259   // weighted sse function for decimated/subsampled data. Also requires
260   // eliminating use of unbiased sse.
261   // It should also be noted that the current copy code does not auto-vectorize.
262 
263   // Copy biases into a buffer.
264   let mut buf_storage = Aligned::new(
265     [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE],
266   );
267   let buf_stride = n_imp_blocks_w.next_power_of_two();
268   let buf = init_slice_repeat_mut(
269     &mut buf_storage.data[..buf_stride * n_imp_blocks_h],
270     0,
271   );
272 
273   for block_y in 0..n_imp_blocks_h {
274     for block_x in 0..n_imp_blocks_w {
275       let block = Area::StartingAt {
276         x: (block_x * CHUNK_SIZE) as isize,
277         y: (block_y * CHUNK_SIZE) as isize,
278       };
279       buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0;
280     }
281   }
282 
283   Distortion(get_weighted_sse(
284     src1, src2, buf, buf_stride, w, h, bit_depth, cpu,
285   ))
286 }
287 
clip_visible_bsize( frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize, ) -> (usize, usize)288 pub fn clip_visible_bsize(
289   frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize,
290 ) -> (usize, usize) {
291   let blk_w = bsize.width();
292   let blk_h = bsize.height();
293 
294   let visible_w: usize = if x + blk_w <= frame_w {
295     blk_w
296   } else if x >= frame_w {
297     0
298   } else {
299     frame_w - x
300   };
301 
302   let visible_h: usize = if y + blk_h <= frame_h {
303     blk_h
304   } else if y >= frame_h {
305     0
306   } else {
307     frame_h - y
308   };
309 
310   (visible_w, visible_h)
311 }
312 
313 // Compute the pixel-domain distortion for an encode
compute_distortion<T: Pixel>( fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool, ) -> ScaledDistortion314 fn compute_distortion<T: Pixel>(
315   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
316   is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool,
317 ) -> ScaledDistortion {
318   let area = Area::BlockStartingAt { bo: tile_bo.0 };
319   let input_region = ts.input_tile.planes[0].subregion(area);
320   let rec_region = ts.rec.planes[0].subregion(area);
321 
322   // clip a block to have visible pixles only
323   let frame_bo = ts.to_frame_block_offset(tile_bo);
324   let (visible_w, visible_h) = clip_visible_bsize(
325     fi.width,
326     fi.height,
327     bsize,
328     frame_bo.0.x << MI_SIZE_LOG2,
329     frame_bo.0.y << MI_SIZE_LOG2,
330   );
331 
332   if visible_w == 0 || visible_h == 0 {
333     return ScaledDistortion::zero();
334   }
335 
336   let mut distortion = match fi.config.tune {
337     Tune::Psychovisual
338       if bsize.width() >= 8
339         && bsize.height() >= 8
340         && (visible_w & 0x7 == 0)
341         && (visible_h & 0x7 == 0) =>
342     {
343       cdef_dist_wxh(
344         &input_region,
345         &rec_region,
346         visible_w,
347         visible_h,
348         fi.sequence.bit_depth,
349         |bias_area, bsize| {
350           distortion_scale(
351             fi,
352             input_region.subregion(bias_area).frame_block_offset(),
353             bsize,
354           )
355         },
356       )
357     }
358     Tune::Psnr | Tune::Psychovisual => sse_wxh(
359       &input_region,
360       &rec_region,
361       visible_w,
362       visible_h,
363       |bias_area, bsize| {
364         distortion_scale(
365           fi,
366           input_region.subregion(bias_area).frame_block_offset(),
367           bsize,
368         )
369       },
370       fi.sequence.bit_depth,
371       fi.cpu_feature_level,
372     ),
373   } * fi.dist_scale[0];
374 
375   if is_chroma_block
376     && !luma_only
377     && fi.sequence.chroma_sampling != ChromaSampling::Cs400
378   {
379     let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
380     let chroma_w = if bsize.width() >= 8 || xdec == 0 {
381       (visible_w + xdec) >> xdec
382     } else {
383       (4 + visible_w + xdec) >> xdec
384     };
385     let chroma_h = if bsize.height() >= 8 || ydec == 0 {
386       (visible_h + ydec) >> ydec
387     } else {
388       (4 + visible_h + ydec) >> ydec
389     };
390 
391     for p in 1..3 {
392       let input_region = ts.input_tile.planes[p].subregion(area);
393       let rec_region = ts.rec.planes[p].subregion(area);
394       distortion += sse_wxh(
395         &input_region,
396         &rec_region,
397         chroma_w,
398         chroma_h,
399         |bias_area, bsize| {
400           distortion_scale(
401             fi,
402             input_region.subregion(bias_area).frame_block_offset(),
403             bsize,
404           )
405         },
406         fi.sequence.bit_depth,
407         fi.cpu_feature_level,
408       ) * fi.dist_scale[p];
409     }
410   }
411   distortion
412 }
413 
414 // Compute the transform-domain distortion for an encode
compute_tx_distortion<T: Pixel>( fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, skip: bool, luma_only: bool, ) -> ScaledDistortion415 fn compute_tx_distortion<T: Pixel>(
416   fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize,
417   is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion,
418   skip: bool, luma_only: bool,
419 ) -> ScaledDistortion {
420   assert!(fi.config.tune == Tune::Psnr);
421   let area = Area::BlockStartingAt { bo: tile_bo.0 };
422   let input_region = ts.input_tile.planes[0].subregion(area);
423   let rec_region = ts.rec.planes[0].subregion(area);
424 
425   let (visible_w, visible_h) = if !skip {
426     (bsize.width(), bsize.height())
427   } else {
428     let frame_bo = ts.to_frame_block_offset(tile_bo);
429     clip_visible_bsize(
430       fi.width,
431       fi.height,
432       bsize,
433       frame_bo.0.x << MI_SIZE_LOG2,
434       frame_bo.0.y << MI_SIZE_LOG2,
435     )
436   };
437 
438   if visible_w == 0 || visible_h == 0 {
439     return ScaledDistortion::zero();
440   }
441 
442   let mut distortion = if skip {
443     sse_wxh(
444       &input_region,
445       &rec_region,
446       visible_w,
447       visible_h,
448       |bias_area, bsize| {
449         distortion_scale(
450           fi,
451           input_region.subregion(bias_area).frame_block_offset(),
452           bsize,
453         )
454       },
455       fi.sequence.bit_depth,
456       fi.cpu_feature_level,
457     ) * fi.dist_scale[0]
458   } else {
459     tx_dist
460   };
461 
462   if is_chroma_block
463     && !luma_only
464     && skip
465     && fi.sequence.chroma_sampling != ChromaSampling::Cs400
466   {
467     let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
468     let chroma_w = if bsize.width() >= 8 || xdec == 0 {
469       (visible_w + xdec) >> xdec
470     } else {
471       (4 + visible_w + xdec) >> xdec
472     };
473     let chroma_h = if bsize.height() >= 8 || ydec == 0 {
474       (visible_h + ydec) >> ydec
475     } else {
476       (4 + visible_h + ydec) >> ydec
477     };
478 
479     for p in 1..3 {
480       let input_region = ts.input_tile.planes[p].subregion(area);
481       let rec_region = ts.rec.planes[p].subregion(area);
482       distortion += sse_wxh(
483         &input_region,
484         &rec_region,
485         chroma_w,
486         chroma_h,
487         |bias_area, bsize| {
488           distortion_scale(
489             fi,
490             input_region.subregion(bias_area).frame_block_offset(),
491             bsize,
492           )
493         },
494         fi.sequence.bit_depth,
495         fi.cpu_feature_level,
496       ) * fi.dist_scale[p];
497     }
498   }
499   distortion
500 }
501 
502 /// Compute a scaling factor to multiply the distortion of a block by,
503 /// this factor is determined using temporal RDO.
distortion_scale<T: Pixel>( fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale504 pub fn distortion_scale<T: Pixel>(
505   fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize,
506 ) -> DistortionScale {
507   if !fi.config.temporal_rdo() {
508     return DistortionScale::default();
509   }
510   // EncoderConfig::temporal_rdo() should always return false in situations
511   // where distortion is computed on > 8x8 blocks, so we should never hit this
512   // assert.
513   assert!(bsize <= BlockSize::BLOCK_8X8);
514 
515   let x = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
516   let y = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT;
517 
518   fi.distortion_scales[y * fi.w_in_imp_b + x]
519 }
520 
distortion_scale_for( propagate_cost: f64, intra_cost: f64, ) -> DistortionScale521 pub fn distortion_scale_for(
522   propagate_cost: f64, intra_cost: f64,
523 ) -> DistortionScale {
524   // The mbtree paper \cite{mbtree} uses the following formula:
525   //
526   //     QP_delta = -strength * log2(1 + (propagate_cost / intra_cost))
527   //
528   // Since this is H.264, this corresponds to the following quantizer:
529   //
530   //     Q' = Q * 2^(QP_delta/6)
531   //
532   // Since lambda is proportial to Q^2, this means we want to minimize:
533   //
534   //     D + lambda' * R
535   //   = D + 2^(QP_delta / 3) * lambda * R
536   //
537   // If we want to keep lambda fixed, we can instead scale distortion and
538   // minimize:
539   //
540   //     D * scale + lambda * R
541   //
542   // where:
543   //
544   //     scale = 2^(QP_delta / -3)
545   //           = (1 + (propagate_cost / intra_cost))^(strength / 3)
546   //
547   //  The original paper empirically chooses strength = 2.0, but strength = 1.0
548   //  seems to work best in rav1e currently, this may have something to do with
549   //  the fact that they use 16x16 blocks whereas our "importance blocks" are
550   //  8x8, but everything should be scale invariant here so that's weird.
551   //
552   // @article{mbtree,
553   //   title={A novel macroblock-tree algorithm for high-performance
554   //    optimization of dependent video coding in H.264/AVC},
555   //   author={Garrett-Glaser, Jason},
556   //   journal={Tech. Rep.},
557   //   year={2009},
558   //   url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf}
559   // }
560 
561   if intra_cost == 0. {
562     return DistortionScale::default(); // no scaling
563   }
564 
565   let strength = 1.0; // empirical, see comment above
566   let frac = (intra_cost + propagate_cost) / intra_cost;
567   DistortionScale::new(frac.powf(strength / 3.0))
568 }
569 
/// Fixed point arithmetic version of distortion scale
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct DistortionScale(pub u32);

/// Distortion to which no `DistortionScale` has been applied yet.
#[repr(transparent)]
pub struct RawDistortion(u64);

/// Distortion after weighting: the product of a `RawDistortion` and a
/// `DistortionScale`.
#[repr(transparent)]
pub struct Distortion(pub u64);

/// A `Distortion` multiplied by a floating-point factor.
#[repr(transparent)]
pub struct ScaledDistortion(u64);

impl DistortionScale {
  /// Bits past the radix point
  const SHIFT: u32 = 12;
  /// Number of bits used. Determines the max value.
  /// 24 bits is likely excessive.
  const BITS: u32 = 24;

  /// Converts `scale` to fixed point, rounding to nearest and saturating at
  /// the largest value representable in `BITS` bits.
  #[inline]
  pub fn new(scale: f64) -> Self {
    let one = f64::from(1u32 << Self::SHIFT);
    let largest = f64::from((1u32 << Self::BITS) - 1);
    Self((scale * one + 0.5).min(largest) as u32)
  }

  /// Multiply, round and shift
  /// Internal implementation, so don't use multiply trait.
  #[inline]
  fn mul_u64(self, dist: u64) -> u64 {
    // Half of one unit in the last place, added to round to nearest.
    let half = 1u64 << (Self::SHIFT - 1);
    (u64::from(self.0) * dist + half) >> Self::SHIFT
  }
}

// Default value for DistortionScale is a fixed point 1
impl Default for DistortionScale {
  #[inline]
  fn default() -> Self {
    DistortionScale(1 << DistortionScale::SHIFT)
  }
}

impl fmt::Debug for DistortionScale {
  /// Prints the scale as the floating-point value it represents.
  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
    let value = f64::from(*self);
    f.write_fmt(format_args!("{}", value))
  }
}

impl From<DistortionScale> for f64 {
  /// Converts the fixed-point scale back to floating point.
  #[inline]
  fn from(scale: DistortionScale) -> Self {
    let one = f64::from(1u32 << DistortionScale::SHIFT);
    f64::from(scale.0) / one
  }
}

impl RawDistortion {
  /// Wraps an unscaled distortion value.
  #[inline]
  pub const fn new(dist: u64) -> Self {
    RawDistortion(dist)
  }
}

impl std::ops::Mul<DistortionScale> for RawDistortion {
  type Output = Distortion;

  /// Applies the scale, rounding the result to the nearest integer.
  #[inline]
  fn mul(self, rhs: DistortionScale) -> Distortion {
    let RawDistortion(raw) = self;
    Distortion(rhs.mul_u64(raw))
  }
}

impl Distortion {
  /// The zero distortion, used as the start of an accumulation.
  #[inline]
  pub const fn zero() -> Self {
    Distortion(0)
  }
}

impl std::ops::Mul<f64> for Distortion {
  type Output = ScaledDistortion;

  /// Multiplies in floating point and truncates back to an integer.
  #[inline]
  fn mul(self, rhs: f64) -> ScaledDistortion {
    let scaled = self.0 as f64 * rhs;
    ScaledDistortion(scaled as u64)
  }
}

impl std::ops::AddAssign for Distortion {
  #[inline]
  fn add_assign(&mut self, other: Self) {
    let Self(addend) = other;
    self.0 += addend;
  }
}

impl ScaledDistortion {
  /// The zero distortion, used as the start of an accumulation.
  #[inline]
  pub const fn zero() -> Self {
    ScaledDistortion(0)
  }
}

impl std::ops::AddAssign for ScaledDistortion {
  #[inline]
  fn add_assign(&mut self, other: Self) {
    let Self(addend) = other;
    self.0 += addend;
  }
}
678 
compute_rd_cost<T: Pixel>( fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion, ) -> f64679 pub fn compute_rd_cost<T: Pixel>(
680   fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion,
681 ) -> f64 {
682   let rate_in_bits = (rate as f64) / ((1 << OD_BITRES) as f64);
683   distortion.0 as f64 + fi.lambda * rate_in_bits
684 }
685 
rdo_tx_size_type<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], skip: bool, ) -> (TxSize, TxType)686 pub fn rdo_tx_size_type<T: Pixel>(
687   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
688   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
689   luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
690   skip: bool,
691 ) -> (TxSize, TxType) {
692   let is_inter = !luma_mode.is_intra();
693   let mut tx_size = max_txsize_rect_lookup[bsize as usize];
694 
695   if fi.enable_inter_txfm_split && is_inter && !skip {
696     tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size
697   }
698 
699   let mut best_tx_type = TxType::DCT_DCT;
700   let mut best_tx_size = tx_size;
701   let mut best_rd = std::f64::MAX;
702 
703   let do_rdo_tx_size =
704     fi.tx_mode_select && fi.config.speed_settings.rdo_tx_decision && !is_inter;
705   let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 };
706   let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None;
707 
708   for _ in 0..=rdo_tx_depth {
709     let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set);
710 
711     let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY
712       && fi.config.speed_settings.rdo_tx_decision
713       && !is_inter
714       && !skip;
715 
716     if !do_rdo_tx_size && !do_rdo_tx_type {
717       return (best_tx_size, best_tx_type);
718     };
719 
720     let tx_types =
721       if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };
722 
723     // Luma plane transform type decision
724     let (tx_type, rd_cost) = rdo_tx_type_decision(
725       fi,
726       ts,
727       cw,
728       &mut cw_checkpoint,
729       luma_mode,
730       ref_frames,
731       mvs,
732       bsize,
733       tile_bo,
734       tx_size,
735       tx_set,
736       tx_types,
737     );
738 
739     if rd_cost < best_rd {
740       best_tx_size = tx_size;
741       best_tx_type = tx_type;
742       best_rd = rd_cost;
743     }
744 
745     debug_assert!(tx_size.width_log2() <= bsize.width_log2());
746     debug_assert!(tx_size.height_log2() <= bsize.height_log2());
747     debug_assert!(
748       tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT
749     );
750 
751     let next_tx_size = sub_tx_size_map[tx_size as usize];
752 
753     if next_tx_size == tx_size {
754       break;
755     } else {
756       tx_size = next_tx_size;
757     };
758   }
759 
760   (best_tx_size, best_tx_type)
761 }
762 
763 #[inline]
dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool764 fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool {
765   let diff_row = mv.row as i32 - ref_mv.row as i32;
766   let diff_col = mv.col as i32 - ref_mv.col as i32;
767   diff_row >= MV_LOW
768     && diff_row <= MV_UPP
769     && diff_col >= MV_LOW
770     && diff_col <= MV_UPP
771 }
772 
/// Evaluates one luma prediction mode together with every chroma mode of
/// `mode_set_chroma` and records the cheapest combination in `best`.
///
/// Every candidate is encoded into a `WriterCounter` to measure its rate, its
/// distortion is computed, and `best` is overwritten whenever the resulting
/// RD cost is lower than `best.rd_cost`. The context writer is rolled back to
/// `cw_checkpoint` after each candidate.
///
/// For inter modes the skip variant is tried first; the non-skip variant is
/// only tried if the skip variant did not produce a new best candidate with
/// zero distortion. Intra modes are only tried without skip.
///
/// Returns without evaluating anything when the mode codes a new motion
/// vector whose difference to the reference motion vector is out of range.
#[inline]
fn luma_chroma_mode_rdo<T: Pixel>(
  luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
  tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
  cw: &mut ContextWriter, rdo_type: RDOType,
  cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters,
  mvs: [MotionVector; 2], ref_frames: [RefType; 2],
  mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool,
  mode_context: usize, mv_stack: &ArrayVec<[CandidateMV; 9]>,
  angle_delta: AngleDelta,
) {
  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;

  let is_chroma_block =
    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);

  if !luma_mode_is_intra {
    // Reference motion vectors: the first entry of the candidate stack, or
    // zero vectors when the stack is empty.
    let ref_mvs = if mv_stack.is_empty() {
      [MotionVector::default(); 2]
    } else {
      [mv_stack[0].this_mv, mv_stack[0].comp_mv]
    };

    // Modes with a new first motion vector: reject them when the difference
    // to the reference is out of range.
    if (luma_mode == PredictionMode::NEWMV
      || luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEW_NEARESTMV)
      && !dmv_in_range(mvs[0], ref_mvs[0])
    {
      return;
    }

    // Same check for modes with a new second motion vector.
    if (luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEAREST_NEWMV)
      && !dmv_in_range(mvs[1], ref_mvs[1])
    {
      return;
    }
  }

  // Find the best chroma prediction mode for the current luma prediction mode.
  // Returns true when the last candidate that improved `best` during this
  // call had zero distortion.
  let mut chroma_rdo = |skip: bool| -> bool {
    let mut zero_distortion = false;

    // If skip is true or segmentation is turned off, sidx is not coded.
    // Otherwise two or three segment indices are tried, depending on the
    // base quantizer index plus the alt-q value stored for segment 2.
    let sidx_range = if skip || !fi.enable_segmentation {
      0..=0
    } else if fi.base_q_idx as i16
      + ts.segmentation.data[2][SegLvl::SEG_LVL_ALT_Q as usize]
      < 1
    {
      0..=1
    } else {
      0..=2
    };

    for sidx in sidx_range {
      cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);

      // The transform size/type is chosen once per segment index and shared
      // by all chroma modes.
      let (tx_size, tx_type) = rdo_tx_size_type(
        fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
      );
      for &chroma_mode in mode_set_chroma.iter() {
        // Counting writer: measures the rate without producing output.
        let wr = &mut WriterCounter::new();
        let tell = wr.tell_frac();

        if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() {
          cw.write_partition(
            wr,
            tile_bo,
            PartitionType::PARTITION_NONE,
            bsize,
          );
        }

        // TODO(yushin): luma and chroma would have different decision based on chroma format
        // True for intra blocks whose transform does not cover the whole
        // block.
        let need_recon_pixel =
          luma_mode_is_intra && tx_size.block_size() != bsize;

        encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
        let (has_coeff, tx_dist) = encode_block_post_cdef(
          fi,
          ts,
          cw,
          wr,
          luma_mode,
          chroma_mode,
          angle_delta,
          ref_frames,
          mvs,
          bsize,
          tile_bo,
          skip,
          CFLParams::default(),
          tx_size,
          tx_type,
          mode_context,
          mv_stack,
          rdo_type,
          need_recon_pixel,
          false,
        );

        // Rate of this candidate: what was written since `tell`.
        let rate = wr.tell_frac() - tell;
        // Transform-domain distortion is only used when it is enabled and
        // the reconstructed pixels were not needed.
        let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
          compute_tx_distortion(
            fi,
            ts,
            bsize,
            is_chroma_block,
            tile_bo,
            tx_dist,
            skip,
            false,
          )
        } else {
          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
        };
        let is_zero_dist = distortion.0 == 0;
        let rd = compute_rd_cost(fi, rate, distortion);
        if rd < best.rd_cost {
          //if rd < best.rd_cost || luma_mode == PredictionMode::NEW_NEWMV {
          best.rd_cost = rd;
          best.pred_mode_luma = luma_mode;
          best.pred_mode_chroma = chroma_mode;
          best.angle_delta = angle_delta;
          best.ref_frames = ref_frames;
          best.mvs = mvs;
          best.skip = skip;
          best.has_coeff = has_coeff;
          best.tx_size = tx_size;
          best.tx_type = tx_type;
          best.sidx = sidx;
          zero_distortion = is_zero_dist;
        }

        // Undo the effects of this trial on the context writer.
        cw.rollback(cw_checkpoint);
      }
    }

    zero_distortion
  };

  // Don't skip when using intra modes
  let zero_distortion =
    if !luma_mode_is_intra { chroma_rdo(true) } else { false };
  // early skip: the non-skip variant is not tried when skipping already
  // gave a new best candidate with zero distortion
  if !zero_distortion {
    chroma_rdo(false);
  }
}
923 
924 // RDO-based mode decision
rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, ) -> PartitionParameters925 pub fn rdo_mode_decision<T: Pixel>(
926   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
927   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
928   inter_cfg: &InterConfig,
929 ) -> PartitionParameters {
930   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
931   let cw_checkpoint = cw.checkpoint();
932 
933   let rdo_type = if fi.use_tx_domain_rate {
934     RDOType::TxDistEstRate
935   } else if fi.use_tx_domain_distortion {
936     RDOType::TxDistRealRate
937   } else {
938     RDOType::PixelDistRealRate
939   };
940 
941   let mut best = if fi.frame_type.has_inter() {
942     assert!(fi.frame_type != FrameType::KEY);
943 
944     inter_frame_rdo_mode_decision(
945       fi,
946       ts,
947       cw,
948       bsize,
949       tile_bo,
950       inter_cfg,
951       &cw_checkpoint,
952       rdo_type,
953     )
954   } else {
955     PartitionParameters::default()
956   };
957 
958   let is_chroma_block =
959     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
960 
961   if !best.skip {
962     best = intra_frame_rdo_mode_decision(
963       fi,
964       ts,
965       cw,
966       bsize,
967       tile_bo,
968       &cw_checkpoint,
969       rdo_type,
970       best,
971       is_chroma_block,
972     );
973   }
974 
975   if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() {
976     cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx);
977 
978     let chroma_mode = PredictionMode::UV_CFL_PRED;
979     let cw_checkpoint = cw.checkpoint();
980     let wr: &mut dyn Writer = &mut WriterCounter::new();
981     let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 };
982 
983     write_tx_blocks(
984       fi,
985       ts,
986       cw,
987       wr,
988       best.pred_mode_luma,
989       best.pred_mode_luma,
990       angle_delta,
991       tile_bo,
992       bsize,
993       best.tx_size,
994       best.tx_type,
995       false,
996       CFLParams::default(),
997       true,
998       rdo_type,
999       true,
1000     );
1001     cw.rollback(&cw_checkpoint);
1002     if fi.sequence.chroma_sampling != ChromaSampling::Cs400 {
1003       if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) {
1004         let wr: &mut dyn Writer = &mut WriterCounter::new();
1005         let tell = wr.tell_frac();
1006 
1007         encode_block_pre_cdef(
1008           &fi.sequence,
1009           ts,
1010           cw,
1011           wr,
1012           bsize,
1013           tile_bo,
1014           best.skip,
1015         );
1016         let (has_coeff, _) = encode_block_post_cdef(
1017           fi,
1018           ts,
1019           cw,
1020           wr,
1021           best.pred_mode_luma,
1022           chroma_mode,
1023           angle_delta,
1024           best.ref_frames,
1025           best.mvs,
1026           bsize,
1027           tile_bo,
1028           best.skip,
1029           cfl,
1030           best.tx_size,
1031           best.tx_type,
1032           0,
1033           &[],
1034           rdo_type,
1035           true, // For CFL, luma should be always reconstructed.
1036           false,
1037         );
1038 
1039         let rate = wr.tell_frac() - tell;
1040 
1041         // For CFL, tx-domain distortion is not an option.
1042         let distortion =
1043           compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false);
1044         let rd = compute_rd_cost(fi, rate, distortion);
1045         if rd < best.rd_cost {
1046           best.rd_cost = rd;
1047           best.pred_mode_chroma = chroma_mode;
1048           best.angle_delta = angle_delta;
1049           best.has_coeff = has_coeff;
1050           best.pred_cfl_params = cfl;
1051         }
1052 
1053         cw.rollback(&cw_checkpoint);
1054       }
1055     }
1056   }
1057 
1058   cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma);
1059   cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames);
1060   cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs);
1061 
1062   assert!(best.rd_cost >= 0_f64);
1063 
1064   PartitionParameters {
1065     bo: tile_bo,
1066     bsize,
1067     pred_mode_luma: best.pred_mode_luma,
1068     pred_mode_chroma: best.pred_mode_chroma,
1069     pred_cfl_params: best.pred_cfl_params,
1070     angle_delta: best.angle_delta,
1071     ref_frames: best.ref_frames,
1072     mvs: best.mvs,
1073     rd_cost: best.rd_cost,
1074     skip: best.skip,
1075     has_coeff: best.has_coeff,
1076     tx_size: best.tx_size,
1077     tx_type: best.tx_type,
1078     sidx: best.sidx,
1079   }
1080 }
1081 
inter_frame_rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, ) -> PartitionParameters1082 fn inter_frame_rdo_mode_decision<T: Pixel>(
1083   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1084   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1085   inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint,
1086   rdo_type: RDOType,
1087 ) -> PartitionParameters {
1088   let mut best = PartitionParameters::default();
1089 
1090   // we can never have more than 7 reference frame sets
1091   let mut ref_frames_set = ArrayVec::<[_; 7]>::new();
1092   // again, max of 7 ref slots
1093   let mut ref_slot_set = ArrayVec::<[_; 7]>::new();
1094   // our implementation never returns more than 3 at the moment
1095   let mut mvs_from_me = ArrayVec::<[_; 3]>::new();
1096   let mut fwdref = None;
1097   let mut bwdref = None;
1098 
1099   for i in inter_cfg.allowed_ref_frames().iter().copied() {
1100     // Don't search LAST3 since it's used only for probs
1101     if i == LAST3_FRAME {
1102       continue;
1103     }
1104 
1105     if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) {
1106       if fwdref == None && i.is_fwd_ref() {
1107         fwdref = Some(ref_frames_set.len());
1108       }
1109       if bwdref == None && i.is_bwd_ref() {
1110         bwdref = Some(ref_frames_set.len());
1111       }
1112       ref_frames_set.push([i, NONE_FRAME]);
1113       let slot_idx = fi.ref_frames[i.to_index()];
1114       ref_slot_set.push(slot_idx);
1115     }
1116   }
1117   assert!(!ref_frames_set.is_empty());
1118 
1119   let mut inter_mode_set = ArrayVec::<[(PredictionMode, usize); 20]>::new();
1120   let mut mvs_set = ArrayVec::<[[MotionVector; 2]; 20]>::new();
1121   let mut satds = ArrayVec::<[u32; 20]>::new();
1122   let mut mv_stacks = ArrayVec::<[_; 20]>::new();
1123   let mut mode_contexts = ArrayVec::<[_; 7]>::new();
1124 
1125   for (i, &ref_frames) in ref_frames_set.iter().enumerate() {
1126     let mut mv_stack = ArrayVec::<[CandidateMV; 9]>::new();
1127     mode_contexts.push(cw.find_mvrefs(
1128       tile_bo,
1129       ref_frames,
1130       &mut mv_stack,
1131       bsize,
1132       fi,
1133       false,
1134     ));
1135 
1136     let mut pmv = [MotionVector::default(); 2];
1137     if !mv_stack.is_empty() {
1138       pmv[0] = mv_stack[0].this_mv;
1139     }
1140     if mv_stack.len() > 1 {
1141       pmv[1] = mv_stack[1].this_mv;
1142     }
1143 
1144     let res = motion_estimation(fi, ts, bsize, tile_bo, ref_frames[0], pmv);
1145     let b_me = res.0;
1146 
1147     mvs_from_me.push([b_me, MotionVector::default()]);
1148 
1149     for &x in RAV1E_INTER_MODES_MINIMAL {
1150       inter_mode_set.push((x, i));
1151     }
1152     if !mv_stack.is_empty() {
1153       inter_mode_set.push((PredictionMode::NEAR0MV, i));
1154     }
1155     if mv_stack.len() >= 2 {
1156       inter_mode_set.push((PredictionMode::GLOBALMV, i));
1157     }
1158     let include_near_mvs = fi.config.speed_settings.include_near_mvs;
1159     if include_near_mvs {
1160       if mv_stack.len() >= 3 {
1161         inter_mode_set.push((PredictionMode::NEAR1MV, i));
1162       }
1163       if mv_stack.len() >= 4 {
1164         inter_mode_set.push((PredictionMode::NEAR2MV, i));
1165       }
1166     }
1167     let same_row_col = |x: &CandidateMV| {
1168       x.this_mv.row == mvs_from_me[i][0].row
1169         && x.this_mv.col == mvs_from_me[i][0].col
1170     };
1171     if !mv_stack
1172       .iter()
1173       .take(if include_near_mvs { 4 } else { 2 })
1174       .any(same_row_col)
1175       && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0)
1176     {
1177       inter_mode_set.push((PredictionMode::NEWMV, i));
1178     }
1179 
1180     mv_stacks.push(mv_stack);
1181   }
1182 
1183   let sz = bsize.width_mi().min(bsize.height_mi());
1184 
1185   // To use non single reference modes, block width and height must be greater than 4.
1186   if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 {
1187     // Adding compound candidate
1188     if let Some(r0) = fwdref {
1189       if let Some(r1) = bwdref {
1190         let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]];
1191         ref_frames_set.push(ref_frames);
1192         let mv0 = mvs_from_me[r0][0];
1193         let mv1 = mvs_from_me[r1][0];
1194         mvs_from_me.push([mv0, mv1]);
1195         let mut mv_stack = ArrayVec::<[CandidateMV; 9]>::new();
1196         mode_contexts.push(cw.find_mvrefs(
1197           tile_bo,
1198           ref_frames,
1199           &mut mv_stack,
1200           bsize,
1201           fi,
1202           true,
1203         ));
1204         for &x in RAV1E_INTER_COMPOUND_MODES {
1205           // exclude any NEAR mode based on speed setting
1206           if fi.config.speed_settings.include_near_mvs || !x.has_nearmv() {
1207             let mv_stack_idx = ref_frames_set.len() - 1;
1208             // exclude NEAR modes if the mv_stack is too short
1209             if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) {
1210               inter_mode_set.push((x, mv_stack_idx));
1211             }
1212           }
1213         }
1214         mv_stacks.push(mv_stack);
1215       }
1216     }
1217   }
1218 
1219   let num_modes_rdo = if fi.config.speed_settings.prediction_modes
1220     >= PredictionModesSetting::ComplexAll
1221   {
1222     inter_mode_set.len()
1223   } else {
1224     9 // This number is determined by AWCY test
1225   };
1226 
1227   inter_mode_set.iter().for_each(|&(luma_mode, i)| {
1228     let mvs = match luma_mode {
1229       PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i],
1230       PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => {
1231         if !mv_stacks[i].is_empty() {
1232           [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv]
1233         } else {
1234           [MotionVector::default(); 2]
1235         }
1236       }
1237       PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => {
1238         if mv_stacks[i].len() > 1 {
1239           [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv]
1240         } else {
1241           [MotionVector::default(); 2]
1242         }
1243       }
1244       PredictionMode::NEAR1MV
1245       | PredictionMode::NEAR2MV
1246       | PredictionMode::NEAR_NEAR1MV
1247       | PredictionMode::NEAR_NEAR2MV => [
1248         mv_stacks[i][luma_mode.ref_mv_idx()].this_mv,
1249         mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv,
1250       ],
1251       PredictionMode::NEAREST_NEWMV => {
1252         [mv_stacks[i][0].this_mv, mvs_from_me[i][1]]
1253       }
1254       PredictionMode::NEW_NEARESTMV => {
1255         [mvs_from_me[i][0], mv_stacks[i][0].comp_mv]
1256       }
1257       PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => {
1258         [MotionVector::default(); 2]
1259       }
1260       _ => {
1261         unimplemented!();
1262       }
1263     };
1264     mvs_set.push(mvs);
1265 
1266     // Calculate SATD for each mode
1267     if num_modes_rdo != inter_mode_set.len() {
1268       let tile_rect = ts.tile_rect();
1269       let rec = &mut ts.rec.planes[0];
1270       let po = tile_bo.plane_offset(rec.plane_cfg);
1271       let mut rec_region =
1272         rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1273 
1274       luma_mode.predict_inter(
1275         fi,
1276         tile_rect,
1277         0,
1278         po,
1279         &mut rec_region,
1280         bsize.width(),
1281         bsize.height(),
1282         ref_frames_set[i],
1283         mvs,
1284         &mut ts.inter_compound_buffers,
1285       );
1286 
1287       let plane_org = ts.input_tile.planes[0]
1288         .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1289       let plane_ref = rec_region.as_const();
1290 
1291       let satd = get_satd(
1292         &plane_org,
1293         &plane_ref,
1294         bsize,
1295         fi.sequence.bit_depth,
1296         fi.cpu_feature_level,
1297       );
1298       satds.push(satd);
1299     } else {
1300       satds.push(0);
1301     }
1302   });
1303 
1304   let mut sorted =
1305     izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<[_; 20]>>();
1306   if num_modes_rdo != sorted.len() {
1307     sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd);
1308   }
1309 
1310   sorted.iter().take(num_modes_rdo).for_each(
1311     |&((luma_mode, i), mvs, _satd)| {
1312       let mode_set_chroma = ArrayVec::from([luma_mode]);
1313 
1314       luma_chroma_mode_rdo(
1315         luma_mode,
1316         fi,
1317         bsize,
1318         tile_bo,
1319         ts,
1320         cw,
1321         rdo_type,
1322         cw_checkpoint,
1323         &mut best,
1324         mvs,
1325         ref_frames_set[i],
1326         &mode_set_chroma,
1327         false,
1328         mode_contexts[i],
1329         &mv_stacks[i],
1330         AngleDelta::default(),
1331       );
1332     },
1333   );
1334 
1335   best
1336 }
1337 
intra_frame_rdo_mode_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, mut best: PartitionParameters, is_chroma_block: bool, ) -> PartitionParameters1338 fn intra_frame_rdo_mode_decision<T: Pixel>(
1339   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1340   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1341   cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType,
1342   mut best: PartitionParameters, is_chroma_block: bool,
1343 ) -> PartitionParameters {
1344   let num_modes_rdo: usize;
1345   let mut modes = ArrayVec::<[_; INTRA_MODES]>::new();
1346 
1347   // Reduce number of prediction modes at higher speed levels
1348   num_modes_rdo = if (fi.frame_type == FrameType::KEY
1349     && fi.config.speed_settings.prediction_modes
1350       >= PredictionModesSetting::ComplexKeyframes)
1351     || (fi.frame_type.has_inter()
1352       && fi.config.speed_settings.prediction_modes
1353         >= PredictionModesSetting::ComplexAll)
1354   {
1355     7
1356   } else {
1357     3
1358   };
1359 
1360   let intra_mode_set = RAV1E_INTRA_MODES;
1361 
1362   // Find mode with lowest rate cost
1363   {
1364     let probs_all = if fi.frame_type.has_inter() {
1365       cw.get_cdf_intra_mode(bsize)
1366     } else {
1367       cw.get_cdf_intra_mode_kf(tile_bo)
1368     }
1369     .iter()
1370     .take(INTRA_MODES)
1371     .scan(32768, |z, &a| {
1372       let d = *z - a;
1373       *z = a;
1374       Some(!d)
1375     })
1376     .collect::<ArrayVec<[_; INTRA_MODES]>>();
1377 
1378     modes.try_extend_from_slice(intra_mode_set).unwrap();
1379     modes.sort_by_key(|&a| probs_all[a as usize]);
1380   }
1381 
1382   // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening
1383   // may be improved by emulating prediction for each tx block.
1384   {
1385     let satds = {
1386       // FIXME: If tx partition is used, this whole sads block should be fixed
1387       let tx_size = bsize.tx_size();
1388       let edge_buf = {
1389         let rec = &ts.rec.planes[0].as_const();
1390         let po = tile_bo.plane_offset(rec.plane_cfg);
1391         // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block
1392         get_intra_edges(
1393           rec,
1394           tile_bo,
1395           0,
1396           0,
1397           bsize,
1398           po,
1399           tx_size,
1400           fi.sequence.bit_depth,
1401           None,
1402           fi.sequence.enable_intra_edge_filter,
1403           IntraParam::None,
1404         )
1405       };
1406 
1407       let ief_params = if fi.sequence.enable_intra_edge_filter {
1408         let above_block_info = ts.above_block_info(tile_bo, 0, 0);
1409         let left_block_info = ts.left_block_info(tile_bo, 0, 0);
1410         Some(IntraEdgeFilterParameters::new(
1411           0,
1412           above_block_info,
1413           left_block_info,
1414         ))
1415       } else {
1416         None
1417       };
1418 
1419       let mut satds_all = [0; INTRA_MODES];
1420       for &luma_mode in modes.iter().skip(num_modes_rdo / 2) {
1421         let tile_rect = ts.tile_rect();
1422         let rec = &mut ts.rec.planes[0];
1423         let mut rec_region =
1424           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1425         // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block
1426         luma_mode.predict_intra(
1427           tile_rect,
1428           &mut rec_region,
1429           tx_size,
1430           fi.sequence.bit_depth,
1431           &[0i16; 2],
1432           IntraParam::None,
1433           if luma_mode.is_directional() { ief_params } else { None },
1434           &edge_buf,
1435           fi.cpu_feature_level,
1436         );
1437 
1438         let plane_org = ts.input_tile.planes[0]
1439           .subregion(Area::BlockStartingAt { bo: tile_bo.0 });
1440         let plane_ref = rec_region.as_const();
1441 
1442         satds_all[luma_mode as usize] = get_satd(
1443           &plane_org,
1444           &plane_ref,
1445           tx_size.block_size(),
1446           fi.sequence.bit_depth,
1447           fi.cpu_feature_level,
1448         );
1449       }
1450       satds_all
1451     };
1452 
1453     modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]);
1454   }
1455 
1456   debug_assert!(num_modes_rdo >= 1);
1457 
1458   modes.iter().take(num_modes_rdo).for_each(|&luma_mode| {
1459     let mvs = [MotionVector::default(); 2];
1460     let ref_frames = [INTRA_FRAME, NONE_FRAME];
1461     let mut mode_set_chroma = ArrayVec::<[_; 2]>::new();
1462     mode_set_chroma.push(luma_mode);
1463     if is_chroma_block && luma_mode != PredictionMode::DC_PRED {
1464       mode_set_chroma.push(PredictionMode::DC_PRED);
1465     }
1466     luma_chroma_mode_rdo(
1467       luma_mode,
1468       fi,
1469       bsize,
1470       tile_bo,
1471       ts,
1472       cw,
1473       rdo_type,
1474       cw_checkpoint,
1475       &mut best,
1476       mvs,
1477       ref_frames,
1478       &mode_set_chroma,
1479       true,
1480       0,
1481       &ArrayVec::<[CandidateMV; 9]>::new(),
1482       AngleDelta::default(),
1483     );
1484   });
1485 
1486   if fi.config.speed_settings.fine_directional_intra
1487     && bsize >= BlockSize::BLOCK_8X8
1488   {
1489     // Find the best angle delta for the current best prediction mode
1490     let luma_deltas = best.pred_mode_luma.angle_delta_count();
1491     let chroma_deltas = best.pred_mode_chroma.angle_delta_count();
1492 
1493     let mvs = [MotionVector::default(); 2];
1494     let ref_frames = [INTRA_FRAME, NONE_FRAME];
1495     let mode_set_chroma = [best.pred_mode_chroma];
1496     let mv_stack = ArrayVec::<[_; 9]>::new();
1497     let mut best_angle_delta = best.angle_delta;
1498     let mut angle_delta_rdo = |y, uv| -> AngleDelta {
1499       if best.angle_delta.y != y || best.angle_delta.uv != uv {
1500         luma_chroma_mode_rdo(
1501           best.pred_mode_luma,
1502           fi,
1503           bsize,
1504           tile_bo,
1505           ts,
1506           cw,
1507           rdo_type,
1508           cw_checkpoint,
1509           &mut best,
1510           mvs,
1511           ref_frames,
1512           &mode_set_chroma,
1513           true,
1514           0,
1515           &mv_stack,
1516           AngleDelta { y, uv },
1517         );
1518       }
1519       best.angle_delta
1520     };
1521 
1522     for i in 0..luma_deltas {
1523       let angle_delta_y =
1524         if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 };
1525       best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv);
1526     }
1527     for j in 0..chroma_deltas {
1528       let angle_delta_uv =
1529         if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 };
1530       best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv);
1531     }
1532   }
1533 
1534   best
1535 }
1536 
rdo_cfl_alpha<T: Pixel>( ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, luma_tx_size: TxSize, fi: &FrameInvariants<T>, ) -> Option<CFLParams>1537 pub fn rdo_cfl_alpha<T: Pixel>(
1538   ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize,
1539   luma_tx_size: TxSize, fi: &FrameInvariants<T>,
1540 ) -> Option<CFLParams> {
1541   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1542   let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec);
1543   debug_assert!(bsize.subsampled_size(xdec, ydec) == uv_tx_size.block_size());
1544 
1545   let frame_bo = ts.to_frame_block_offset(tile_bo);
1546   let (visible_tx_w, visible_tx_h) = clip_visible_bsize(
1547     (fi.width + xdec) >> xdec,
1548     (fi.height + ydec) >> ydec,
1549     uv_tx_size.block_size(),
1550     (frame_bo.0.x << MI_SIZE_LOG2) >> xdec,
1551     (frame_bo.0.y << MI_SIZE_LOG2) >> ydec,
1552   );
1553 
1554   if visible_tx_w == 0 || visible_tx_h == 0 {
1555     return None;
1556   };
1557   let mut ac: Aligned<[i16; 32 * 32]> = Aligned::uninitialized();
1558   luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi);
1559   let best_alpha: ArrayVec<[i16; 2]> = (1..3)
1560     .map(|p| {
1561       let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg;
1562       let tile_rect = ts.tile_rect().decimated(xdec, ydec);
1563       let rec = &mut ts.rec.planes[p];
1564       let input = &ts.input_tile.planes[p];
1565       let po = tile_bo.plane_offset(rec.plane_cfg);
1566       let edge_buf = get_intra_edges(
1567         &rec.as_const(),
1568         tile_bo,
1569         0,
1570         0,
1571         bsize,
1572         po,
1573         uv_tx_size,
1574         fi.sequence.bit_depth,
1575         Some(PredictionMode::UV_CFL_PRED),
1576         fi.sequence.enable_intra_edge_filter,
1577         IntraParam::None,
1578       );
1579       let mut alpha_cost = |alpha: i16| -> u64 {
1580         let mut rec_region =
1581           rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 });
1582         PredictionMode::UV_CFL_PRED.predict_intra(
1583           tile_rect,
1584           &mut rec_region,
1585           uv_tx_size,
1586           fi.sequence.bit_depth,
1587           &ac.data,
1588           IntraParam::Alpha(alpha),
1589           None,
1590           &edge_buf,
1591           fi.cpu_feature_level,
1592         );
1593         sse_wxh(
1594           &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }),
1595           &rec_region.as_const(),
1596           visible_tx_w,
1597           visible_tx_h,
1598           |_, _| DistortionScale::default(), // We're not doing RDO here.
1599           fi.sequence.bit_depth,
1600           fi.cpu_feature_level,
1601         )
1602         .0
1603       };
1604       let mut best = (alpha_cost(0), 0);
1605       let mut count = 2;
1606       for alpha in 1i16..=16i16 {
1607         let cost = (alpha_cost(alpha), alpha_cost(-alpha));
1608         if cost.0 < best.0 {
1609           best = (cost.0, alpha);
1610           count += 2;
1611         }
1612         if cost.1 < best.0 {
1613           best = (cost.1, -alpha);
1614           count += 2;
1615         }
1616         if count < alpha {
1617           break;
1618         }
1619       }
1620       best.1
1621     })
1622     .collect();
1623 
1624   if best_alpha[0] == 0 && best_alpha[1] == 0 {
1625     None
1626   } else {
1627     Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1]))
1628   }
1629 }
1630 
1631 /// RDO-based transform type decision
1632 /// If cw_checkpoint is None, a checkpoint for cw's (ContextWriter) current
1633 /// state is created and stored for later use.
rdo_tx_type_decision<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>, mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet, tx_types: &[TxType], ) -> (TxType, f64)1634 pub fn rdo_tx_type_decision<T: Pixel>(
1635   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1636   cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>,
1637   mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
1638   bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet,
1639   tx_types: &[TxType],
1640 ) -> (TxType, f64) {
1641   let mut best_type = TxType::DCT_DCT;
1642   let mut best_rd = std::f64::MAX;
1643 
1644   let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;
1645   let is_chroma_block =
1646     has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);
1647 
1648   let is_inter = !mode.is_intra();
1649 
1650   if cw_checkpoint.is_none() {
1651     // Only run the first call
1652     // Prevents creating multiple checkpoints for own version of cw
1653     *cw_checkpoint = Some(cw.checkpoint());
1654   }
1655 
1656   let rdo_type = if fi.use_tx_domain_distortion {
1657     RDOType::TxDistRealRate
1658   } else {
1659     RDOType::PixelDistRealRate
1660   };
1661   let need_recon_pixel = tx_size.block_size() != bsize && !is_inter;
1662 
1663   for &tx_type in tx_types {
1664     // Skip unsupported transform types
1665     if av1_tx_used[tx_set as usize][tx_type as usize] == 0 {
1666       continue;
1667     }
1668 
1669     if is_inter {
1670       motion_compensate(
1671         fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true,
1672       );
1673     }
1674 
1675     let wr: &mut dyn Writer = &mut WriterCounter::new();
1676     let tell = wr.tell_frac();
1677     let (_, tx_dist) = if is_inter {
1678       write_tx_tree(
1679         fi,
1680         ts,
1681         cw,
1682         wr,
1683         mode,
1684         0,
1685         tile_bo,
1686         bsize,
1687         tx_size,
1688         tx_type,
1689         false,
1690         true,
1691         rdo_type,
1692         need_recon_pixel,
1693       )
1694     } else {
1695       write_tx_blocks(
1696         fi,
1697         ts,
1698         cw,
1699         wr,
1700         mode,
1701         mode,
1702         AngleDelta::default(),
1703         tile_bo,
1704         bsize,
1705         tx_size,
1706         tx_type,
1707         false,
1708         CFLParams::default(), // Unused.
1709         true,
1710         rdo_type,
1711         need_recon_pixel,
1712       )
1713     };
1714 
1715     let rate = wr.tell_frac() - tell;
1716     let distortion = if fi.use_tx_domain_distortion {
1717       compute_tx_distortion(
1718         fi,
1719         ts,
1720         bsize,
1721         is_chroma_block,
1722         tile_bo,
1723         tx_dist,
1724         false,
1725         true,
1726       )
1727     } else {
1728       compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true)
1729     };
1730     let rd = compute_rd_cost(fi, rate, distortion);
1731     if rd < best_rd {
1732       best_rd = rd;
1733       best_type = tx_type;
1734     }
1735 
1736     cw.rollback(cw_checkpoint.as_ref().unwrap());
1737   }
1738 
1739   assert!(best_rd >= 0_f64);
1740 
1741   (best_type, best_rd)
1742 }
1743 
get_sub_partitions( four_partitions: &[TileBlockOffset; 4], partition: PartitionType, ) -> ArrayVec<[TileBlockOffset; 4]>1744 pub fn get_sub_partitions(
1745   four_partitions: &[TileBlockOffset; 4], partition: PartitionType,
1746 ) -> ArrayVec<[TileBlockOffset; 4]> {
1747   let mut partition_offsets = ArrayVec::<[TileBlockOffset; 4]>::new();
1748 
1749   partition_offsets.push(four_partitions[0]);
1750 
1751   if partition == PARTITION_NONE {
1752     return partition_offsets;
1753   }
1754   if partition == PARTITION_VERT || partition == PARTITION_SPLIT {
1755     partition_offsets.push(four_partitions[1]);
1756   };
1757   if partition == PARTITION_HORZ || partition == PARTITION_SPLIT {
1758     partition_offsets.push(four_partitions[2]);
1759   };
1760   if partition == PARTITION_SPLIT {
1761     partition_offsets.push(four_partitions[3]);
1762   };
1763 
1764   partition_offsets
1765 }
1766 
1767 #[inline(always)]
rdo_partition_none<T: Pixel>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, child_modes: &mut ArrayVec<[PartitionParameters; 4]>, ) -> f641768 fn rdo_partition_none<T: Pixel>(
1769   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1770   cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
1771   inter_cfg: &InterConfig,
1772   child_modes: &mut ArrayVec<[PartitionParameters; 4]>,
1773 ) -> f64 {
1774   debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1775 
1776   let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
1777   let cost = mode.rd_cost;
1778 
1779   child_modes.push(mode);
1780 
1781   cost
1782 }
1783 
1784 // VERTICAL, HORIZONTAL or simple SPLIT
1785 #[inline(always)]
rdo_partition_simple<T: Pixel, W: Writer>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, partition: PartitionType, rdo_type: RDOType, best_rd: f64, child_modes: &mut ArrayVec<[PartitionParameters; 4]>, ) -> Option<f64>1786 fn rdo_partition_simple<T: Pixel, W: Writer>(
1787   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1788   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1789   bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig,
1790   partition: PartitionType, rdo_type: RDOType, best_rd: f64,
1791   child_modes: &mut ArrayVec<[PartitionParameters; 4]>,
1792 ) -> Option<f64> {
1793   debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height);
1794   let subsize = bsize.subsize(partition);
1795 
1796   debug_assert!(subsize != BlockSize::BLOCK_INVALID);
1797 
1798   let cost = if bsize >= BlockSize::BLOCK_8X8 {
1799     let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1800     let tell = w.tell_frac();
1801     cw.write_partition(w, tile_bo, partition, bsize);
1802     compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero())
1803   } else {
1804     0.0
1805   };
1806 
1807   let hbsw = subsize.width_mi(); // Half the block size width in blocks
1808   let hbsh = subsize.height_mi(); // Half the block size height in blocks
1809   let four_partitions = [
1810     tile_bo,
1811     TileBlockOffset(BlockOffset {
1812       x: tile_bo.0.x + hbsw as usize,
1813       y: tile_bo.0.y,
1814     }),
1815     TileBlockOffset(BlockOffset {
1816       x: tile_bo.0.x,
1817       y: tile_bo.0.y + hbsh as usize,
1818     }),
1819     TileBlockOffset(BlockOffset {
1820       x: tile_bo.0.x + hbsw as usize,
1821       y: tile_bo.0.y + hbsh as usize,
1822     }),
1823   ];
1824 
1825   let partitions = get_sub_partitions(&four_partitions, partition);
1826 
1827   let mut rd_cost_sum = 0.0;
1828 
1829   for offset in partitions {
1830     let hbs = subsize.width_mi() >> 1;
1831     let has_cols = offset.0.x + hbs < ts.mi_width;
1832     let has_rows = offset.0.y + hbs < ts.mi_height;
1833 
1834     if has_cols && has_rows {
1835       let mode_decision =
1836         rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg);
1837 
1838       rd_cost_sum += mode_decision.rd_cost;
1839 
1840       if fi.enable_early_exit && rd_cost_sum > best_rd {
1841         return None;
1842       }
1843       if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() {
1844         let w: &mut W =
1845           if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef };
1846         cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize);
1847       }
1848       encode_block_with_modes(
1849         fi,
1850         ts,
1851         cw,
1852         w_pre_cdef,
1853         w_post_cdef,
1854         subsize,
1855         offset,
1856         &mode_decision,
1857         rdo_type,
1858         false,
1859       );
1860       child_modes.push(mode_decision);
1861     } else {
1862       //rd_cost_sum += std::f64::MAX;
1863       return None;
1864     }
1865   }
1866 
1867   Some(cost + rd_cost_sum)
1868 }
1869 
1870 // RDO-based single level partitioning decision
rdo_partition_decision<T: Pixel, W: Writer>( fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, cached_block: &PartitionGroupParameters, partition_types: &[PartitionType], rdo_type: RDOType, inter_cfg: &InterConfig, ) -> PartitionGroupParameters1871 pub fn rdo_partition_decision<T: Pixel, W: Writer>(
1872   fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
1873   cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W,
1874   bsize: BlockSize, tile_bo: TileBlockOffset,
1875   cached_block: &PartitionGroupParameters, partition_types: &[PartitionType],
1876   rdo_type: RDOType, inter_cfg: &InterConfig,
1877 ) -> PartitionGroupParameters {
1878   let mut best_partition = cached_block.part_type;
1879   let mut best_rd = cached_block.rd_cost;
1880   let mut best_pred_modes = cached_block.part_modes.clone();
1881 
1882   let cw_checkpoint = cw.checkpoint();
1883   let w_pre_checkpoint = w_pre_cdef.checkpoint();
1884   let w_post_checkpoint = w_post_cdef.checkpoint();
1885 
1886   for &partition in partition_types {
1887     // Do not re-encode results we already have
1888     if partition == cached_block.part_type {
1889       continue;
1890     }
1891 
1892     let mut child_modes = ArrayVec::<[_; 4]>::new();
1893 
1894     let cost = match partition {
1895       PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => {
1896         Some(rdo_partition_none(
1897           fi,
1898           ts,
1899           cw,
1900           bsize,
1901           tile_bo,
1902           inter_cfg,
1903           &mut child_modes,
1904         ))
1905       }
1906       PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => {
1907         rdo_partition_simple(
1908           fi,
1909           ts,
1910           cw,
1911           w_pre_cdef,
1912           w_post_cdef,
1913           bsize,
1914           tile_bo,
1915           inter_cfg,
1916           partition,
1917           rdo_type,
1918           best_rd,
1919           &mut child_modes,
1920         )
1921       }
1922       _ => {
1923         unreachable!();
1924       }
1925     };
1926 
1927     if let Some(rd) = cost {
1928       if rd < best_rd {
1929         best_rd = rd;
1930         best_partition = partition;
1931         best_pred_modes = child_modes.clone();
1932       }
1933     }
1934     cw.rollback(&cw_checkpoint);
1935     w_pre_cdef.rollback(&w_pre_checkpoint);
1936     w_post_cdef.rollback(&w_post_checkpoint);
1937   }
1938 
1939   assert!(best_rd >= 0_f64);
1940 
1941   PartitionGroupParameters {
1942     rd_cost: best_rd,
1943     part_type: best_partition,
1944     part_modes: best_pred_modes,
1945   }
1946 }
1947 
rdo_loop_plane_error<T: Pixel>( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize, ) -> ScaledDistortion1948 fn rdo_loop_plane_error<T: Pixel>(
1949   base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset,
1950   sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>,
1951   blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize,
1952 ) -> ScaledDistortion {
1953   let sb_w_blocks =
1954     if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w;
1955   let sb_h_blocks =
1956     if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h;
1957   // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma
1958   // accumulating in-frame and unpadded
1959   let mut err = Distortion::zero();
1960   for by in 0..sb_h_blocks {
1961     for bx in 0..sb_w_blocks {
1962       let loop_bo = offset_sbo.block_offset(bx << 1, by << 1);
1963       if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() {
1964         let src_plane = &src.planes[pli];
1965         let test_plane = &test.planes[pli];
1966         let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg;
1967         debug_assert_eq!(xdec, test_plane.cfg.xdec);
1968         debug_assert_eq!(ydec, test_plane.cfg.ydec);
1969 
1970         // Unfortunately, our distortion biases are only available via
1971         // Frame-absolute addressing, so we need a block offset
1972         // relative to the full frame origin (not the tile or analysis
1973         // area)
1974         let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1);
1975         let bias = distortion_scale(
1976           fi,
1977           ts.to_frame_block_offset(frame_bo),
1978           BlockSize::BLOCK_8X8,
1979         );
1980 
1981         let src_region =
1982           src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 });
1983         let test_region =
1984           test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 });
1985 
1986         err += if pli == 0 {
1987           // For loop filters, We intentionally use cdef_dist even with
1988           // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a
1989           // significant negative impact on other metrics and visual quality.
1990           cdef_dist_wxh_8x8(&src_region, &test_region, fi.sequence.bit_depth)
1991             * bias
1992         } else {
1993           sse_wxh(
1994             &src_region,
1995             &test_region,
1996             8 >> xdec,
1997             8 >> ydec,
1998             |_, _| bias,
1999             fi.sequence.bit_depth,
2000             fi.cpu_feature_level,
2001           )
2002         };
2003       }
2004     }
2005   }
2006   err * fi.dist_scale[pli]
2007 }
2008 
2009 // Passed in a superblock offset representing the upper left corner of
2010 // the LRU area we're optimizing.  This area covers the largest LRU in
2011 // any of the present planes, but may consist of a number of
2012 // superblocks and full, smaller LRUs in the other planes
rdo_loop_decision<T: Pixel>( base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut dyn Writer, deblock_p: bool, )2013 pub fn rdo_loop_decision<T: Pixel>(
2014   base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
2015   ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut dyn Writer,
2016   deblock_p: bool,
2017 ) {
2018   let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 {
2019     1
2020   } else {
2021     MAX_PLANES
2022   };
2023   assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration);
2024   // Determine area of optimization: Which plane has the largest LRUs?
2025   // How many LRUs for each?
2026   let mut sb_w = 1; // how many superblocks wide the largest LRU
2027                     // is/how many SBs we're processing (same thing)
2028   let mut sb_h = 1; // how many superblocks wide the largest LRU
2029                     // is/how many SBs we're processing (same thing)
2030   let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing
2031   let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing
2032   for pli in 0..planes {
2033     let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2034     let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2035     if sb_w < (1 << sb_h_shift) {
2036       sb_w = 1 << sb_h_shift;
2037     }
2038     if sb_h < (1 << sb_v_shift) {
2039       sb_h = 1 << sb_v_shift;
2040     }
2041   }
2042   for pli in 0..planes {
2043     let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2044     let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2045     lru_w[pli] = sb_w / (1 << sb_h_shift);
2046     lru_h[pli] = sb_h / (1 << sb_v_shift);
2047   }
2048 
2049   // The superblock width/height determinations may be calling for us
2050   // to compute over superblocks that do not actually exist in the
2051   // frame (off the right or lower edge).  Trim sb width/height down
2052   // to actual superblocks.  Note that these last superblocks on the
2053   // right/bottom may themselves still span the edge of the frame, but
2054   // they do hold at least some visible pixels.
2055   sb_w = sb_w.min(ts.sb_width - base_sbo.0.x);
2056   sb_h = sb_h.min(ts.sb_height - base_sbo.0.y);
2057 
2058   // We have need to know the Y visible pixel limits as well (the
2059   // sb_w/sb_h figures above can be used to determine how many
2060   // allocated pixels, possibly beyond the visible frame, exist).
2061   let crop_w =
2062     fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT);
2063   let crop_h =
2064     fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT);
2065   let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT);
2066   let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT);
2067 
2068   // Based on `RestorationState::new`
2069   const MAX_SB_SHIFT: usize = 4;
2070   const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT;
2071   const MAX_LRU_SIZE: usize = MAX_SB_SIZE;
2072 
2073   // Static allocation relies on the "minimal LRU area for all N planes" invariant.
2074   let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE];
2075   let mut best_lrf =
2076     [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
2077 
2078   // due to imprecision in the reconstruction parameter solver, we
2079   // need to make sure we don't fall into a limit cycle.  Track our
2080   // best cost at LRF so that we can break if we get a solution that doesn't
2081   // improve at the reconstruction stage.
2082   let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
2083 
2084   // Loop filter RDO is an iterative process and we need temporary
2085   // scratch data to hold the results of deblocking, cdef, and the
2086   // loop reconstruction filter so that each can be partially updated
2087   // without recomputing the entire stack.  Construct
2088   // largest-LRU-sized frames for each, accounting for padding
2089   // required by deblocking, cdef and [optionally] LR.
2090   let mut rec_subset = ts
2091     .rec
2092     .subregion(Area::BlockRect {
2093       bo: base_sbo.block_offset(0, 0).0,
2094       width: (pixel_w + 7) >> 3 << 3,
2095       height: (pixel_h + 7) >> 3 << 3,
2096     })
2097     .scratch_copy();
2098 
2099   // sub-setted region of the TileBlocks for our working frame area.
2100   // Note that the size of this subset is what signals CDEF as to the
2101   // actual coded size.
2102   let mut tileblocks_subset = cw.bc.blocks.subregion_mut(
2103     base_sbo.block_offset(0, 0).0.x,
2104     base_sbo.block_offset(0, 0).0.y,
2105     sb_w << SUPERBLOCK_TO_BLOCK_SHIFT,
2106     sb_h << SUPERBLOCK_TO_BLOCK_SHIFT,
2107   );
2108 
2109   // const, no need to copy, just need the subregion (but do zero the
2110   // origin to match the other copies/new backing frames).
2111   let src_subset = ts
2112     .input_tile
2113     .subregion(Area::BlockRect {
2114       bo: base_sbo.block_offset(0, 0).0,
2115       width: (pixel_w + 7) >> 3 << 3,
2116       height: (pixel_h + 7) >> 3 << 3,
2117     })
2118     .home();
2119 
2120   if deblock_p {
2121     // Find a good deblocking filter solution for the passed in area.
2122     // This is not RDO of deblocking itself, merely a solution to get
2123     // better results from CDEF/LRF RDO.
2124     let deblock_levels = deblock_filter_optimize(
2125       fi,
2126       &rec_subset.as_tile(),
2127       &src_subset,
2128       &tileblocks_subset.as_const(),
2129       crop_w,
2130       crop_h,
2131     );
2132 
2133     // Deblock the contents of our reconstruction copy.
2134     if deblock_levels[0] != 0 || deblock_levels[1] != 0 {
2135       // copy ts.deblock because we need to set some of our own values here
2136       let mut deblock_copy = *ts.deblock;
2137       deblock_copy.levels = deblock_levels;
2138 
2139       // finally, deblock the temp frame
2140       deblock_filter_frame(
2141         &deblock_copy,
2142         &mut rec_subset.as_tile_mut(),
2143         &tileblocks_subset.as_const(),
2144         crop_w,
2145         crop_h,
2146         fi.sequence.bit_depth,
2147         planes,
2148       );
2149     }
2150   }
2151 
2152   let mut cdef_work =
2153     if fi.sequence.enable_cdef { Some(rec_subset.clone()) } else { None };
2154   let mut lrf_work = if fi.sequence.enable_restoration {
2155     Some(Frame {
2156       planes: {
2157         let new_plane = |pli: usize| {
2158           let PlaneConfig { xdec, ydec, width, height, .. } =
2159             rec_subset.planes[pli].cfg;
2160           Plane::new(width, height, xdec, ydec, 0, 0)
2161         };
2162         [new_plane(0), new_plane(1), new_plane(2)]
2163       },
2164     })
2165   } else {
2166     None
2167   };
2168 
2169   // Precompute directional analysis for CDEF
2170   let cdef_data = {
2171     if cdef_work.is_some() {
2172       Some((
2173         &rec_subset,
2174         cdef_analyze_superblock_range(
2175           fi,
2176           &rec_subset,
2177           &tileblocks_subset.as_const(),
2178           sb_w,
2179           sb_h,
2180         ),
2181       ))
2182     } else {
2183       None
2184     }
2185   };
2186 
2187   // CDEF/LRF decision iteration
2188   // Start with a default of CDEF 0 and RestorationFilter::None
2189   // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it.
2190   // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it.
2191   // If LRF choice changed for any plane, repeat until no changes
2192   // Limit iterations and where we break based on speed setting (in the TODO list ;-)
2193   let mut cdef_change = true;
2194   let mut lrf_change = true;
2195   while cdef_change || lrf_change {
2196     // search for improved cdef indices, superblock by superblock, if cdef is enabled.
2197     if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) =
2198       (&cdef_data, &mut cdef_work.as_mut())
2199     {
2200       for sby in 0..sb_h {
2201         for sbx in 0..sb_w {
2202           let prev_best_index = best_index[sby * sb_w + sbx];
2203           let mut best_cost = -1.;
2204           let mut best_new_index = -1i8;
2205 
2206           /* offset of the superblock we're currently testing within the larger
2207           analysis area */
2208           let loop_sbo =
2209             TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });
2210 
2211           /* cdef index testing loop */
2212           for cdef_index in 0..(1 << fi.cdef_bits) {
2213             let mut err = ScaledDistortion::zero();
2214             let mut rate = 0;
2215 
2216             cdef_filter_superblock(
2217               fi,
2218               &rec_subset,
2219               &mut cdef_ref.as_tile_mut(),
2220               &tileblocks_subset.as_const(),
2221               loop_sbo,
2222               cdef_index,
2223               &cdef_dirs[sby * sb_w + sbx],
2224             );
2225             // apply LRF if any
2226             for pli in 0..planes {
2227               // We need the cropped-to-visible-frame area of this SB
2228               let wh =
2229                 if fi.sequence.use_128x128_superblock { 128 } else { 64 };
2230               let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg;
2231               let vis_width = (wh >> xdec).min(
2232                 (crop_w >> xdec)
2233                   - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x
2234                     as usize,
2235               );
2236               let vis_height = (wh >> ydec).min(
2237                 (crop_h >> ydec)
2238                   - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y
2239                     as usize,
2240               );
2241               // which LRU are we currently testing against?
2242               if let (Some((lru_x, lru_y)), Some(lrf_ref)) = {
2243                 let rp = &ts.restoration.planes[pli];
2244                 (
2245                   rp.restoration_unit_offset(base_sbo, loop_sbo, false),
2246                   &mut lrf_work,
2247                 )
2248               } {
2249                 // We have a valid LRU, apply LRF, compute error
2250                 match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
2251                   RestorationFilter::None {} => {
2252                     err += rdo_loop_plane_error(
2253                       base_sbo,
2254                       loop_sbo,
2255                       1,
2256                       1,
2257                       fi,
2258                       ts,
2259                       &tileblocks_subset.as_const(),
2260                       cdef_ref,
2261                       &src_subset,
2262                       pli,
2263                     );
2264                     rate += if fi.sequence.enable_restoration {
2265                       cw.fc.count_lrf_switchable(
2266                         w,
2267                         &ts.restoration.as_const(),
2268                         best_lrf[lru_y * lru_w[pli] + lru_x][pli],
2269                         pli,
2270                       )
2271                     } else {
2272                       0 // no relative cost differeneces to different
2273                         // CDEF params.  If cdef is on, it's a wash.
2274                     };
2275                   }
2276                   RestorationFilter::Sgrproj { set, xqd } => {
2277                     // only run on this single superblock
2278                     let loop_po =
2279                       loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg);
2280                     // todo: experiment with borrowing border pixels
2281                     // rather than edge-extending. Right now this is
2282                     // hard-clipping to the superblock boundary.
2283                     setup_integral_image(
2284                       &mut ts.integral_buffer,
2285                       SOLVE_IMAGE_STRIDE,
2286                       vis_width,
2287                       vis_height,
2288                       vis_width,
2289                       vis_height,
2290                       &cdef_ref.planes[pli].slice(loop_po),
2291                       &cdef_ref.planes[pli].slice(loop_po),
2292                     );
2293                     sgrproj_stripe_filter(
2294                       set,
2295                       xqd,
2296                       fi,
2297                       &ts.integral_buffer,
2298                       SOLVE_IMAGE_STRIDE,
2299                       &cdef_ref.planes[pli].slice(loop_po),
2300                       &mut lrf_ref.planes[pli].region_mut(Area::Rect {
2301                         x: loop_po.x,
2302                         y: loop_po.y,
2303                         width: vis_width,
2304                         height: vis_height,
2305                       }),
2306                     );
2307                     err += rdo_loop_plane_error(
2308                       base_sbo,
2309                       loop_sbo,
2310                       1,
2311                       1,
2312                       fi,
2313                       ts,
2314                       &tileblocks_subset.as_const(),
2315                       lrf_ref,
2316                       &src_subset,
2317                       pli,
2318                     );
2319                     rate += cw.fc.count_lrf_switchable(
2320                       w,
2321                       &ts.restoration.as_const(),
2322                       best_lrf[lru_y * lru_w[pli] + lru_x][pli],
2323                       pli,
2324                     );
2325                   }
2326                   RestorationFilter::Wiener { .. } => unreachable!(), // coming soon
2327                 }
2328               } else {
2329                 // No actual LRU here, compute error directly from CDEF output.
2330                 err += rdo_loop_plane_error(
2331                   base_sbo,
2332                   loop_sbo,
2333                   1,
2334                   1,
2335                   fi,
2336                   ts,
2337                   &tileblocks_subset.as_const(),
2338                   cdef_ref,
2339                   &src_subset,
2340                   pli,
2341                 );
2342                 // no relative cost differeneces to different
2343                 // CDEF params.  If cdef is on, it's a wash.
2344                 // rate += 0;
2345               }
2346             }
2347 
2348             let cost = compute_rd_cost(fi, rate, err);
2349             if best_cost < 0. || cost < best_cost {
2350               best_cost = cost;
2351               best_new_index = cdef_index as i8;
2352             }
2353           }
2354 
2355           // Did we change any preexisting choices?
2356           if best_new_index != prev_best_index {
2357             cdef_change = true;
2358             best_index[sby * sb_w + sbx] = best_new_index;
2359             tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8);
2360           }
2361 
2362           let mut cdef_ref_tm = TileMut::new(
2363             cdef_ref,
2364             TileRect {
2365               x: 0,
2366               y: 0,
2367               width: cdef_ref.planes[0].cfg.width,
2368               height: cdef_ref.planes[0].cfg.height,
2369             },
2370           );
2371 
2372           // Keep cdef output up to date; we need it for restoration
2373           // both below and above (padding)
2374           cdef_filter_superblock(
2375             fi,
2376             rec_copy,
2377             &mut cdef_ref_tm,
2378             &tileblocks_subset.as_const(),
2379             loop_sbo,
2380             best_index[sby * sb_w + sbx] as u8,
2381             &cdef_dirs[sby * sb_w + sbx],
2382           );
2383         }
2384       }
2385     }
2386 
2387     if !cdef_change {
2388       break;
2389     }
2390     cdef_change = false;
2391     lrf_change = false;
2392 
2393     // search for improved restoration filter parameters if restoration is enabled
2394     if let Some(lrf_ref) = &mut lrf_work.as_mut() {
2395       let lrf_input = if cdef_work.is_some() {
2396         // When CDEF is enabled, we pull from the CDEF output
2397         &cdef_work.as_ref().unwrap()
2398       } else {
2399         // When CDEF is disabled, we pull from the [optionally
2400         // deblocked] reconstruction
2401         &rec_subset
2402       };
2403       for pli in 0..planes {
2404         // Nominal size of LRU in pixels before clipping to visible frame
2405         let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size;
2406         // width, in sb, of an LRU in this plane
2407         let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
2408         // height, in sb, of an LRU in this plane
2409         let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
2410         let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg;
2411         for lru_y in 0..lru_h[pli] {
2412           // number of LRUs vertically
2413           for lru_x in 0..lru_w[pli] {
2414             // number of LRUs horizontally
2415             let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
2416               x: lru_x * lru_sb_w,
2417               y: lru_y * lru_sb_h,
2418             });
2419             if ts.restoration.has_restoration_unit(
2420               base_sbo + loop_sbo,
2421               pli,
2422               false,
2423             ) {
2424               let src_plane = &src_subset.planes[pli]; // uncompressed input for reference
2425               let lrf_in_plane = &lrf_input.planes[pli];
2426               let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg);
2427               let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli];
2428               let mut best_cost =
2429                 best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli];
2430 
2431               // Check the no filter option
2432               {
2433                 let err = rdo_loop_plane_error(
2434                   base_sbo,
2435                   loop_sbo,
2436                   lru_sb_w,
2437                   lru_sb_h,
2438                   fi,
2439                   ts,
2440                   &tileblocks_subset.as_const(),
2441                   lrf_input,
2442                   &src_subset,
2443                   pli,
2444                 );
2445                 let rate = cw.fc.count_lrf_switchable(
2446                   w,
2447                   &ts.restoration.as_const(),
2448                   best_new_lrf,
2449                   pli,
2450                 );
2451 
2452                 let cost = compute_rd_cost(fi, rate, err);
2453                 // Was this choice actually an improvement?
2454                 if best_cost < 0. || cost < best_cost {
2455                   best_cost = cost;
2456                   best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
2457                   best_new_lrf = RestorationFilter::None;
2458                 }
2459               }
2460 
2461               // Look for a self guided filter
2462               // We need the cropped-to-visible-frame computation area of this LRU
2463               let vis_width = unit_size.min(
2464                 (crop_w >> xdec)
2465                   - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize,
2466               );
2467               let vis_height = unit_size.min(
2468                 (crop_h >> ydec)
2469                   - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize,
2470               );
2471 
2472               // todo: experiment with borrowing border pixels
2473               // rather than edge-extending. Right now this is
2474               // hard-clipping to the superblock boundary.
2475               setup_integral_image(
2476                 &mut ts.integral_buffer,
2477                 SOLVE_IMAGE_STRIDE,
2478                 vis_width,
2479                 vis_height,
2480                 vis_width,
2481                 vis_height,
2482                 &lrf_in_plane.slice(lrf_po),
2483                 &lrf_in_plane.slice(lrf_po),
2484               );
2485 
2486               for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
2487               {
2488                 let (xqd0, xqd1) = sgrproj_solve(
2489                   set,
2490                   fi,
2491                   &ts.integral_buffer,
2492                   &src_plane
2493                     .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }),
2494                   &lrf_in_plane.slice(lrf_po),
2495                   vis_width,
2496                   vis_height,
2497                 );
2498                 let current_lrf =
2499                   RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
2500                 if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
2501                   sgrproj_stripe_filter(
2502                     set,
2503                     xqd,
2504                     fi,
2505                     &ts.integral_buffer,
2506                     SOLVE_IMAGE_STRIDE,
2507                     &lrf_in_plane.slice(lrf_po),
2508                     &mut lrf_ref.planes[pli].region_mut(Area::Rect {
2509                       x: lrf_po.x,
2510                       y: lrf_po.y,
2511                       width: vis_width,
2512                       height: vis_height,
2513                     }),
2514                   );
2515                 }
2516                 let err = rdo_loop_plane_error(
2517                   base_sbo,
2518                   loop_sbo,
2519                   lru_sb_w,
2520                   lru_sb_h,
2521                   fi,
2522                   ts,
2523                   &tileblocks_subset.as_const(),
2524                   lrf_ref,
2525                   &src_subset,
2526                   pli,
2527                 );
2528                 let rate = cw.fc.count_lrf_switchable(
2529                   w,
2530                   &ts.restoration.as_const(),
2531                   current_lrf,
2532                   pli,
2533                 );
2534                 let cost = compute_rd_cost(fi, rate, err);
2535                 if cost < best_cost {
2536                   best_cost = cost;
2537                   best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
2538                   best_new_lrf = current_lrf;
2539                 }
2540               }
2541 
2542               if best_lrf[lru_y * lru_w[pli] + lru_x][pli]
2543                 .notequal(best_new_lrf)
2544               {
2545                 best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf;
2546                 lrf_change = true;
2547                 if let Some(ru) = ts.restoration.planes[pli]
2548                   .restoration_unit_mut(base_sbo + loop_sbo)
2549                 {
2550                   ru.filter = best_new_lrf;
2551                 }
2552               }
2553             }
2554           }
2555         }
2556       }
2557     }
2558   }
2559 }
2560 
// Checks that `estimate_rate` with all-zero arguments and the 4x4 transform
// size returns the first entry of `RDO_RATE_TABLE`.
#[test]
fn estimate_rate_test() {
  assert_eq!(estimate_rate(0, TxSize::TX_4X4, 0), RDO_RATE_TABLE[0][0][0]);
}
2565