// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    use crate::asm::x86::lrf::*;
  } else {
    use self::rust::*;
  }
}

use crate::color::ChromaSampling::Cs400;
use crate::context::{MAX_PLANES, SB_SIZE};
use crate::encoder::FrameInvariants;
use crate::frame::{
  AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
};
use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
use rust_hawktracer::*;

use crate::api::SGRComplexityLevel;
use std::cmp;
use std::iter::FusedIterator;
use std::ops::{Index, IndexMut};

pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;

pub const RESTORE_NONE: u8 = 0;
pub const RESTORE_SWITCHABLE: u8 = 1;
pub const RESTORE_WIENER: u8 = 2;
pub const RESTORE_SGRPROJ: u8 = 3;

pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
#[allow(unused)]
pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
pub const WIENER_BITS: usize = 7;

pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
pub const SGRPROJ_PRJ_BITS: u8 = 7;
pub const SGRPROJ_PARAMS_BITS: u8 = 4;
pub const SGRPROJ_MTABLE_BITS: u8 = 20;
pub const SGRPROJ_SGR_BITS: u8 = 8;
pub const SGRPROJ_RECIP_BITS: u8 = 12;
pub const SGRPROJ_RST_BITS: u8 = 4;
pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
  [140, 3236],
  [112, 2158],
  [93, 1618],
  [80, 1438],
  [70, 1295],
  [58, 1177],
  [47, 1079],
  [37, 996],
  [30, 925],
  [25, 863],
  [0, 2589],
  [0, 1618],
  [0, 1177],
  [0, 925],
  [56, 0],
  [22, 0],
];

// Lists of indices into SGRPROJ_PARAMS_S to use at a given complexity level.
// SGRPROJ_ALL_SETS contains every possible index.
const SGRPROJ_ALL_SETS: &[u8] =
  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
// most of the gains from sgr. The decision of which values to use is somewhat
// arbitrary. The sgr parameters fall into 3 discontinuous groups. The first
// has both parameters non-zero. The other two are distinguishable by which of
// the two parameters is zero. Each of these groups has an even number of
// entries, and the non-zero parameters shrink as the indices increase. This
// array uses the 1st, 3rd, ... smallest params of each group.
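// Reading the table above (illustrative, not normative): indices 0..=9 have
// both parameters non-zero, indices 10..=13 zero out the radius-2 parameter
// (SGRPROJ_PARAMS_S[i][0]), and indices 14..=15 zero out the radius-1
// parameter (SGRPROJ_PARAMS_S[i][1]); the reduced list below keeps every
// other entry of each group.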
const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];

pub fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
  match complexity {
    SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
    SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
  }
}

pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2;
pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2;
pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;

pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2)
  + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1));
pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2;
pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2;
pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;

pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
  [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];

/// The buffer used in `sgrproj_stripe_filter()` and `sgrproj_solve()`.
#[derive(Debug)]
pub struct IntegralImageBuffer {
  pub integral_image: Vec<u32>,
  pub sq_integral_image: Vec<u32>,
}

impl IntegralImageBuffer {
  /// Creates a new buffer with the given size, filled with zeros.
  #[inline]
  pub fn zeroed(size: usize) -> Self {
    Self { integral_image: vec![0; size], sq_integral_image: vec![0; size] }
  }
}

#[allow(unused)] // Wiener coming soon!
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RestorationFilter {
  None,
  Wiener { coeffs: [[i8; 3]; 2] },
  Sgrproj { set: u8, xqd: [i8; 2] },
}

impl Default for RestorationFilter {
  fn default() -> RestorationFilter {
    RestorationFilter::None
  }
}

impl RestorationFilter {
  pub fn notequal(self, cmp: RestorationFilter) -> bool {
    match self {
      RestorationFilter::None {} => !matches!(cmp, RestorationFilter::None {}),
      RestorationFilter::Sgrproj { set, xqd } => {
        if let RestorationFilter::Sgrproj { set: set2, xqd: xqd2 } = cmp {
          !(set == set2 && xqd[0] == xqd2[0] && xqd[1] == xqd2[1])
        } else {
          true
        }
      }
      RestorationFilter::Wiener { coeffs } => {
        if let RestorationFilter::Wiener { coeffs: coeffs2 } = cmp {
          !(coeffs[0][0] == coeffs2[0][0]
            && coeffs[0][1] == coeffs2[0][1]
            && coeffs[0][2] == coeffs2[0][2]
            && coeffs[1][0] == coeffs2[1][0]
            && coeffs[1][1] == coeffs2[1][1]
            && coeffs[1][2] == coeffs2[1][2])
        } else {
          true
        }
      }
    }
  }
}

pub(crate) mod rust {
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::PlaneSlice;
  use crate::lrf::{
    get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
    SGRPROJ_SGR_BITS,
  };
  use crate::util::CastFromPrimitive;
  use crate::Pixel;

  #[inline(always)]
  pub(crate) fn sgrproj_box_ab_internal(
    r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
    bdm8: usize,
  ) {
    let d: usize = r * 2 + 1;
    let n: usize = d * d;
    let one_over_n = if r == 1 { 455 } else { 164 };
    for x in start_x..stripe_w + 2 {
      let sum = get_integral_square(iimg, iimg_stride, x, y, d);
      let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
      let (reta, retb) =
        sgrproj_sum_finish(ssq, sum, n as u32, one_over_n, s, bdm8);
      af[x] = reta;
      bf[x] = retb;
    }
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r1(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal(
      1, af, bf, iimg, iimg_sq, iimg_stride, 0, y, stripe_w, s, bdm8,
    );
  }

  // computes an intermediate (ab) row for
stripe_w + 2 columns at row y pub(crate) fn sgrproj_box_ab_r2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, _cpu: CpuFeatureLevel, ) { sgrproj_box_ab_internal( 2, af, bf, iimg, iimg_sq, iimg_stride, 0, y, stripe_w, s, bdm8, ); } pub(crate) fn sgrproj_box_f_r0( f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed); } #[inline(always)] pub(crate) fn sgrproj_box_f_r0_internal( f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, ) { let line = cdeffed.row(y); for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) { *fp = u32::cast_from(v) << SGRPROJ_RST_BITS; } } pub(crate) fn sgrproj_box_f_r1( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed); } #[inline(always)] pub(crate) fn sgrproj_box_f_r1_internal( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, ) { let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let line = cdeffed.row(y); for x in start_x..w { let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2]) + 4 * (af[1][x] + af[0][x + 1] + af[1][x + 1] + af[2][x + 1] + af[1][x + 2]); let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2]) + 4 * (bf[1][x] + bf[0][x + 1] + bf[1][x + 1] + bf[2][x + 1] + bf[1][x + 2]); let v = a * u32::cast_from(line[x]) + b; f[x] = (v + (1 << shift >> 1)) >> shift; } } pub(crate) fn sgrproj_box_f_r2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed); } #[inline(always)] pub(crate) fn sgrproj_box_f_r2_internal( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, ) { let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let line = cdeffed.row(y); let line1 = cdeffed.row(y + 1); let af0 = af[0][start_x..w + 3].windows(3); let af1 = af[1][start_x..w + 3].windows(3); let bf0 = bf[0][start_x..w + 3].windows(3); let bf1 = bf[1][start_x..w + 3].windows(3); let af_it = af0.zip(af1); let bf_it = bf0.zip(bf1); let in0 = line[start_x..w].iter(); let in1 = line1[start_x..w].iter(); let o0 = f0[start_x..w].iter_mut(); let o1 = f1[start_x..w].iter_mut(); let in_iter = in0.zip(in1); let out_iter = o0.zip(o1); let io_iter = out_iter.zip(in_iter); for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in io_iter.zip(af_it.zip(bf_it)) { let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1]; let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1]; let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1]; let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1]; let v = (a + ao) * u32::cast_from(p0) + b + bo; *o0 = (v + (1 << shift >> 1)) >> shift; let vo = ao * u32::cast_from(p1) + bo; *o1 = (vo + (1 << shifto >> 1)) >> shifto; } } } #[inline(always)] fn sgrproj_sum_finish( ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32, bdm8: usize, ) -> (u32, u32) { let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8); let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8; let p = cmp::max(0, (scaled_ssq * n) as i32 - (scaled_sum * scaled_sum) as i32) as u32; let z = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS; let a = if z >= 255 { 
256 } else if z == 0 { 1 } else { ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1) }; let b = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n; (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS) } // Using an integral image, compute the sum of a square region fn get_integral_square( iimg: &[u32], stride: usize, x: usize, y: usize, size: usize, ) -> u32 { // Cancel out overflow in iimg by using wrapping arithmetic iimg[y * stride + x] .wrapping_add(iimg[(y + size) * stride + x + size]) .wrapping_sub(iimg[(y + size) * stride + x]) .wrapping_sub(iimg[y * stride + x + size]) } struct VertPaddedIter<'a, T: Pixel> { // The two sources that can be selected when clipping deblocked: &'a Plane, cdeffed: &'a Plane, // x index to choice where on the row to start x: isize, // y index that will be mutated y: isize, // The index at which to terminate. Can be larger than the slice length. end: isize, // Used for source buffer choice/clipping. May (and regularly will) // be negative. stripe_begin: isize, // Also used for source buffer choice/clipping. May specify a stripe boundary // less than, equal to, or larger than the buffers we're accessing. stripe_end: isize, // Active area cropping is done by specifying a value smaller than the height // of the plane. crop: isize, } impl<'a, 'b, T: Pixel> VertPaddedIter<'a, T> { fn new( cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>, stripe_h: usize, crop: usize, ) -> VertPaddedIter<'a, T> { // cdeffed and deblocked must start at the same coordinates from their // underlying planes. Since cropping is provided via a separate params, the // height of the underlying planes do not need to match. assert_eq!(cdeffed.x, deblocked.x); assert_eq!(cdeffed.y, deblocked.y); // To share integral images, always use the max box filter radius of 2 let r = 2; // The number of rows outside the stripe are needed let rows_above = r + 2; let rows_below = 2; // Offset crop and stripe_h so they are relative to the underlying plane // and not the plane slice. let crop = crop as isize + deblocked.y; let stripe_end = stripe_h as isize + deblocked.y; // Move y up the number rows above. 
// If y is negative we repeat the first row let y = deblocked.y - rows_above as isize; VertPaddedIter { deblocked: deblocked.plane, cdeffed: cdeffed.plane, x: deblocked.x, y, end: (rows_above + stripe_h + rows_below) as isize + y, stripe_begin: deblocked.y, stripe_end, crop, } } } impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> { type Item = &'a [T]; #[inline(always)] fn next(&mut self) -> Option { if self.end > self.y { // clamp before deciding the source // clamp vertically to storage at top and passed-in height at bottom let cropped_y = clamp(self.y, 0, self.crop - 1); // clamp vertically to stripe limits let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1); // decide if we're vertically inside or outside the strip let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end as isize { self.cdeffed } else { self.deblocked }; // cannot directly return self.ps.row(row) due to lifetime issue let range = src_plane.row_range(self.x, ly); self.y += 1; Some(&src_plane.data[range]) } else { None } } fn size_hint(&self) -> (usize, Option) { let remaining = self.end - self.y; debug_assert!(remaining >= 0); let remaining = remaining as usize; (remaining, Some(remaining)) } } impl ExactSizeIterator for VertPaddedIter<'_, T> {} impl FusedIterator for VertPaddedIter<'_, T> {} struct HorzPaddedIter<'a, T: Pixel> { // Active area cropping is done using the length of the slice slice: &'a [T], // x index of the iterator // When less than 0, repeat the first element. When greater than end, repeat // the last element index: isize, // The index at which to terminate. Can be larger than the slice length. end: usize, } impl<'a, T: Pixel> HorzPaddedIter<'a, T> { fn new( slice: &'a [T], start_index: isize, width: usize, ) -> HorzPaddedIter<'a, T> { HorzPaddedIter { slice, index: start_index, end: (width as isize + start_index) as usize, } } } impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> { type Item = &'a T; #[inline(always)] fn next(&mut self) -> Option { if self.index < self.end as isize { // clamp to the edges of the frame let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize; self.index += 1; Some(&self.slice[x]) } else { None } } #[inline(always)] fn size_hint(&self) -> (usize, Option) { let size: usize = (self.end as isize - self.index) as usize; (size, Some(size)) } } impl ExactSizeIterator for HorzPaddedIter<'_, T> {} impl FusedIterator for HorzPaddedIter<'_, T> {} pub fn setup_integral_image( integral_image_buffer: &mut IntegralImageBuffer, integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, cdeffed: &PlaneSlice, deblocked: &PlaneSlice, ) { let integral_image = &mut integral_image_buffer.integral_image; let sq_integral_image = &mut integral_image_buffer.sq_integral_image; // Number of elements outside the stripe let left_w = 4; // max radius of 2 + 2 padding let right_w = 3; // max radius of 2 + 1 padding assert_eq!(cdeffed.x, deblocked.x); // Find how many unique elements to use to the left and right let left_uniques = if cdeffed.x == 0 { 0 } else { left_w }; let right_uniques = right_w.min(crop_w - stripe_w); // Find the total number of unique elements used let row_uniques = left_uniques + stripe_w + right_uniques; // Negative start indices result in repeating the first element of the row let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 }; let mut rows_iter = VertPaddedIter::new( // Move left to encompass all the used data &cdeffed.go_left(left_uniques), &deblocked.go_left(left_uniques), 
// since r2 uses every other row, we need an extra row if stripe_h is odd stripe_h + (stripe_h & 1), crop_h, ) .map(|row: &[T]| { HorzPaddedIter::new( // Limit how many unique elements we use &row[..row_uniques], start_index_x, left_w + stripe_w + right_w, ) }); // Setup the first row { let mut sum: u32 = 0; let mut sq_sum: u32 = 0; // Remove the first row and use it outside of the main loop let row = rows_iter.next().unwrap(); for (src, (integral, sq_integral)) in row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut())) { let current = u32::cast_from(*src); // Wrap adds to prevent undefined behaviour on overflow. Overflow is // cancelled out when calculating the sum of a region. sum = sum.wrapping_add(current); *integral = sum; sq_sum = sq_sum.wrapping_add(current * current); *sq_integral = sq_sum; } } // Calculate all other rows let mut integral_slice = &mut integral_image[..]; let mut sq_integral_slice = &mut sq_integral_image[..]; for row in rows_iter { let mut sum: u32 = 0; let mut sq_sum: u32 = 0; // Split the data between the previous row and future rows. // This allows us to mutate the current row while accessing the // previous row. let (integral_row_prev, integral_row) = integral_slice.split_at_mut(integral_image_stride); let (sq_integral_row_prev, sq_integral_row) = sq_integral_slice.split_at_mut(integral_image_stride); for ( src, ((integral_above, sq_integral_above), (integral, sq_integral)), ) in row.zip( integral_row_prev .iter() .zip(sq_integral_row_prev.iter()) .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())), ) { let current = u32::cast_from(*src); // Wrap adds to prevent undefined behaviour on overflow. Overflow is // cancelled out when calculating the sum of a region. sum = sum.wrapping_add(current); *integral = sum.wrapping_add(*integral_above); sq_sum = sq_sum.wrapping_add(current * current); *sq_integral = sq_sum.wrapping_add(*sq_integral_above); } // The current row also contains all future rows. Replacing the slice with // it moves down a row. integral_slice = integral_row; sq_integral_slice = sq_integral_row; } } pub fn sgrproj_stripe_filter( set: u8, xqd: [i8; 2], fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, ) { let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect(); let bdm8 = fi.sequence.bit_depth - 8; let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0]; let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1]; /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation // of every row (of a, b and f), only half of the rows (every-other // row) are actually used. 
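  // In the loop below this shows up as: the radius=2 (a, b) rows are only
  // computed for even y (the step_by(2) outer loop), and sgrproj_box_f_r2
  // derives the filtered values for both output rows y and y + 1 from them,
  // while the radius=1 intermediates are recomputed for every row.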
let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { sgrproj_box_ab_r2( &mut a_r2[0], &mut b_r2[0], integral_image, sq_integral_image, integral_image_stride, 0, stripe_w, s_r2, bdm8, fi.cpu_feature_level, ); } if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; sgrproj_box_ab_r1( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], integral_image_stride, 0, stripe_w, s_r1, bdm8, fi.cpu_feature_level, ); sgrproj_box_ab_r1( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], integral_image_stride, 1, stripe_w, s_r1, bdm8, fi.cpu_feature_level, ); } /* iterate by row */ // Increment by two to handle the use of even rows by r=2 and run a nested // loop to handle increments of one. for y in (0..stripe_h).step_by(2) { // get results to use y and y+1 let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 { sgrproj_box_ab_r2( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, sq_integral_image, integral_image_stride, y + 2, stripe_w, s_r2, bdm8, fi.cpu_feature_level, ); let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]]; let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]]; sgrproj_box_f_r2( &ap0, &bp0, &mut f_r2_0, &mut f_r2_1, y, stripe_w, cdeffed, fi.cpu_feature_level, ); [&f_r2_0, &f_r2_1] } else { sgrproj_box_f_r0( &mut f_r2_0, y, stripe_w, cdeffed, fi.cpu_feature_level, ); // share results for both rows [&f_r2_0, &f_r2_0] }; for dy in 0..(2.min(stripe_h - y)) { let y = y + dy; if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; sgrproj_box_ab_r1( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], integral_image_stride, y + 2, stripe_w, s_r1, bdm8, fi.cpu_feature_level, ); let ap1: [&[u32]; 3] = [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]]; let bp1: [&[u32]; 3] = [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]]; sgrproj_box_f_r1( &ap1, &bp1, &mut f_r1, y, stripe_w, cdeffed, fi.cpu_feature_level, ); } else { sgrproj_box_f_r0( &mut f_r1, y, stripe_w, cdeffed, fi.cpu_feature_level, ); } /* apply filter */ let w0 = xqd[0] as i32; let w1 = xqd[1] as i32; let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1; let line = &cdeffed[y]; #[inline(always)] fn apply_filter( out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32], stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32, ) { let line_it = line[..stripe_w].iter(); let f_r2_ab_it = f_r2_ab[..stripe_w].iter(); let f_r1_it = f_r1[..stripe_w].iter(); let out_it = out[..stripe_w].iter_mut(); for ((o, &u), (&f_r2_ab, &f_r1)) in out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it)) { let u = i32::cast_from(u) << SGRPROJ_RST_BITS; let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32; let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1)) >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1)); } } apply_filter( &mut out[y], line, &f_r1, f_r2_ab[dy], stripe_w, fi.sequence.bit_depth, w0, w1, w2, ); } } } // Frame inputs below aren't all equal, and will change as work // continues. There's no deblocked reconstruction available at this // point of RDO, so we use the non-deblocked reconstruction, cdef and // input. The input can be a full-sized frame. Cdef input is a partial // frame constructed specifically for RDO. 
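// Unlike sgrproj_stripe_filter() above, this routine writes no output pixels;
// it runs the same box-filter passes over the cdeffed data and accumulates
// the statistics needed to choose the projection parameters, which it returns
// as the (xqd0, xqd1) pair.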
// For simplicity, this ignores stripe segmentation (it's possible the // extra complexity isn't worth it and we'll ignore stripes // permanently during RDO, but that's not been tested yet). Data // access inside the cdef frame is monolithic and clipped to the cdef // borders. // Input params follow the same rules as sgrproj_stripe_filter. // Inputs are relative to the colocated slice views. pub fn sgrproj_solve( set: u8, fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>, cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, ) -> (i8, i8) { let bdm8 = fi.sequence.bit_depth - 8; let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0]; let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1]; let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]]; let mut c: [f64; 2] = [0., 0.]; /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation // of every row (of a, b and f), only half of the rows (every-other // row) are actually used. let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { sgrproj_box_ab_r2( &mut a_r2[0], &mut b_r2[0], integral_image, sq_integral_image, SOLVE_IMAGE_STRIDE, 0, cdef_w, s_r2, bdm8, fi.cpu_feature_level, ); } if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; sgrproj_box_ab_r1( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], SOLVE_IMAGE_STRIDE, 0, cdef_w, s_r1, bdm8, fi.cpu_feature_level, ); sgrproj_box_ab_r1( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], SOLVE_IMAGE_STRIDE, 1, cdef_w, s_r1, bdm8, fi.cpu_feature_level, ); } /* iterate by row */ // Increment by two to handle the use of even rows by r=2 and run a nested // loop to handle increments of one. 
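  // The loop below accumulates the least-squares normal equations
  //   H = [ sum(f2*f2)  sum(f2*f1) ]     C = [ sum(f2*s) ]
  //       [ sum(f2*f1)  sum(f1*f1) ]         [ sum(f1*s) ]
  // where s, f1 and f2 are the source and the two filter outputs with the
  // cdeffed value u subtracted out (see process_line); the actual solve for
  // (xq0, xq1) happens after the loop.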
for y in (0..cdef_h).step_by(2) { // get results to use y and y+1 let f_r2_01: [&[u32]; 2] = if s_r2 > 0 { sgrproj_box_ab_r2( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, sq_integral_image, SOLVE_IMAGE_STRIDE, y + 2, cdef_w, s_r2, bdm8, fi.cpu_feature_level, ); let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]]; let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]]; sgrproj_box_f_r2( &ap0, &bp0, &mut f_r2_0, &mut f_r2_1, y, cdef_w, cdeffed, fi.cpu_feature_level, ); [&f_r2_0, &f_r2_1] } else { sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level); // share results for both rows [&f_r2_0, &f_r2_0] }; for dy in 0..(2.min(cdef_h - y)) { let y = y + dy; if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; sgrproj_box_ab_r1( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], SOLVE_IMAGE_STRIDE, y + 2, cdef_w, s_r1, bdm8, fi.cpu_feature_level, ); let ap1: [&[u32]; 3] = [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]]; let bp1: [&[u32]; 3] = [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]]; sgrproj_box_f_r1( &ap1, &bp1, &mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level, ); } else { sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level); } #[inline(always)] fn process_line( h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T], f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize, ) { let cdeffed_it = cdeffed[..cdef_w].iter(); let input_it = input[..cdef_w].iter(); let f_r2_ab_it = f_r2_ab[..cdef_w].iter(); let f_r1_it = f_r1[..cdef_w].iter(); #[derive(Debug, Copy, Clone)] struct Sums { h: [[i64; 2]; 2], c: [i64; 2], } let sums: Sums = cdeffed_it .zip(input_it) .zip(f_r2_ab_it.zip(f_r1_it)) .map(|((&u, &i), (&f2, &f1))| { let u = i32::cast_from(u) << SGRPROJ_RST_BITS; let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u; let f2 = f2 as i32 - u; let f1 = f1 as i32 - u; (s as i64, f1 as i64, f2 as i64) }) .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| { let mut ret: Sums = sums; ret.h[0][0] += f2 * f2; ret.h[1][1] += f1 * f1; ret.h[0][1] += f1 * f2; ret.c[0] += f2 * s; ret.c[1] += f1 * s; ret }); h[0][0] += sums.h[0][0] as f64; h[1][1] += sums.h[1][1] as f64; h[0][1] += sums.h[0][1] as f64; c[0] += sums.c[0] as f64; c[1] += sums.c[1] as f64; } process_line( &mut h, &mut c, &cdeffed[y], &input[y], &f_r1, f_r2_01[dy], cdef_w, ); } } // this is lifted almost in-tact from libaom let n = cdef_w as f64 * cdef_h as f64; h[0][0] /= n; h[0][1] /= n; h[1][1] /= n; h[1][0] = h[0][1]; c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n; c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n; let (xq0, xq1) = if s_r2 == 0 { // H matrix is now only the scalar h[1][1] // C vector is now only the scalar c[1] if h[1][1] == 0. { (0, 0) } else { (0, (c[1] / h[1][1]).round() as i32) } } else if s_r1 == 0 { // H matrix is now only the scalar h[0][0] // C vector is now only the scalar c[0] if h[0][0] == 0. { (0, 0) } else { ((c[0] / h[0][0]).round() as i32, 0) } } else { let det = h[0][0] * h[1][1] - h[0][1] * h[1][0]; if det == 0. 
{ (0, 0) } else { // If scaling up dividend would overflow, instead scale down the divisor let div1 = h[1][1] * c[0] - h[0][1] * c[1]; let div2 = h[0][0] * c[1] - h[1][0] * c[0]; ((div1 / det).round() as i32, (div2 / det).round() as i32) } }; { let xqd0 = clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32); let xqd1 = clamp( (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1, SGRPROJ_XQD_MIN[1] as i32, SGRPROJ_XQD_MAX[1] as i32, ); (xqd0 as i8, xqd1 as i8) } } fn wiener_stripe_filter( coeffs: [[i8; 3]; 2], fi: &FrameInvariants, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize, cdeffed: &Plane, deblocked: &Plane, out: &mut Plane, ) { let bit_depth = fi.sequence.bit_depth; let round_h = if bit_depth == 12 { 5 } else { 3 }; let round_v = if bit_depth == 12 { 9 } else { 11 }; let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1); let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1; let mut coeffs_ = [[0; 3]; 2]; for i in 0..2 { for j in 0..3 { coeffs_[i][j] = i32::from(coeffs[i][j]); } } let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7]; let vfilter: [i32; 7] = [ coeffs_[0][0], coeffs_[0][1], coeffs_[0][2], 128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]), coeffs_[0][2], coeffs_[0][1], coeffs_[0][0], ]; let hfilter: [i32; 7] = [ coeffs_[1][0], coeffs_[1][1], coeffs_[1][2], 128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]), coeffs_[1][2], coeffs_[1][1], coeffs_[1][0], ]; // unlike x, our y can be negative to start as the first stripe // starts off the top of the frame by 8 pixels, and can also run off the end of the frame let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize; let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize; let end_i = cmp::max( 0, if stripe_h as isize + stripe_y > crop_h as isize { crop_h as isize - stripe_y - start_wi as isize } else { stripe_h as isize - start_wi as isize }, ) as usize; let mut out_slice = out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize }); for xi in stripe_x..stripe_x + stripe_w { let n = cmp::min(7, crop_w as isize + 3 - xi as isize); for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 { let mut acc = 0; let src = if yi < stripe_y { let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2); deblocked.row(ly) } else if yi < stripe_y + stripe_h as isize { let ly = clamp(yi, 0, crop_h as isize - 1); cdeffed.row(ly) } else { let ly = cmp::min( clamp(yi, 0, crop_h as isize - 1), stripe_y + stripe_h as isize + 1, ); deblocked.row(ly) }; let start = i32::cast_from(src[0]); let end = i32::cast_from(src[crop_w - 1]); for i in 0..3 - xi as isize { acc += hfilter[i as usize] * start; } let off = 3 - (xi as isize); let s = cmp::max(0, off) as usize; let s1 = (s as isize - off) as usize; let n1 = (n as isize - off) as usize; for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) { acc += hf * i32::cast_from(v); } for i in n..7 { acc += hfilter[i as usize] * end; } acc = (acc + (1 << round_h >> 1)) >> round_h; work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset); } for (wi, dst) in (start_wi..start_wi + end_i) .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i)) { let mut acc = 0; for (i, src) in (0..7).zip(work[wi..wi + 7].iter_mut()) { acc += vfilter[i] * *src; } *dst = T::cast_from(clamp( (acc + (1 << round_v >> 1)) >> round_v, 0, (1 << bit_depth) - 1, )); } } } #[derive(Copy, Clone, Debug, Default)] pub struct RestorationUnit { pub filter: RestorationFilter, } #[derive(Clone, Debug)] pub 
struct FrameRestorationUnits { units: Box<[RestorationUnit]>, pub cols: usize, pub rows: usize, } impl FrameRestorationUnits { pub fn new(cols: usize, rows: usize) -> Self { Self { units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(), cols, rows, } } } impl Index for FrameRestorationUnits { type Output = [RestorationUnit]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { &self.units[index * self.cols..(index + 1) * self.cols] } } impl IndexMut for FrameRestorationUnits { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.units[index * self.cols..(index + 1) * self.cols] } } #[derive(Clone, Debug)] pub struct RestorationPlaneConfig { pub lrf_type: u8, pub unit_size: usize, // (1 << sb_x_shift) gives the number of superblocks horizontally or // vertically in a restoration unit, not accounting for RU stretching pub sb_h_shift: usize, pub sb_v_shift: usize, pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop) pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop) // stripe height is 64 in all cases except 4:2:0 chroma planes where // it is 32. This is independent of all other setup parameters pub stripe_height: usize, pub cols: usize, pub rows: usize, } #[derive(Clone, Debug)] pub struct RestorationPlane { pub cfg: RestorationPlaneConfig, pub units: FrameRestorationUnits, } #[derive(Clone, Default)] pub struct RestorationPlaneOffset { pub row: usize, pub col: usize, } impl RestorationPlane { pub fn new( lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize, sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize, rows: usize, ) -> RestorationPlane { let stripe_height = if stripe_decimate != 0 { 32 } else { 64 }; RestorationPlane { cfg: RestorationPlaneConfig { lrf_type, unit_size, sb_h_shift, sb_v_shift, sb_cols, sb_rows, stripe_height, cols, rows, }, units: FrameRestorationUnits::new(cols, rows), } } // Stripes are always 64 pixels high in a non-subsampled // frame, and decimated from 64 pixels in chroma. When // filtering, they are not co-located on Y with superblocks. fn restoration_unit_index_by_stripe( &self, stripenum: usize, rux: usize, ) -> (usize, usize) { ( cmp::min(rux, self.cfg.cols - 1), cmp::min( stripenum * self.cfg.stripe_height / self.cfg.unit_size, self.cfg.rows - 1, ), ) } pub fn restoration_unit_by_stripe( &self, stripenum: usize, rux: usize, ) -> &RestorationUnit { let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux); &self.units[y][x] } } #[derive(Clone, Debug)] pub struct RestorationState { pub planes: [RestorationPlane; MAX_PLANES], } impl RestorationState { pub fn new(fi: &FrameInvariants, input: &Frame) -> Self { let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg; // stripe size is decimated in 4:2:0 (and only 4:2:0) let stripe_uv_decimate = if xdec > 0 && ydec > 0 { 1 } else { 0 }; let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 }; let uv_sb_h_log2 = y_sb_log2 - xdec; let uv_sb_v_log2 = y_sb_log2 - ydec; let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru && fi.sequence.enable_restoration { assert!( fi.width > 1 && fi.height > 1, "Width and height must be higher than 1 for LRF setup" ); // Specific content does affect optimal LRU size choice, but the // quantizer in use is a surprisingly strong selector. 
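      // With RESTORATION_TILESIZE_MAX_LOG2 == 8, the shift picked below maps
      // to luma LRU sizes of 256 (base_q_idx > 200), 128 (> 160) and 64
      // otherwise, before the chroma and tiling adjustments further down.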
let lrf_base_shift = if fi.base_q_idx > 200 { 0 // big } else if fi.base_q_idx > 160 { 1 } else { 2 // small }; let lrf_chroma_shift = if stripe_uv_decimate > 0 { // 4:2:0 only if lrf_base_shift == 2 { 1 // smallest chroma LRU is a win at low quant } else { // Will a down-shifted chroma LRU eliminate stretch in chroma? // If so, that's generally a win. let lrf_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift); let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size <= lrf_unit_size / 2 || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2; let shifted_stretch = ((fi.width >> xdec) - 1) % (lrf_unit_size >> 1) <= lrf_unit_size / 4 || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1) <= lrf_unit_size / 4; if unshifted_stretch && !shifted_stretch { 1 // shift to eliminate stretch } else { 0 // don't shift; save the signaling bits } } } else { 0 }; (lrf_base_shift, lrf_base_shift + lrf_chroma_shift) } else { // Explicit request to tie LRU size to superblock size == // smallest possible LRU size let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 }; (lrf_y_shift, lrf_y_shift + stripe_uv_decimate) }; let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift); let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift); let tiling = fi.sequence.tiling; // Right now we defer to tiling setup: don't choose an LRU size // large enough that a tile is not an integer number of LRUs // wide/high. if tiling.cols > 1 || tiling.rows > 1 { // despite suggestions to the contrary, tiles can be // non-powers-of-2. let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize; let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize; let tile_aligned_y_unit_size = 1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros)); let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros); let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros); y_unit_size = y_unit_size.min(tile_aligned_y_unit_size); uv_unit_size = uv_unit_size .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size)); // But it's actually worse: LRUs can't span tiles (in our // one-pass design that is, spec allows it). However, the spec // mandates the last LRU stretches forward into any // less-than-half-LRU span of superblocks at the right and // bottom of a frame. These superblocks may well be in a // different tile! Even if LRUs are minimum size (one // superblock), when the right or bottom edge of the frame is a // superblock that's less than half the width/height of a normal // superblock, the LRU is forced by the spec to span into it // (and thus a different tile). Tiling is under no such // restriction; it could decide the right/left sliver will be in // its own tile row/column. We can't disallow the combination // here. The tiling code will have to either prevent it or // tolerate it. (prayer mechanic == Issue #1629). } // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes // to be the same*. If they differ at this // point, it's due to a tiling restriction enforcing a maximum // size, so force both to the smaller value. // // *see sec 5.9.20, "Loop restoration params syntax". The // bitstream provides means of coding a different UV LRU size only // when chroma is in use and both x and y are subsampled in the // chroma planes. 
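    // Here ydec == 0 corresponds to the 4:2:2 and 4:4:4 cases: with no
    // vertical chroma subsampling the bitstream cannot code a distinct UV LRU
    // size, so the smaller of the two sizes is used for both planes.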
if ydec == 0 && y_unit_size != uv_unit_size { y_unit_size = uv_unit_size.min(y_unit_size); uv_unit_size = y_unit_size; } // derive the rest let y_unit_log2 = y_unit_size.ilog() - 1; let uv_unit_log2 = uv_unit_size.ilog() - 1; let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1); let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1); let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec) + (uv_unit_size >> 1)) / uv_unit_size) .max(1); let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec) + (uv_unit_size >> 1)) / uv_unit_size) .max(1); RestorationState { planes: [ RestorationPlane::new( RESTORE_SWITCHABLE, y_unit_size, y_unit_log2 - y_sb_log2, y_unit_log2 - y_sb_log2, fi.sb_width, fi.sb_height, 0, y_cols, y_rows, ), RestorationPlane::new( RESTORE_SWITCHABLE, uv_unit_size, uv_unit_log2 - uv_sb_h_log2, uv_unit_log2 - uv_sb_v_log2, fi.sb_width, fi.sb_height, stripe_uv_decimate, uv_cols, uv_rows, ), RestorationPlane::new( RESTORE_SWITCHABLE, uv_unit_size, uv_unit_log2 - uv_sb_h_log2, uv_unit_log2 - uv_sb_v_log2, fi.sb_width, fi.sb_height, stripe_uv_decimate, uv_cols, uv_rows, ), ], } } #[hawktracer(lrf_filter_frame)] pub fn lrf_filter_frame( &mut self, out: &mut Frame, pre_cdef: &Frame, fi: &FrameInvariants, ) { let cdeffed = out.clone(); let planes = if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES }; // unlike the other loop filters that operate over the padded // frame dimensions, restoration filtering and source pixel // accesses are clipped to the original frame dimensions // that's why we use fi.width and fi.height instead of PlaneConfig fields // number of stripes (counted according to colocated Y luma position) let stripe_n = (fi.height + 7) / 64 + 1; // Buffers for the stripe filter. let mut stripe_filter_buffer = IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE); for pli in 0..planes { let rp = &self.planes[pli]; let xdec = out.planes[pli].cfg.xdec; let ydec = out.planes[pli].cfg.ydec; let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec; let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec; for si in 0..stripe_n { let (stripe_start_y, stripe_size) = if si == 0 { (0, (64 - 8) >> ydec) } else { let start = (si * 64 - 8) >> ydec; ( start as isize, // one past, unlike spec (64 >> ydec).min(crop_h - start), ) }; // horizontally, go rdu-by-rdu for rux in 0..rp.cfg.cols { // stripe x pixel locations must be clipped to frame, last may need to stretch let x = rux * rp.cfg.unit_size; let size = if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size }; let ru = rp.restoration_unit_by_stripe(si, rux); match ru.filter { RestorationFilter::Wiener { coeffs } => { wiener_stripe_filter( coeffs, fi, crop_w, crop_h, size, stripe_size, x, stripe_start_y, &cdeffed.planes[pli], &pre_cdef.planes[pli], &mut out.planes[pli], ); } RestorationFilter::Sgrproj { set, xqd } => { if !fi.sequence.enable_cdef { continue; } setup_integral_image( &mut stripe_filter_buffer, STRIPE_IMAGE_STRIDE, crop_w - x, (crop_h as isize - stripe_start_y) as usize, size, stripe_size, &cdeffed.planes[pli] .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), &pre_cdef.planes[pli] .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), ); sgrproj_stripe_filter( set, xqd, fi, &stripe_filter_buffer, STRIPE_IMAGE_STRIDE, &cdeffed.planes[pli] .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), &mut out.planes[pli].region_mut(Area::Rect { x: x as isize, y: stripe_start_y, width: size, height: stripe_size, }), ); } RestorationFilter::None => { // do nothing } } } } } } }
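// A minimal sketch of a unit test (not part of the upstream suite) that
// exercises the edge clamping of HorzPaddedIter: indices before the start of
// the slice repeat the first element and indices past the end repeat the
// last one. The module name and values are illustrative only.
#[cfg(test)]
mod lrf_doc_tests {
  use super::HorzPaddedIter;

  #[test]
  fn horz_padded_iter_clamps_to_edges() {
    // Three pixels, iterated starting two positions to the left of the slice
    // and padded out to seven elements in total.
    let data: [u16; 3] = [1, 2, 3];
    let out: Vec<u16> = HorzPaddedIter::new(&data, -2, 7).copied().collect();
    assert_eq!(out, vec![1, 1, 1, 2, 3, 3, 3]);
  }
}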