1 // Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
2 //
3 // This source code is subject to the terms of the BSD 2 Clause License and
4 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5 // was not distributed with this source code in the LICENSE file, you can
6 // obtain it at www.aomedia.org/license/software. If the Alliance for Open
7 // Media Patent License 1.0 was not distributed with this source code in the
8 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9 
10 cfg_if::cfg_if! {
11   if #[cfg(nasm_x86_64)] {
12     use crate::asm::x86::lrf::*;
13   } else {
14     use self::rust::*;
15   }
16 }
17 
18 use crate::color::ChromaSampling::Cs400;
19 use crate::context::{MAX_PLANES, SB_SIZE};
20 use crate::encoder::FrameInvariants;
21 use crate::frame::{
22   AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
23 };
24 use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
25 use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
26 use rust_hawktracer::*;
27 
28 use crate::api::SGRComplexityLevel;
29 use std::cmp;
30 use std::iter::FusedIterator;
31 use std::ops::{Index, IndexMut};
32 
// log2 of the maximum restoration unit (loop-restoration tile) size.
pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;

// Restoration filter type identifiers.
pub const RESTORE_NONE: u8 = 0;
pub const RESTORE_SWITCHABLE: u8 = 1;
pub const RESTORE_WIENER: u8 = 2;
pub const RESTORE_SGRPROJ: u8 = 3;

// Per-tap min / mid / max values for the Wiener filter taps.
pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
#[allow(unused)]
pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
pub const WIENER_BITS: usize = 7;

// Min / mid / max values for the two sgrproj projection coefficients (xqd),
// followed by the fixed-point precisions used by the self-guided filter.
pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
pub const SGRPROJ_PRJ_BITS: u8 = 7;
pub const SGRPROJ_PARAMS_BITS: u8 = 4;
pub const SGRPROJ_MTABLE_BITS: u8 = 20;
pub const SGRPROJ_SGR_BITS: u8 = 8;
pub const SGRPROJ_RECIP_BITS: u8 = 12;
pub const SGRPROJ_RST_BITS: u8 = 4;
// Self-guided filter strength parameters, indexed by sgr parameter "set".
// Column 0 is the strength for the radius-2 pass, column 1 for the radius-1
// pass; a value of zero disables that pass (see the `s_r2 > 0` / `s_r1 > 0`
// checks in `sgrproj_stripe_filter`).
pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
  [140, 3236],
  [112, 2158],
  [93, 1618],
  [80, 1438],
  [70, 1295],
  [58, 1177],
  [47, 1079],
  [37, 996],
  [30, 925],
  [25, 863],
  [0, 2589],
  [0, 1618],
  [0, 1177],
  [0, 925],
  [56, 0],
  [22, 0],
];
75 
// Lists of indices into SGRPROJ_PARAMS_S that are used at a given complexity
// level.
// SGRPROJ_ALL_SETS contains every possible index.
const SGRPROJ_ALL_SETS: &[u8] =
  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
// most of the gains from sgr. The decision of which values to use is somewhat
// arbitrary. The sgr parameters have 3 discontinuous groups. The first has
// both parameters as non-zero. The other two are distinguishable by which of
// the two parameters is zero. There is an even number of entries in each of
// these groups and the non-zero parameters grow as the indices increase.
// This array uses the 1st, 3rd, ... smallest params of each group.
const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];
88 
get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8]89 pub fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
90   match complexity {
91     SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
92     SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
93   }
94 }
95 
// Dimensions of the scratch windows used by `sgrproj_solve` (square, sized
// for a full restoration unit) and `sgrproj_stripe_filter` (one stripe of up
// to 64 rows). Strides include 6 + 2 extra columns for filter support and
// padding.
pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2;
pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2;
pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;

pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2)
  + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1));
pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2;
pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2;
pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;

// Compile-time maximum of the two widths: the array-indexing trick selects
// the larger value in a const context.
pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
  [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];
109 
/// The buffer used in `sgrproj_stripe_filter()` and `sgrproj_solve()`.
///
/// Holds an integral image of the source pixels and one of their squares,
/// laid out with the same stride.
#[derive(Debug)]
pub struct IntegralImageBuffer {
  pub integral_image: Vec<u32>,
  pub sq_integral_image: Vec<u32>,
}

impl IntegralImageBuffer {
  /// Creates a new buffer with the given size, filled with zeros.
  #[inline]
  pub fn zeroed(size: usize) -> Self {
    let integral_image = vec![0; size];
    let sq_integral_image = vec![0; size];
    IntegralImageBuffer { integral_image, sq_integral_image }
  }
}
124 
#[allow(unused)] // Wiener coming soon!
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RestorationFilter {
  None,
  Wiener { coeffs: [[i8; 3]; 2] },
  Sgrproj { set: u8, xqd: [i8; 2] },
}

impl Default for RestorationFilter {
  fn default() -> RestorationFilter {
    RestorationFilter::None
  }
}

impl RestorationFilter {
  /// Returns `true` iff `self` and `cmp` differ in variant or in any
  /// filter parameter.
  ///
  /// `PartialEq`/`Eq` are derived on the enum, so the derived `==` already
  /// compares every field of every variant; the previous hand-written
  /// field-by-field comparison duplicated that and would silently need
  /// updating whenever a variant gained a field.
  pub fn notequal(self, cmp: RestorationFilter) -> bool {
    self != cmp
  }
}
165 
// Rust (non-SIMD) implementations of the self-guided filter kernels.
// `cfg_if` at the top of the file selects these when no assembly is
// available; each function therefore takes a `_cpu: CpuFeatureLevel`
// argument to keep the signature identical to the asm dispatchers.
pub(crate) mod rust {
  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::PlaneSlice;
  use crate::lrf::{
    get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
    SGRPROJ_SGR_BITS,
  };
  use crate::util::CastFromPrimitive;
  use crate::Pixel;

  // Computes one row of intermediate (a, b) coefficients for the box filter
  // of radius `r`, for columns `start_x..stripe_w + 2`, reading box sums out
  // of the integral images `iimg` (pixels) and `iimg_sq` (squared pixels).
  #[inline(always)]
  pub(crate) fn sgrproj_box_ab_internal(
    r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
    bdm8: usize,
  ) {
    // Box side length and pixel count for radius r.
    let d: usize = r * 2 + 1;
    let n: usize = d * d;
    // Fixed-point reciprocal of n in Q(SGRPROJ_RECIP_BITS):
    // 455 ~= 4096 / 9 (r == 1), 164 ~= 4096 / 25 (r == 2).
    let one_over_n = if r == 1 { 455 } else { 164 };

    for x in start_x..stripe_w + 2 {
      // Box sum and box sum-of-squares via four integral-image lookups each.
      let sum = get_integral_square(iimg, iimg_stride, x, y, d);
      let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
      let (reta, retb) =
        sgrproj_sum_finish(ssq, sum, n as u32, one_over_n, s, bdm8);
      af[x] = reta;
      bf[x] = retb;
    }
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r1(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal(
      1,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r2(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal(
      2,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
  }

  // Pass-through "filter" used when a pass is disabled (s == 0): copies the
  // source row into `f` scaled up by SGRPROJ_RST_BITS of extra precision.
  pub(crate) fn sgrproj_box_f_r0<T: Pixel>(
    f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r0_internal<T: Pixel>(
    f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let line = cdeffed.row(y);
    for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) {
      *fp = u32::cast_from(v) << SGRPROJ_RST_BITS;
    }
  }

  // Radius-1 filter output row: applies the (a, b) coefficients from the
  // three rows in `af`/`bf` to source row `y`, writing `w` values into `f`.
  pub(crate) fn sgrproj_box_f_r1<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
    cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r1_internal<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize,
    y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    for x in start_x..w {
      // 3x3 weighted sums of a and b: corner neighbors get weight 3,
      // edge neighbors and the center get weight 4.
      let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2])
        + 4
          * (af[1][x]
            + af[0][x + 1]
            + af[1][x + 1]
            + af[2][x + 1]
            + af[1][x + 2]);
      let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2])
        + 4
          * (bf[1][x]
            + bf[0][x + 1]
            + bf[1][x + 1]
            + bf[2][x + 1]
            + bf[1][x + 2]);
      // f = (a * src + b) with rounding shift back to working precision.
      let v = a * u32::cast_from(line[x]) + b;
      f[x] = (v + (1 << shift >> 1)) >> shift;
    }
  }

  // Radius-2 filter output for rows y and y+1 at once: the r=2 pass only
  // computes (a, b) on every other row, so row y blends both coefficient
  // rows while row y+1 reuses only the following coefficient row.
  pub(crate) fn sgrproj_box_f_r2<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    y: usize, w: usize, cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r2_internal<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    // Row y sums two coefficient rows, so it shifts by one more bit than
    // row y+1 (shifto), which only uses one coefficient row.
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    let line1 = cdeffed.row(y + 1);

    // Sliding 3-wide windows over both coefficient rows.
    let af0 = af[0][start_x..w + 3].windows(3);
    let af1 = af[1][start_x..w + 3].windows(3);
    let bf0 = bf[0][start_x..w + 3].windows(3);
    let bf1 = bf[1][start_x..w + 3].windows(3);

    let af_it = af0.zip(af1);
    let bf_it = bf0.zip(bf1);

    let in0 = line[start_x..w].iter();
    let in1 = line1[start_x..w].iter();

    let o0 = f0[start_x..w].iter_mut();
    let o1 = f1[start_x..w].iter_mut();

    let in_iter = in0.zip(in1);
    let out_iter = o0.zip(o1);

    let io_iter = out_iter.zip(in_iter);

    for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in
      io_iter.zip(af_it.zip(bf_it))
    {
      // Horizontal (5, 6, 5) weighting of each coefficient row.
      let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1];
      let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1];
      let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1];
      let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1];
      let v = (a + ao) * u32::cast_from(p0) + b + bo;
      *o0 = (v + (1 << shift >> 1)) >> shift;
      let vo = ao * u32::cast_from(p1) + bo;
      *o1 = (vo + (1 << shifto >> 1)) >> shifto;
    }
  }
}
339 
// Converts a box sum and sum-of-squares into the self-guided filter's
// (a, b) pair for one box: `a` is the gain in Q(SGRPROJ_SGR_BITS), `b` the
// matching offset in Q(SGRPROJ_RECIP_BITS) (rounded off before returning).
// `bdm8` is bit_depth - 8; `s` is the strength from SGRPROJ_PARAMS_S.
#[inline(always)]
fn sgrproj_sum_finish(
  ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32, bdm8: usize,
) -> (u32, u32) {
  // Round the sums down to 8-bit-equivalent precision.
  let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8);
  let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8;
  // p = max(0, n * sum(x^2) - sum(x)^2): n^2 times the box variance.
  let p =
    cmp::max(0, (scaled_ssq * n) as i32 - (scaled_sum * scaled_sum) as i32)
      as u32;
  // z = variance scaled by the strength s, rounded out of Q(MTABLE_BITS).
  let z = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS;
  // a = round(256 * z / (z + 1)), saturated at the extremes.
  let a = if z >= 255 {
    256
  } else if z == 0 {
    1
  } else {
    ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1)
  };
  // b = (256 - a) * mean, with one_over_n the Q12 reciprocal of n.
  let b = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n;
  (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS)
}
360 
361 // Using an integral image, compute the sum of a square region
get_integral_square( iimg: &[u32], stride: usize, x: usize, y: usize, size: usize, ) -> u32362 fn get_integral_square(
363   iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
364 ) -> u32 {
365   // Cancel out overflow in iimg by using wrapping arithmetic
366   iimg[y * stride + x]
367     .wrapping_add(iimg[(y + size) * stride + x + size])
368     .wrapping_sub(iimg[(y + size) * stride + x])
369     .wrapping_sub(iimg[y * stride + x + size])
370 }
371 
// Iterator over the rows a stripe filter reads, vertically padded and
// clipped: rows beyond the plane repeat the edge row (via clamping in
// `next()`), and each row is sourced from either the cdeffed plane (inside
// the stripe) or the deblocked plane (outside it).
struct VertPaddedIter<'a, T: Pixel> {
  // The two sources that can be selected when clipping
  deblocked: &'a Plane<T>,
  cdeffed: &'a Plane<T>,
  // x index choosing where on the row to start
  x: isize,
  // y index that will be mutated as the iterator advances
  y: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: isize,
  // Used for source buffer choice/clipping. May (and regularly will)
  // be negative.
  stripe_begin: isize,
  // Also used for source buffer choice/clipping. May specify a stripe boundary
  // less than, equal to, or larger than the buffers we're accessing.
  stripe_end: isize,
  // Active area cropping is done by specifying a value smaller than the height
  // of the plane.
  crop: isize,
}
392 
393 impl<'a, 'b, T: Pixel> VertPaddedIter<'a, T> {
new( cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>, stripe_h: usize, crop: usize, ) -> VertPaddedIter<'a, T>394   fn new(
395     cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>,
396     stripe_h: usize, crop: usize,
397   ) -> VertPaddedIter<'a, T> {
398     // cdeffed and deblocked must start at the same coordinates from their
399     // underlying planes. Since cropping is provided via a separate params, the
400     // height of the underlying planes do not need to match.
401     assert_eq!(cdeffed.x, deblocked.x);
402     assert_eq!(cdeffed.y, deblocked.y);
403 
404     // To share integral images, always use the max box filter radius of 2
405     let r = 2;
406 
407     // The number of rows outside the stripe are needed
408     let rows_above = r + 2;
409     let rows_below = 2;
410 
411     // Offset crop and stripe_h so they are relative to the underlying plane
412     // and not the plane slice.
413     let crop = crop as isize + deblocked.y;
414     let stripe_end = stripe_h as isize + deblocked.y;
415 
416     // Move y up the number rows above.
417     // If y is negative we repeat the first row
418     let y = deblocked.y - rows_above as isize;
419 
420     VertPaddedIter {
421       deblocked: deblocked.plane,
422       cdeffed: cdeffed.plane,
423       x: deblocked.x,
424       y,
425       end: (rows_above + stripe_h + rows_below) as isize + y,
426       stripe_begin: deblocked.y,
427       stripe_end,
428       crop,
429     }
430   }
431 }
432 
433 impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> {
434   type Item = &'a [T];
435 
436   #[inline(always)]
next(&mut self) -> Option<Self::Item>437   fn next(&mut self) -> Option<Self::Item> {
438     if self.end > self.y {
439       // clamp before deciding the source
440       // clamp vertically to storage at top and passed-in height at bottom
441       let cropped_y = clamp(self.y, 0, self.crop - 1);
442       // clamp vertically to stripe limits
443       let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1);
444 
445       // decide if we're vertically inside or outside the strip
446       let src_plane =
447         if ly >= self.stripe_begin && ly < self.stripe_end as isize {
448           self.cdeffed
449         } else {
450           self.deblocked
451         };
452       // cannot directly return self.ps.row(row) due to lifetime issue
453       let range = src_plane.row_range(self.x, ly);
454       self.y += 1;
455       Some(&src_plane.data[range])
456     } else {
457       None
458     }
459   }
460 
size_hint(&self) -> (usize, Option<usize>)461   fn size_hint(&self) -> (usize, Option<usize>) {
462     let remaining = self.end - self.y;
463     debug_assert!(remaining >= 0);
464     let remaining = remaining as usize;
465 
466     (remaining, Some(remaining))
467   }
468 }
469 
// `size_hint()` is exact and `next()` never yields again after returning
// `None` (`y` only increases), so these marker impls are sound.
impl<T: Pixel> ExactSizeIterator for VertPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for VertPaddedIter<'_, T> {}
472 
// Iterator over one horizontally padded row: indices outside the slice are
// clamped so the first/last element repeats at the edges.
struct HorzPaddedIter<'a, T: Pixel> {
  // Active area cropping is done using the length of the slice
  slice: &'a [T],
  // x index of the iterator
  // When less than 0, repeat the first element. When greater than end, repeat
  // the last element
  index: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: usize,
}
483 
484 impl<'a, T: Pixel> HorzPaddedIter<'a, T> {
new( slice: &'a [T], start_index: isize, width: usize, ) -> HorzPaddedIter<'a, T>485   fn new(
486     slice: &'a [T], start_index: isize, width: usize,
487   ) -> HorzPaddedIter<'a, T> {
488     HorzPaddedIter {
489       slice,
490       index: start_index,
491       end: (width as isize + start_index) as usize,
492     }
493   }
494 }
495 
496 impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> {
497   type Item = &'a T;
498 
499   #[inline(always)]
next(&mut self) -> Option<Self::Item>500   fn next(&mut self) -> Option<Self::Item> {
501     if self.index < self.end as isize {
502       // clamp to the edges of the frame
503       let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize;
504       self.index += 1;
505       Some(&self.slice[x])
506     } else {
507       None
508     }
509   }
510 
511   #[inline(always)]
size_hint(&self) -> (usize, Option<usize>)512   fn size_hint(&self) -> (usize, Option<usize>) {
513     let size: usize = (self.end as isize - self.index) as usize;
514     (size, Some(size))
515   }
516 }
517 
// `size_hint()` is exact and `next()` keeps returning `None` once `index`
// reaches `end`, so these marker impls are sound.
impl<T: Pixel> ExactSizeIterator for HorzPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for HorzPaddedIter<'_, T> {}
520 
setup_integral_image<T: Pixel>( integral_image_buffer: &mut IntegralImageBuffer, integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>, )521 pub fn setup_integral_image<T: Pixel>(
522   integral_image_buffer: &mut IntegralImageBuffer,
523   integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize,
524   stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>,
525 ) {
526   let integral_image = &mut integral_image_buffer.integral_image;
527   let sq_integral_image = &mut integral_image_buffer.sq_integral_image;
528 
529   // Number of elements outside the stripe
530   let left_w = 4; // max radius of 2 + 2 padding
531   let right_w = 3; // max radius of 2 + 1 padding
532 
533   assert_eq!(cdeffed.x, deblocked.x);
534 
535   // Find how many unique elements to use to the left and right
536   let left_uniques = if cdeffed.x == 0 { 0 } else { left_w };
537   let right_uniques = right_w.min(crop_w - stripe_w);
538 
539   // Find the total number of unique elements used
540   let row_uniques = left_uniques + stripe_w + right_uniques;
541 
542   // Negative start indices result in repeating the first element of the row
543   let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 };
544 
545   let mut rows_iter = VertPaddedIter::new(
546     // Move left to encompass all the used data
547     &cdeffed.go_left(left_uniques),
548     &deblocked.go_left(left_uniques),
549     // since r2 uses every other row, we need an extra row if stripe_h is odd
550     stripe_h + (stripe_h & 1),
551     crop_h,
552   )
553   .map(|row: &[T]| {
554     HorzPaddedIter::new(
555       // Limit how many unique elements we use
556       &row[..row_uniques],
557       start_index_x,
558       left_w + stripe_w + right_w,
559     )
560   });
561 
562   // Setup the first row
563   {
564     let mut sum: u32 = 0;
565     let mut sq_sum: u32 = 0;
566     // Remove the first row and use it outside of the main loop
567     let row = rows_iter.next().unwrap();
568     for (src, (integral, sq_integral)) in
569       row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut()))
570     {
571       let current = u32::cast_from(*src);
572 
573       // Wrap adds to prevent undefined behaviour on overflow. Overflow is
574       // cancelled out when calculating the sum of a region.
575       sum = sum.wrapping_add(current);
576       *integral = sum;
577       sq_sum = sq_sum.wrapping_add(current * current);
578       *sq_integral = sq_sum;
579     }
580   }
581   // Calculate all other rows
582   let mut integral_slice = &mut integral_image[..];
583   let mut sq_integral_slice = &mut sq_integral_image[..];
584   for row in rows_iter {
585     let mut sum: u32 = 0;
586     let mut sq_sum: u32 = 0;
587 
588     // Split the data between the previous row and future rows.
589     // This allows us to mutate the current row while accessing the
590     // previous row.
591     let (integral_row_prev, integral_row) =
592       integral_slice.split_at_mut(integral_image_stride);
593     let (sq_integral_row_prev, sq_integral_row) =
594       sq_integral_slice.split_at_mut(integral_image_stride);
595     for (
596       src,
597       ((integral_above, sq_integral_above), (integral, sq_integral)),
598     ) in row.zip(
599       integral_row_prev
600         .iter()
601         .zip(sq_integral_row_prev.iter())
602         .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())),
603     ) {
604       let current = u32::cast_from(*src);
605       // Wrap adds to prevent undefined behaviour on overflow. Overflow is
606       // cancelled out when calculating the sum of a region.
607       sum = sum.wrapping_add(current);
608       *integral = sum.wrapping_add(*integral_above);
609       sq_sum = sq_sum.wrapping_add(current * current);
610       *sq_integral = sq_sum.wrapping_add(*sq_integral_above);
611     }
612 
613     // The current row also contains all future rows. Replacing the slice with
614     // it moves down a row.
615     integral_slice = integral_row;
616     sq_integral_slice = sq_integral_row;
617   }
618 }
619 
sgrproj_stripe_filter<T: Pixel, U: Pixel>( set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>, )620 pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
621   set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
622   integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
623   cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
624 ) {
625   let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect();
626   let bdm8 = fi.sequence.bit_depth - 8;
627   let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
628     [[0; IMAGE_WIDTH_MAX + 2]; 2];
629   let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
630     [[0; IMAGE_WIDTH_MAX + 2]; 2];
631   let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
632   let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
633   let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
634     [[0; IMAGE_WIDTH_MAX + 2]; 3];
635   let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
636     [[0; IMAGE_WIDTH_MAX + 2]; 3];
637   let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
638 
639   let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
640   let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
641 
642   /* prime the intermediate arrays */
643   // One oddness about the radius=2 intermediate array computations that
644   // the spec doesn't make clear: Although the spec defines computation
645   // of every row (of a, b and f), only half of the rows (every-other
646   // row) are actually used.
647   let integral_image = &integral_image_buffer.integral_image;
648   let sq_integral_image = &integral_image_buffer.sq_integral_image;
649   if s_r2 > 0 {
650     sgrproj_box_ab_r2(
651       &mut a_r2[0],
652       &mut b_r2[0],
653       integral_image,
654       sq_integral_image,
655       integral_image_stride,
656       0,
657       stripe_w,
658       s_r2,
659       bdm8,
660       fi.cpu_feature_level,
661     );
662   }
663   if s_r1 > 0 {
664     let integral_image_offset = integral_image_stride + 1;
665     sgrproj_box_ab_r1(
666       &mut a_r1[0],
667       &mut b_r1[0],
668       &integral_image[integral_image_offset..],
669       &sq_integral_image[integral_image_offset..],
670       integral_image_stride,
671       0,
672       stripe_w,
673       s_r1,
674       bdm8,
675       fi.cpu_feature_level,
676     );
677     sgrproj_box_ab_r1(
678       &mut a_r1[1],
679       &mut b_r1[1],
680       &integral_image[integral_image_offset..],
681       &sq_integral_image[integral_image_offset..],
682       integral_image_stride,
683       1,
684       stripe_w,
685       s_r1,
686       bdm8,
687       fi.cpu_feature_level,
688     );
689   }
690 
691   /* iterate by row */
692   // Increment by two to handle the use of even rows by r=2 and run a nested
693   //  loop to handle increments of one.
694   for y in (0..stripe_h).step_by(2) {
695     // get results to use y and y+1
696     let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
697       sgrproj_box_ab_r2(
698         &mut a_r2[(y / 2 + 1) % 2],
699         &mut b_r2[(y / 2 + 1) % 2],
700         integral_image,
701         sq_integral_image,
702         integral_image_stride,
703         y + 2,
704         stripe_w,
705         s_r2,
706         bdm8,
707         fi.cpu_feature_level,
708       );
709       let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
710       let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
711       sgrproj_box_f_r2(
712         &ap0,
713         &bp0,
714         &mut f_r2_0,
715         &mut f_r2_1,
716         y,
717         stripe_w,
718         cdeffed,
719         fi.cpu_feature_level,
720       );
721       [&f_r2_0, &f_r2_1]
722     } else {
723       sgrproj_box_f_r0(
724         &mut f_r2_0,
725         y,
726         stripe_w,
727         cdeffed,
728         fi.cpu_feature_level,
729       );
730       // share results for both rows
731       [&f_r2_0, &f_r2_0]
732     };
733     for dy in 0..(2.min(stripe_h - y)) {
734       let y = y + dy;
735       if s_r1 > 0 {
736         let integral_image_offset = integral_image_stride + 1;
737         sgrproj_box_ab_r1(
738           &mut a_r1[(y + 2) % 3],
739           &mut b_r1[(y + 2) % 3],
740           &integral_image[integral_image_offset..],
741           &sq_integral_image[integral_image_offset..],
742           integral_image_stride,
743           y + 2,
744           stripe_w,
745           s_r1,
746           bdm8,
747           fi.cpu_feature_level,
748         );
749         let ap1: [&[u32]; 3] =
750           [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
751         let bp1: [&[u32]; 3] =
752           [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
753         sgrproj_box_f_r1(
754           &ap1,
755           &bp1,
756           &mut f_r1,
757           y,
758           stripe_w,
759           cdeffed,
760           fi.cpu_feature_level,
761         );
762       } else {
763         sgrproj_box_f_r0(
764           &mut f_r1,
765           y,
766           stripe_w,
767           cdeffed,
768           fi.cpu_feature_level,
769         );
770       }
771 
772       /* apply filter */
773       let w0 = xqd[0] as i32;
774       let w1 = xqd[1] as i32;
775       let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1;
776 
777       let line = &cdeffed[y];
778 
779       #[inline(always)]
apply_filter<U: Pixel>( out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32], stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32, )780       fn apply_filter<U: Pixel>(
781         out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
782         stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
783       ) {
784         let line_it = line[..stripe_w].iter();
785         let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
786         let f_r1_it = f_r1[..stripe_w].iter();
787         let out_it = out[..stripe_w].iter_mut();
788 
789         for ((o, &u), (&f_r2_ab, &f_r1)) in
790           out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it))
791         {
792           let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
793           let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
794           let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
795             >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
796           *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
797         }
798       }
799 
800       apply_filter(
801         &mut out[y],
802         line,
803         &f_r1,
804         f_r2_ab[dy],
805         stripe_w,
806         fi.sequence.bit_depth,
807         w0,
808         w1,
809         w2,
810       );
811     }
812   }
813 }
814 
815 // Frame inputs below aren't all equal, and will change as work
816 // continues.  There's no deblocked reconstruction available at this
817 // point of RDO, so we use the non-deblocked reconstruction, cdef and
818 // input.  The input can be a full-sized frame. Cdef input is a partial
819 // frame constructed specifically for RDO.
820 
821 // For simplicity, this ignores stripe segmentation (it's possible the
822 // extra complexity isn't worth it and we'll ignore stripes
823 // permanently during RDO, but that's not been tested yet). Data
824 // access inside the cdef frame is monolithic and clipped to the cdef
825 // borders.
826 
827 // Input params follow the same rules as sgrproj_stripe_filter.
828 // Inputs are relative to the colocated slice views.
sgrproj_solve<T: Pixel>( set: u8, fi: &FrameInvariants<T>, integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>, cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize, ) -> (i8, i8)829 pub fn sgrproj_solve<T: Pixel>(
830   set: u8, fi: &FrameInvariants<T>,
831   integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
832   cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
833 ) -> (i8, i8) {
834   let bdm8 = fi.sequence.bit_depth - 8;
835 
836   let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
837     [[0; IMAGE_WIDTH_MAX + 2]; 2];
838   let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
839     [[0; IMAGE_WIDTH_MAX + 2]; 2];
840   let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
841   let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
842   let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
843     [[0; IMAGE_WIDTH_MAX + 2]; 3];
844   let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
845     [[0; IMAGE_WIDTH_MAX + 2]; 3];
846   let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
847 
848   let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
849   let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
850 
851   let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
852   let mut c: [f64; 2] = [0., 0.];
853 
854   /* prime the intermediate arrays */
855   // One oddness about the radius=2 intermediate array computations that
856   // the spec doesn't make clear: Although the spec defines computation
857   // of every row (of a, b and f), only half of the rows (every-other
858   // row) are actually used.
859   let integral_image = &integral_image_buffer.integral_image;
860   let sq_integral_image = &integral_image_buffer.sq_integral_image;
861   if s_r2 > 0 {
862     sgrproj_box_ab_r2(
863       &mut a_r2[0],
864       &mut b_r2[0],
865       integral_image,
866       sq_integral_image,
867       SOLVE_IMAGE_STRIDE,
868       0,
869       cdef_w,
870       s_r2,
871       bdm8,
872       fi.cpu_feature_level,
873     );
874   }
875   if s_r1 > 0 {
876     let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
877     sgrproj_box_ab_r1(
878       &mut a_r1[0],
879       &mut b_r1[0],
880       &integral_image[integral_image_offset..],
881       &sq_integral_image[integral_image_offset..],
882       SOLVE_IMAGE_STRIDE,
883       0,
884       cdef_w,
885       s_r1,
886       bdm8,
887       fi.cpu_feature_level,
888     );
889     sgrproj_box_ab_r1(
890       &mut a_r1[1],
891       &mut b_r1[1],
892       &integral_image[integral_image_offset..],
893       &sq_integral_image[integral_image_offset..],
894       SOLVE_IMAGE_STRIDE,
895       1,
896       cdef_w,
897       s_r1,
898       bdm8,
899       fi.cpu_feature_level,
900     );
901   }
902 
903   /* iterate by row */
904   // Increment by two to handle the use of even rows by r=2 and run a nested
905   //  loop to handle increments of one.
906   for y in (0..cdef_h).step_by(2) {
907     // get results to use y and y+1
908     let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
909       sgrproj_box_ab_r2(
910         &mut a_r2[(y / 2 + 1) % 2],
911         &mut b_r2[(y / 2 + 1) % 2],
912         integral_image,
913         sq_integral_image,
914         SOLVE_IMAGE_STRIDE,
915         y + 2,
916         cdef_w,
917         s_r2,
918         bdm8,
919         fi.cpu_feature_level,
920       );
921       let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
922       let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
923       sgrproj_box_f_r2(
924         &ap0,
925         &bp0,
926         &mut f_r2_0,
927         &mut f_r2_1,
928         y,
929         cdef_w,
930         cdeffed,
931         fi.cpu_feature_level,
932       );
933       [&f_r2_0, &f_r2_1]
934     } else {
935       sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level);
936       // share results for both rows
937       [&f_r2_0, &f_r2_0]
938     };
939     for dy in 0..(2.min(cdef_h - y)) {
940       let y = y + dy;
941       if s_r1 > 0 {
942         let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
943         sgrproj_box_ab_r1(
944           &mut a_r1[(y + 2) % 3],
945           &mut b_r1[(y + 2) % 3],
946           &integral_image[integral_image_offset..],
947           &sq_integral_image[integral_image_offset..],
948           SOLVE_IMAGE_STRIDE,
949           y + 2,
950           cdef_w,
951           s_r1,
952           bdm8,
953           fi.cpu_feature_level,
954         );
955         let ap1: [&[u32]; 3] =
956           [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
957         let bp1: [&[u32]; 3] =
958           [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
959         sgrproj_box_f_r1(
960           &ap1,
961           &bp1,
962           &mut f_r1,
963           y,
964           cdef_w,
965           cdeffed,
966           fi.cpu_feature_level,
967         );
968       } else {
969         sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level);
970       }
971 
972       #[inline(always)]
973       fn process_line<T: Pixel>(
974         h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T],
975         f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
976       ) {
977         let cdeffed_it = cdeffed[..cdef_w].iter();
978         let input_it = input[..cdef_w].iter();
979         let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
980         let f_r1_it = f_r1[..cdef_w].iter();
981 
982         #[derive(Debug, Copy, Clone)]
983         struct Sums {
984           h: [[i64; 2]; 2],
985           c: [i64; 2],
986         }
987 
988         let sums: Sums = cdeffed_it
989           .zip(input_it)
990           .zip(f_r2_ab_it.zip(f_r1_it))
991           .map(|((&u, &i), (&f2, &f1))| {
992             let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
993             let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
994             let f2 = f2 as i32 - u;
995             let f1 = f1 as i32 - u;
996             (s as i64, f1 as i64, f2 as i64)
997           })
998           .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| {
999             let mut ret: Sums = sums;
1000             ret.h[0][0] += f2 * f2;
1001             ret.h[1][1] += f1 * f1;
1002             ret.h[0][1] += f1 * f2;
1003             ret.c[0] += f2 * s;
1004             ret.c[1] += f1 * s;
1005             ret
1006           });
1007 
1008         h[0][0] += sums.h[0][0] as f64;
1009         h[1][1] += sums.h[1][1] as f64;
1010         h[0][1] += sums.h[0][1] as f64;
1011         c[0] += sums.c[0] as f64;
1012         c[1] += sums.c[1] as f64;
1013       }
1014 
1015       process_line(
1016         &mut h,
1017         &mut c,
1018         &cdeffed[y],
1019         &input[y],
1020         &f_r1,
1021         f_r2_01[dy],
1022         cdef_w,
1023       );
1024     }
1025   }
1026 
1027   // this is lifted almost in-tact from libaom
1028   let n = cdef_w as f64 * cdef_h as f64;
1029   h[0][0] /= n;
1030   h[0][1] /= n;
1031   h[1][1] /= n;
1032   h[1][0] = h[0][1];
1033   c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1034   c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1035   let (xq0, xq1) = if s_r2 == 0 {
1036     // H matrix is now only the scalar h[1][1]
1037     // C vector is now only the scalar c[1]
1038     if h[1][1] == 0. {
1039       (0, 0)
1040     } else {
1041       (0, (c[1] / h[1][1]).round() as i32)
1042     }
1043   } else if s_r1 == 0 {
1044     // H matrix is now only the scalar h[0][0]
1045     // C vector is now only the scalar c[0]
1046     if h[0][0] == 0. {
1047       (0, 0)
1048     } else {
1049       ((c[0] / h[0][0]).round() as i32, 0)
1050     }
1051   } else {
1052     let det = h[0][0] * h[1][1] - h[0][1] * h[1][0];
1053     if det == 0. {
1054       (0, 0)
1055     } else {
1056       // If scaling up dividend would overflow, instead scale down the divisor
1057       let div1 = h[1][1] * c[0] - h[0][1] * c[1];
1058       let div2 = h[0][0] * c[1] - h[1][0] * c[0];
1059       ((div1 / det).round() as i32, (div2 / det).round() as i32)
1060     }
1061   };
1062   {
1063     let xqd0 =
1064       clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32);
1065     let xqd1 = clamp(
1066       (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1,
1067       SGRPROJ_XQD_MIN[1] as i32,
1068       SGRPROJ_XQD_MAX[1] as i32,
1069     );
1070     (xqd0 as i8, xqd1 as i8)
1071   }
1072 }
1073 
wiener_stripe_filter<T: Pixel>( coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize, cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>, )1074 fn wiener_stripe_filter<T: Pixel>(
1075   coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
1076   stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
1077   cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
1078 ) {
1079   let bit_depth = fi.sequence.bit_depth;
1080   let round_h = if bit_depth == 12 { 5 } else { 3 };
1081   let round_v = if bit_depth == 12 { 9 } else { 11 };
1082   let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
1083   let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;
1084 
1085   let mut coeffs_ = [[0; 3]; 2];
1086   for i in 0..2 {
1087     for j in 0..3 {
1088       coeffs_[i][j] = i32::from(coeffs[i][j]);
1089     }
1090   }
1091 
1092   let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7];
1093   let vfilter: [i32; 7] = [
1094     coeffs_[0][0],
1095     coeffs_[0][1],
1096     coeffs_[0][2],
1097     128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]),
1098     coeffs_[0][2],
1099     coeffs_[0][1],
1100     coeffs_[0][0],
1101   ];
1102   let hfilter: [i32; 7] = [
1103     coeffs_[1][0],
1104     coeffs_[1][1],
1105     coeffs_[1][2],
1106     128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]),
1107     coeffs_[1][2],
1108     coeffs_[1][1],
1109     coeffs_[1][0],
1110   ];
1111 
1112   // unlike x, our y can be negative to start as the first stripe
1113   // starts off the top of the frame by 8 pixels, and can also run off the end of the frame
1114   let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize;
1115   let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize;
1116   let end_i = cmp::max(
1117     0,
1118     if stripe_h as isize + stripe_y > crop_h as isize {
1119       crop_h as isize - stripe_y - start_wi as isize
1120     } else {
1121       stripe_h as isize - start_wi as isize
1122     },
1123   ) as usize;
1124 
1125   let mut out_slice =
1126     out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize });
1127 
1128   for xi in stripe_x..stripe_x + stripe_w {
1129     let n = cmp::min(7, crop_w as isize + 3 - xi as isize);
1130     for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 {
1131       let mut acc = 0;
1132       let src = if yi < stripe_y {
1133         let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2);
1134         deblocked.row(ly)
1135       } else if yi < stripe_y + stripe_h as isize {
1136         let ly = clamp(yi, 0, crop_h as isize - 1);
1137         cdeffed.row(ly)
1138       } else {
1139         let ly = cmp::min(
1140           clamp(yi, 0, crop_h as isize - 1),
1141           stripe_y + stripe_h as isize + 1,
1142         );
1143         deblocked.row(ly)
1144       };
1145       let start = i32::cast_from(src[0]);
1146       let end = i32::cast_from(src[crop_w - 1]);
1147       for i in 0..3 - xi as isize {
1148         acc += hfilter[i as usize] * start;
1149       }
1150 
1151       let off = 3 - (xi as isize);
1152       let s = cmp::max(0, off) as usize;
1153       let s1 = (s as isize - off) as usize;
1154       let n1 = (n as isize - off) as usize;
1155 
1156       for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) {
1157         acc += hf * i32::cast_from(v);
1158       }
1159 
1160       for i in n..7 {
1161         acc += hfilter[i as usize] * end;
1162       }
1163 
1164       acc = (acc + (1 << round_h >> 1)) >> round_h;
1165       work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset);
1166     }
1167 
1168     for (wi, dst) in (start_wi..start_wi + end_i)
1169       .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i))
1170     {
1171       let mut acc = 0;
1172       for (i, src) in (0..7).zip(work[wi..wi + 7].iter_mut()) {
1173         acc += vfilter[i] * *src;
1174       }
1175       *dst = T::cast_from(clamp(
1176         (acc + (1 << round_v >> 1)) >> round_v,
1177         0,
1178         (1 << bit_depth) - 1,
1179       ));
1180     }
1181   }
1182 }
1183 
/// A single loop-restoration unit: the filter decision for one LRU.
#[derive(Copy, Clone, Debug, Default)]
pub struct RestorationUnit {
  pub filter: RestorationFilter,
}
1188 
/// A 2D grid of restoration units covering one plane of a frame.
#[derive(Clone, Debug)]
pub struct FrameRestorationUnits {
  // row-major storage, `cols * rows` entries; rows are accessed through
  // the Index/IndexMut impls
  units: Box<[RestorationUnit]>,
  pub cols: usize,
  pub rows: usize,
}
1195 
1196 impl FrameRestorationUnits {
new(cols: usize, rows: usize) -> Self1197   pub fn new(cols: usize, rows: usize) -> Self {
1198     Self {
1199       units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(),
1200       cols,
1201       rows,
1202     }
1203   }
1204 }
1205 
1206 impl Index<usize> for FrameRestorationUnits {
1207   type Output = [RestorationUnit];
1208   #[inline(always)]
index(&self, index: usize) -> &Self::Output1209   fn index(&self, index: usize) -> &Self::Output {
1210     &self.units[index * self.cols..(index + 1) * self.cols]
1211   }
1212 }
1213 
1214 impl IndexMut<usize> for FrameRestorationUnits {
1215   #[inline(always)]
index_mut(&mut self, index: usize) -> &mut Self::Output1216   fn index_mut(&mut self, index: usize) -> &mut Self::Output {
1217     &mut self.units[index * self.cols..(index + 1) * self.cols]
1218   }
1219 }
1220 
/// Static loop-restoration configuration for one plane.
#[derive(Clone, Debug)]
pub struct RestorationPlaneConfig {
  // restoration mode signaled for the plane (RESTORE_NONE,
  // RESTORE_SWITCHABLE, RESTORE_WIENER or RESTORE_SGRPROJ)
  pub lrf_type: u8,
  // width/height of an (unstretched) restoration unit, in plane pixels
  pub unit_size: usize,
  // (1 << sb_x_shift) gives the number of superblocks horizontally or
  // vertically in a restoration unit, not accounting for RU stretching
  pub sb_h_shift: usize,
  pub sb_v_shift: usize,
  pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop)
  pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop)
  // stripe height is 64 in all cases except 4:2:0 chroma planes where
  // it is 32.  This is independent of all other setup parameters
  pub stripe_height: usize,
  // dimensions of the restoration-unit grid for this plane
  pub cols: usize,
  pub rows: usize,
}
1237 
/// Per-plane restoration state: the plane's configuration plus the grid
/// of per-unit filter decisions.
#[derive(Clone, Debug)]
pub struct RestorationPlane {
  pub cfg: RestorationPlaneConfig,
  pub units: FrameRestorationUnits,
}
1243 
/// Position within a plane's restoration-unit grid, in (row, col) units.
// Copy and Debug derives added for consistency with the other plain-data
// restoration types in this module (e.g. RestorationUnit); both fields
// are usize, so Copy is free and backward compatible.
#[derive(Copy, Clone, Debug, Default)]
pub struct RestorationPlaneOffset {
  pub row: usize,
  pub col: usize,
}
1249 
1250 impl RestorationPlane {
new( lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize, sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize, rows: usize, ) -> RestorationPlane1251   pub fn new(
1252     lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize,
1253     sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize,
1254     rows: usize,
1255   ) -> RestorationPlane {
1256     let stripe_height = if stripe_decimate != 0 { 32 } else { 64 };
1257     RestorationPlane {
1258       cfg: RestorationPlaneConfig {
1259         lrf_type,
1260         unit_size,
1261         sb_h_shift,
1262         sb_v_shift,
1263         sb_cols,
1264         sb_rows,
1265         stripe_height,
1266         cols,
1267         rows,
1268       },
1269       units: FrameRestorationUnits::new(cols, rows),
1270     }
1271   }
1272 
1273   // Stripes are always 64 pixels high in a non-subsampled
1274   // frame, and decimated from 64 pixels in chroma.  When
1275   // filtering, they are not co-located on Y with superblocks.
restoration_unit_index_by_stripe( &self, stripenum: usize, rux: usize, ) -> (usize, usize)1276   fn restoration_unit_index_by_stripe(
1277     &self, stripenum: usize, rux: usize,
1278   ) -> (usize, usize) {
1279     (
1280       cmp::min(rux, self.cfg.cols - 1),
1281       cmp::min(
1282         stripenum * self.cfg.stripe_height / self.cfg.unit_size,
1283         self.cfg.rows - 1,
1284       ),
1285     )
1286   }
1287 
restoration_unit_by_stripe( &self, stripenum: usize, rux: usize, ) -> &RestorationUnit1288   pub fn restoration_unit_by_stripe(
1289     &self, stripenum: usize, rux: usize,
1290   ) -> &RestorationUnit {
1291     let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux);
1292     &self.units[y][x]
1293   }
1294 }
1295 
/// Loop-restoration state for a whole frame: one RestorationPlane per
/// plane (Y, U, V).
#[derive(Clone, Debug)]
pub struct RestorationState {
  pub planes: [RestorationPlane; MAX_PLANES],
}
1300 
impl RestorationState {
  /// Builds the per-plane restoration-unit grids for a frame, choosing
  /// LRU sizes from the quantizer, chroma subsampling and tiling
  /// constraints, then deriving the unit-grid dimensions.
  pub fn new<T: Pixel>(fi: &FrameInvariants<T>, input: &Frame<T>) -> Self {
    let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg;
    // stripe size is decimated in 4:2:0 (and only 4:2:0)
    let stripe_uv_decimate = if xdec > 0 && ydec > 0 { 1 } else { 0 };
    let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 };
    let uv_sb_h_log2 = y_sb_log2 - xdec;
    let uv_sb_v_log2 = y_sb_log2 - ydec;

    let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru
      && fi.sequence.enable_restoration
    {
      assert!(
        fi.width > 1 && fi.height > 1,
        "Width and height must be higher than 1 for LRF setup"
      );

      // Specific content does affect optimal LRU size choice, but the
      // quantizer in use is a surprisingly strong selector.
      let lrf_base_shift = if fi.base_q_idx > 200 {
        0 // big
      } else if fi.base_q_idx > 160 {
        1
      } else {
        2 // small
      };
      let lrf_chroma_shift = if stripe_uv_decimate > 0 {
        // 4:2:0 only
        if lrf_base_shift == 2 {
          1 // smallest chroma LRU is a win at low quant
        } else {
          // Will a down-shifted chroma LRU eliminate stretch in chroma?
          // If so, that's generally a win.
          let lrf_unit_size =
            1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift);
          let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size
            <= lrf_unit_size / 2
            || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2;
          let shifted_stretch = ((fi.width >> xdec) - 1)
            % (lrf_unit_size >> 1)
            <= lrf_unit_size / 4
            || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1)
              <= lrf_unit_size / 4;
          if unshifted_stretch && !shifted_stretch {
            1 // shift to eliminate stretch
          } else {
            0 // don't shift; save the signaling bits
          }
        }
      } else {
        0
      };
      (lrf_base_shift, lrf_base_shift + lrf_chroma_shift)
    } else {
      // Explicit request to tie LRU size to superblock size ==
      // smallest possible LRU size
      let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 };
      (lrf_y_shift, lrf_y_shift + stripe_uv_decimate)
    };

    let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift);
    let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift);

    let tiling = fi.sequence.tiling;
    // Right now we defer to tiling setup: don't choose an LRU size
    // large enough that a tile is not an integer number of LRUs
    // wide/high.
    if tiling.cols > 1 || tiling.rows > 1 {
      // despite suggestions to the contrary, tiles can be
      // non-powers-of-2.
      let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize;
      let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize;
      let tile_aligned_y_unit_size =
        1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros));
      let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros);
      let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros);
      y_unit_size = y_unit_size.min(tile_aligned_y_unit_size);
      uv_unit_size = uv_unit_size
        .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size));

      // But it's actually worse: LRUs can't span tiles (in our
      // one-pass design that is, spec allows it).  However, the spec
      // mandates the last LRU stretches forward into any
      // less-than-half-LRU span of superblocks at the right and
      // bottom of a frame.  These superblocks may well be in a
      // different tile!  Even if LRUs are minimum size (one
      // superblock), when the right or bottom edge of the frame is a
      // superblock that's less than half the width/height of a normal
      // superblock, the LRU is forced by the spec to span into it
      // (and thus a different tile).  Tiling is under no such
      // restriction; it could decide the right/left sliver will be in
      // its own tile row/column.  We can't disallow the combination
      // here.  The tiling code will have to either prevent it or
      // tolerate it.  (prayer mechanic == Issue #1629).
    }

    // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes
    // to be the same*. If they differ at this
    // point, it's due to a tiling restriction enforcing a maximum
    // size, so force both to the smaller value.
    //
    // *see sec 5.9.20, "Loop restoration params syntax".  The
    // bitstream provides means of coding a different UV LRU size only
    // when chroma is in use and both x and y are subsampled in the
    // chroma planes.
    if ydec == 0 && y_unit_size != uv_unit_size {
      y_unit_size = uv_unit_size.min(y_unit_size);
      uv_unit_size = y_unit_size;
    }

    // derive the rest
    let y_unit_log2 = y_unit_size.ilog() - 1;
    let uv_unit_log2 = uv_unit_size.ilog() - 1;
    // grid dimensions round to nearest: a trailing partial unit narrower
    // than half a unit merges into (stretches) the previous one
    let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1);
    let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1);
    let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);
    let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);

    RestorationState {
      planes: [
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          y_unit_size,
          y_unit_log2 - y_sb_log2,
          y_unit_log2 - y_sb_log2,
          fi.sb_width,
          fi.sb_height,
          0,
          y_cols,
          y_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
      ],
    }
  }

  /// Applies the chosen restoration filters (Wiener / sgrproj / none) to
  /// `out` in place, stripe by stripe and unit by unit. `pre_cdef`
  /// supplies the deblocked (pre-cdef) pixels needed at stripe borders;
  /// a clone of `out` serves as the unmodified cdef'd source.
  #[hawktracer(lrf_filter_frame)]
  pub fn lrf_filter_frame<T: Pixel>(
    &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
    fi: &FrameInvariants<T>,
  ) {
    let cdeffed = out.clone();
    let planes =
      if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES };

    // unlike the other loop filters that operate over the padded
    // frame dimensions, restoration filtering and source pixel
    // accesses are clipped to the original frame dimensions
    // that's why we use fi.width and fi.height instead of PlaneConfig fields

    // number of stripes (counted according to colocated Y luma position)
    let stripe_n = (fi.height + 7) / 64 + 1;

    // Buffers for the stripe filter.
    let mut stripe_filter_buffer =
      IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE);

    for pli in 0..planes {
      let rp = &self.planes[pli];
      let xdec = out.planes[pli].cfg.xdec;
      let ydec = out.planes[pli].cfg.ydec;
      let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec;
      let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec;

      for si in 0..stripe_n {
        // First stripe is 8 luma rows shorter because stripes are offset
        // 8 rows above the frame top.
        let (stripe_start_y, stripe_size) = if si == 0 {
          (0, (64 - 8) >> ydec)
        } else {
          let start = (si * 64 - 8) >> ydec;
          (
            start as isize,
            // one past, unlike spec
            (64 >> ydec).min(crop_h - start),
          )
        };

        // horizontally, go rdu-by-rdu
        for rux in 0..rp.cfg.cols {
          // stripe x pixel locations must be clipped to frame, last may need to stretch
          let x = rux * rp.cfg.unit_size;
          let size =
            if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size };
          let ru = rp.restoration_unit_by_stripe(si, rux);
          match ru.filter {
            RestorationFilter::Wiener { coeffs } => {
              wiener_stripe_filter(
                coeffs,
                fi,
                crop_w,
                crop_h,
                size,
                stripe_size,
                x,
                stripe_start_y,
                &cdeffed.planes[pli],
                &pre_cdef.planes[pli],
                &mut out.planes[pli],
              );
            }
            RestorationFilter::Sgrproj { set, xqd } => {
              // NOTE(review): sgrproj is skipped entirely when cdef is
              // disabled — presumably RDO never selects it in that case;
              // confirm against the RDO path.
              if !fi.sequence.enable_cdef {
                continue;
              }

              setup_integral_image(
                &mut stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                crop_w - x,
                (crop_h as isize - stripe_start_y) as usize,
                size,
                stripe_size,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &pre_cdef.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
              );

              sgrproj_stripe_filter(
                set,
                xqd,
                fi,
                &stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &mut out.planes[pli].region_mut(Area::Rect {
                  x: x as isize,
                  y: stripe_start_y,
                  width: size,
                  height: stripe_size,
                }),
              );
            }
            RestorationFilter::None => {
              // do nothing
            }
          }
        }
      }
    }
  }
}
1570