1 // Copyright (c) 2017-2021, The rav1e contributors. All rights reserved
2 //
3 // This source code is subject to the terms of the BSD 2 Clause License and
4 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5 // was not distributed with this source code in the LICENSE file, you can
6 // obtain it at www.aomedia.org/license/software. If the Alliance for Open
7 // Media Patent License 1.0 was not distributed with this source code in the
8 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10 cfg_if::cfg_if! {
11 if #[cfg(nasm_x86_64)] {
12 use crate::asm::x86::lrf::*;
13 } else {
14 use self::rust::*;
15 }
16 }
17
18 use crate::color::ChromaSampling::Cs400;
19 use crate::context::{MAX_PLANES, SB_SIZE};
20 use crate::encoder::FrameInvariants;
21 use crate::frame::{
22 AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice,
23 };
24 use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect};
25 use crate::util::{clamp, CastFromPrimitive, ILog, Pixel};
26 use rust_hawktracer::*;
27
28 use crate::api::SGRComplexityLevel;
29 use std::cmp;
30 use std::iter::FusedIterator;
31 use std::ops::{Index, IndexMut};
32
/// Log2 of the maximum loop-restoration unit (tile) size.
pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8;

// Numeric codes for the selectable restoration filter types.
pub const RESTORE_NONE: u8 = 0;
pub const RESTORE_SWITCHABLE: u8 = 1;
pub const RESTORE_WIENER: u8 = 2;
pub const RESTORE_SGRPROJ: u8 = 3;

// Per-tap lower bound / default / upper bound for the Wiener filter taps.
pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17];
pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15];
pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46];
#[allow(unused)]
pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3];
pub const WIENER_BITS: usize = 7;

// Search bounds and defaults for the two sgrproj projection coefficients
// (xqd), followed by the fixed-point bit widths used throughout the
// self-guided restoration arithmetic below.
pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32];
pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31];
pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95];
pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4;
pub const SGRPROJ_PRJ_BITS: u8 = 7;
pub const SGRPROJ_PARAMS_BITS: u8 = 4;
pub const SGRPROJ_MTABLE_BITS: u8 = 20;
pub const SGRPROJ_SGR_BITS: u8 = 8;
pub const SGRPROJ_RECIP_BITS: u8 = 12;
pub const SGRPROJ_RST_BITS: u8 = 4;
/// The (s0, s1) strength parameters for each of the 16 selectable sgrproj
/// parameter sets. s0 drives the radius-2 box filter pass and s1 the
/// radius-1 pass; a value of 0 disables that pass entirely (see the
/// `s_r2 > 0` / `s_r1 > 0` checks in `sgrproj_stripe_filter`).
pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [
  [140, 3236],
  [112, 2158],
  [93, 1618],
  [80, 1438],
  [70, 1295],
  [58, 1177],
  [47, 1079],
  [37, 996],
  [30, 925],
  [25, 863],
  [0, 2589],
  [0, 1618],
  [0, 1177],
  [0, 925],
  [56, 0],
  [22, 0],
];
75
// Lists of indices into SGRPROJ_PARAMS_S that are searched at a given
// complexity level. SGRPROJ_ALL_SETS contains every possible index.
const SGRPROJ_ALL_SETS: &[u8] =
  &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
// SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives
// most of the gains from sgr. The decision of which values to use is somewhat
// arbitrary. The sgr parameters have 3 discontinuous groups. The first has
// both parameters as non-zero. The other two are distinguishable by which of
// the two parameters is zero. There is an even number of entries in each of
// these groups and the non-zero parameters grow as the indices increase.
// This array uses the 1st, 3rd, ... smallest params of each group.
const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15];
88
get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8]89 pub fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] {
90 match complexity {
91 SGRComplexityLevel::Full => SGRPROJ_ALL_SETS,
92 SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS,
93 }
94 }
95
// Buffer dimensions for sgrproj_solve(). The stride adds 6 + 2 extra
// columns of support/padding for the box-filter windows.
pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2;
pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2;
pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE;
pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT;

// Buffer dimensions for sgrproj_stripe_filter(). The height allows for
// 64 rows plus the same 6 + 2 rows of support/padding.
pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2)
  + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1));
pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2;
pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2;
pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT;

// Compile-time max of the two widths: indexing a two-element array with
// the comparison result (0 or 1) selects the larger value in const context.
pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX]
  [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize];
109
/// Scratch storage for the integral image (2-D running sums) and the
/// squared-sample integral image consumed by `sgrproj_stripe_filter()`
/// and `sgrproj_solve()`.
#[derive(Debug)]
pub struct IntegralImageBuffer {
  pub integral_image: Vec<u32>,
  pub sq_integral_image: Vec<u32>,
}

impl IntegralImageBuffer {
  /// Allocates both integral-image buffers with `size` elements each,
  /// every element initialized to zero.
  #[inline]
  pub fn zeroed(size: usize) -> Self {
    let integral_image = vec![0; size];
    let sq_integral_image = vec![0; size];
    IntegralImageBuffer { integral_image, sq_integral_image }
  }
}
124
#[allow(unused)] // Wiener coming soon!
/// One restoration filter choice for a restoration unit: no filtering,
/// a Wiener filter (two half-filters of 3 taps each), or a self-guided
/// (sgrproj) filter identified by its parameter-set index and the two
/// projection coefficients.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RestorationFilter {
  None,
  Wiener { coeffs: [[i8; 3]; 2] },
  Sgrproj { set: u8, xqd: [i8; 2] },
}

impl Default for RestorationFilter {
  fn default() -> RestorationFilter {
    RestorationFilter::None
  }
}

impl RestorationFilter {
  /// Returns true when `self` and `cmp` differ in variant or in any field.
  ///
  /// The enum derives `PartialEq`/`Eq`, whose generated comparison is
  /// exactly the variant-and-field-wise equality this method needs, so
  /// we simply negate it instead of comparing every field by hand.
  pub fn notequal(self, cmp: RestorationFilter) -> bool {
    self != cmp
  }
}
165
pub(crate) mod rust {
  //! Scalar fallback implementations of the sgrproj box-filter kernels,
  //! used when no platform-specific (assembly) versions are selected at
  //! the top of this file.

  use crate::cpu_features::CpuFeatureLevel;
  use crate::frame::PlaneSlice;
  use crate::lrf::{
    get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS,
    SGRPROJ_SGR_BITS,
  };
  use crate::util::CastFromPrimitive;
  use crate::Pixel;

  /// Computes one row of the (a, b) intermediate coefficients for a box
  /// filter of radius `r`, for columns `start_x..stripe_w + 2`, reading
  /// window sums from the two integral images.
  #[inline(always)]
  pub(crate) fn sgrproj_box_ab_internal(
    r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32,
    bdm8: usize,
  ) {
    // d x d window centered on the pixel; n is the window's pixel count.
    let d: usize = r * 2 + 1;
    let n: usize = d * d;
    // Fixed-point reciprocal of n at SGRPROJ_RECIP_BITS precision:
    // 455 ~= (1 << 12) / 9 for r == 1, 164 ~= (1 << 12) / 25 for r == 2.
    let one_over_n = if r == 1 { 455 } else { 164 };

    for x in start_x..stripe_w + 2 {
      let sum = get_integral_square(iimg, iimg_stride, x, y, d);
      let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d);
      let (reta, retb) =
        sgrproj_sum_finish(ssq, sum, n as u32, one_over_n, s, bdm8);
      af[x] = reta;
      bf[x] = retb;
    }
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r1(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal(
      1,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
  }

  // computes an intermediate (ab) row for stripe_w + 2 columns at row y
  pub(crate) fn sgrproj_box_ab_r2(
    af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
    iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_ab_internal(
      2,
      af,
      bf,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
  }

  /// Radius-0 "filter": copies row `y` of the source, scaled up to
  /// SGRPROJ_RST_BITS fixed-point precision, with no smoothing.
  pub(crate) fn sgrproj_box_f_r0<T: Pixel>(
    f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
    _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r0_internal<T: Pixel>(
    f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let line = cdeffed.row(y);
    for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) {
      *fp = u32::cast_from(v) << SGRPROJ_RST_BITS;
    }
  }

  /// Radius-1 filtered output for row `y`, combining the three (a, b)
  /// rows around it (`af`/`bf` index 0..3 = rows y-1, y, y+1).
  pub(crate) fn sgrproj_box_f_r1<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
    cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r1_internal<T: Pixel>(
    af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize,
    y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    for x in start_x..w {
      // 3x3 neighborhood of (a, b), weighted 3 at the corners and 4 at
      // the edges/center; total weight 4*3 + 5*4 = 32 = 1 << 5, which is
      // folded into `shift` above.
      let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2])
        + 4
          * (af[1][x]
            + af[0][x + 1]
            + af[1][x + 1]
            + af[2][x + 1]
            + af[1][x + 2]);
      let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2])
        + 4
          * (bf[1][x]
            + bf[0][x + 1]
            + bf[1][x + 1]
            + bf[2][x + 1]
            + bf[1][x + 2]);
      let v = a * u32::cast_from(line[x]) + b;
      f[x] = (v + (1 << shift >> 1)) >> shift;
    }
  }

  /// Radius-2 filtered output for rows `y` and `y + 1` at once, using the
  /// two even-row (a, b) buffers (`af`/`bf` index 0 = row y, 1 = row y+2).
  pub(crate) fn sgrproj_box_f_r2<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    y: usize, w: usize, cdeffed: &PlaneSlice<T>, _cpu: CpuFeatureLevel,
  ) {
    sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed);
  }

  #[inline(always)]
  pub(crate) fn sgrproj_box_f_r2_internal<T: Pixel>(
    af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
    start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  ) {
    // Horizontal weights 5/6/5 sum to 16; the even-row output combines
    // two (a, b) rows (total weight 32 -> shift of 5), while the odd-row
    // output uses one row (total weight 16 -> shifto of 4).
    let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS;
    let line = cdeffed.row(y);
    let line1 = cdeffed.row(y + 1);

    // 3-wide sliding windows over both (a, b) rows.
    let af0 = af[0][start_x..w + 3].windows(3);
    let af1 = af[1][start_x..w + 3].windows(3);
    let bf0 = bf[0][start_x..w + 3].windows(3);
    let bf1 = bf[1][start_x..w + 3].windows(3);

    let af_it = af0.zip(af1);
    let bf_it = bf0.zip(bf1);

    let in0 = line[start_x..w].iter();
    let in1 = line1[start_x..w].iter();

    let o0 = f0[start_x..w].iter_mut();
    let o1 = f1[start_x..w].iter_mut();

    let in_iter = in0.zip(in1);
    let out_iter = o0.zip(o1);

    let io_iter = out_iter.zip(in_iter);

    for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in
      io_iter.zip(af_it.zip(bf_it))
    {
      let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1];
      let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1];
      let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1];
      let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1];
      let v = (a + ao) * u32::cast_from(p0) + b + bo;
      *o0 = (v + (1 << shift >> 1)) >> shift;
      let vo = ao * u32::cast_from(p1) + bo;
      *o1 = (vo + (1 << shifto >> 1)) >> shifto;
    }
  }
}
339
/// Converts a window's sum and sum-of-squares into the sgrproj (a, b)
/// coefficients, given the window's pixel count `n`, its fixed-point
/// reciprocal `one_over_n` (SGRPROJ_RECIP_BITS precision), the strength
/// parameter `s`, and `bdm8` = bit depth - 8.
#[inline(always)]
fn sgrproj_sum_finish(
  ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32, bdm8: usize,
) -> (u32, u32) {
  // Scale both sums down to 8-bit-sample precision, with rounding.
  let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8);
  let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8;
  // p is proportional to the window's variance, clamped to be non-negative.
  let p =
    cmp::max(0, (scaled_ssq * n) as i32 - (scaled_sum * scaled_sum) as i32)
      as u32;
  // z scales the variance term by the strength parameter s.
  let z = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS;
  // a approximates (1 << SGRPROJ_SGR_BITS) * z / (z + 1), with the
  // endpoints pinned at 1 (flat area) and 256 (high variance).
  let a = if z >= 255 {
    256
  } else if z == 0 {
    1
  } else {
    ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1)
  };
  // b is the complementary weight applied to the window mean
  // (sum * one_over_n), rounded down from SGRPROJ_RECIP_BITS precision.
  let b = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n;
  (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS)
}
360
/// Computes the sum over a `size` x `size` square whose top-left corner
/// is at `(x, y)`, via the standard four-corner lookup on an integral
/// image with row stride `stride`.
fn get_integral_square(
  iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> u32 {
  let top_left = iimg[y * stride + x];
  let top_right = iimg[y * stride + x + size];
  let bot_left = iimg[(y + size) * stride + x];
  let bot_right = iimg[(y + size) * stride + x + size];
  // The integral image is built with wrapping adds, so any overflow
  // cancels out here as long as the same wrapping arithmetic is used.
  top_left
    .wrapping_add(bot_right)
    .wrapping_sub(bot_left)
    .wrapping_sub(top_right)
}
371
/// Row iterator used when building an integral image for one stripe:
/// yields each needed row, choosing between the cdeffed and deblocked
/// planes and padding vertically by repeating edge rows.
struct VertPaddedIter<'a, T: Pixel> {
  // The two sources that can be selected when clipping
  deblocked: &'a Plane<T>,
  cdeffed: &'a Plane<T>,
  // x index choosing where on the row to start
  x: isize,
  // y index that will be mutated
  y: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: isize,
  // Used for source buffer choice/clipping. May (and regularly will)
  // be negative.
  stripe_begin: isize,
  // Also used for source buffer choice/clipping. May specify a stripe boundary
  // less than, equal to, or larger than the buffers we're accessing.
  stripe_end: isize,
  // Active area cropping is done by specifying a value smaller than the height
  // of the plane.
  crop: isize,
}
392
393 impl<'a, 'b, T: Pixel> VertPaddedIter<'a, T> {
new( cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>, stripe_h: usize, crop: usize, ) -> VertPaddedIter<'a, T>394 fn new(
395 cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>,
396 stripe_h: usize, crop: usize,
397 ) -> VertPaddedIter<'a, T> {
398 // cdeffed and deblocked must start at the same coordinates from their
399 // underlying planes. Since cropping is provided via a separate params, the
400 // height of the underlying planes do not need to match.
401 assert_eq!(cdeffed.x, deblocked.x);
402 assert_eq!(cdeffed.y, deblocked.y);
403
404 // To share integral images, always use the max box filter radius of 2
405 let r = 2;
406
407 // The number of rows outside the stripe are needed
408 let rows_above = r + 2;
409 let rows_below = 2;
410
411 // Offset crop and stripe_h so they are relative to the underlying plane
412 // and not the plane slice.
413 let crop = crop as isize + deblocked.y;
414 let stripe_end = stripe_h as isize + deblocked.y;
415
416 // Move y up the number rows above.
417 // If y is negative we repeat the first row
418 let y = deblocked.y - rows_above as isize;
419
420 VertPaddedIter {
421 deblocked: deblocked.plane,
422 cdeffed: cdeffed.plane,
423 x: deblocked.x,
424 y,
425 end: (rows_above + stripe_h + rows_below) as isize + y,
426 stripe_begin: deblocked.y,
427 stripe_end,
428 crop,
429 }
430 }
431 }
432
433 impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> {
434 type Item = &'a [T];
435
436 #[inline(always)]
next(&mut self) -> Option<Self::Item>437 fn next(&mut self) -> Option<Self::Item> {
438 if self.end > self.y {
439 // clamp before deciding the source
440 // clamp vertically to storage at top and passed-in height at bottom
441 let cropped_y = clamp(self.y, 0, self.crop - 1);
442 // clamp vertically to stripe limits
443 let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1);
444
445 // decide if we're vertically inside or outside the strip
446 let src_plane =
447 if ly >= self.stripe_begin && ly < self.stripe_end as isize {
448 self.cdeffed
449 } else {
450 self.deblocked
451 };
452 // cannot directly return self.ps.row(row) due to lifetime issue
453 let range = src_plane.row_range(self.x, ly);
454 self.y += 1;
455 Some(&src_plane.data[range])
456 } else {
457 None
458 }
459 }
460
size_hint(&self) -> (usize, Option<usize>)461 fn size_hint(&self) -> (usize, Option<usize>) {
462 let remaining = self.end - self.y;
463 debug_assert!(remaining >= 0);
464 let remaining = remaining as usize;
465
466 (remaining, Some(remaining))
467 }
468 }
469
// size_hint() above is exact and next() never resumes after None, so the
// marker traits can be implemented with empty bodies.
impl<T: Pixel> ExactSizeIterator for VertPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for VertPaddedIter<'_, T> {}
472
/// Element iterator over one row that pads horizontally by repeating the
/// first/last element when indexing outside the slice.
struct HorzPaddedIter<'a, T: Pixel> {
  // Active area cropping is done using the length of the slice
  slice: &'a [T],
  // x index of the iterator
  // When less than 0, repeat the first element. When greater than end, repeat
  // the last element
  index: isize,
  // The index at which to terminate. Can be larger than the slice length.
  end: usize,
}
483
484 impl<'a, T: Pixel> HorzPaddedIter<'a, T> {
new( slice: &'a [T], start_index: isize, width: usize, ) -> HorzPaddedIter<'a, T>485 fn new(
486 slice: &'a [T], start_index: isize, width: usize,
487 ) -> HorzPaddedIter<'a, T> {
488 HorzPaddedIter {
489 slice,
490 index: start_index,
491 end: (width as isize + start_index) as usize,
492 }
493 }
494 }
495
496 impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> {
497 type Item = &'a T;
498
499 #[inline(always)]
next(&mut self) -> Option<Self::Item>500 fn next(&mut self) -> Option<Self::Item> {
501 if self.index < self.end as isize {
502 // clamp to the edges of the frame
503 let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize;
504 self.index += 1;
505 Some(&self.slice[x])
506 } else {
507 None
508 }
509 }
510
511 #[inline(always)]
size_hint(&self) -> (usize, Option<usize>)512 fn size_hint(&self) -> (usize, Option<usize>) {
513 let size: usize = (self.end as isize - self.index) as usize;
514 (size, Some(size))
515 }
516 }
517
// size_hint() above is exact and next() never resumes after None, so the
// marker traits can be implemented with empty bodies.
impl<T: Pixel> ExactSizeIterator for HorzPaddedIter<'_, T> {}
impl<T: Pixel> FusedIterator for HorzPaddedIter<'_, T> {}
520
setup_integral_image<T: Pixel>( integral_image_buffer: &mut IntegralImageBuffer, integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>, )521 pub fn setup_integral_image<T: Pixel>(
522 integral_image_buffer: &mut IntegralImageBuffer,
523 integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize,
524 stripe_h: usize, cdeffed: &PlaneSlice<T>, deblocked: &PlaneSlice<T>,
525 ) {
526 let integral_image = &mut integral_image_buffer.integral_image;
527 let sq_integral_image = &mut integral_image_buffer.sq_integral_image;
528
529 // Number of elements outside the stripe
530 let left_w = 4; // max radius of 2 + 2 padding
531 let right_w = 3; // max radius of 2 + 1 padding
532
533 assert_eq!(cdeffed.x, deblocked.x);
534
535 // Find how many unique elements to use to the left and right
536 let left_uniques = if cdeffed.x == 0 { 0 } else { left_w };
537 let right_uniques = right_w.min(crop_w - stripe_w);
538
539 // Find the total number of unique elements used
540 let row_uniques = left_uniques + stripe_w + right_uniques;
541
542 // Negative start indices result in repeating the first element of the row
543 let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 };
544
545 let mut rows_iter = VertPaddedIter::new(
546 // Move left to encompass all the used data
547 &cdeffed.go_left(left_uniques),
548 &deblocked.go_left(left_uniques),
549 // since r2 uses every other row, we need an extra row if stripe_h is odd
550 stripe_h + (stripe_h & 1),
551 crop_h,
552 )
553 .map(|row: &[T]| {
554 HorzPaddedIter::new(
555 // Limit how many unique elements we use
556 &row[..row_uniques],
557 start_index_x,
558 left_w + stripe_w + right_w,
559 )
560 });
561
562 // Setup the first row
563 {
564 let mut sum: u32 = 0;
565 let mut sq_sum: u32 = 0;
566 // Remove the first row and use it outside of the main loop
567 let row = rows_iter.next().unwrap();
568 for (src, (integral, sq_integral)) in
569 row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut()))
570 {
571 let current = u32::cast_from(*src);
572
573 // Wrap adds to prevent undefined behaviour on overflow. Overflow is
574 // cancelled out when calculating the sum of a region.
575 sum = sum.wrapping_add(current);
576 *integral = sum;
577 sq_sum = sq_sum.wrapping_add(current * current);
578 *sq_integral = sq_sum;
579 }
580 }
581 // Calculate all other rows
582 let mut integral_slice = &mut integral_image[..];
583 let mut sq_integral_slice = &mut sq_integral_image[..];
584 for row in rows_iter {
585 let mut sum: u32 = 0;
586 let mut sq_sum: u32 = 0;
587
588 // Split the data between the previous row and future rows.
589 // This allows us to mutate the current row while accessing the
590 // previous row.
591 let (integral_row_prev, integral_row) =
592 integral_slice.split_at_mut(integral_image_stride);
593 let (sq_integral_row_prev, sq_integral_row) =
594 sq_integral_slice.split_at_mut(integral_image_stride);
595 for (
596 src,
597 ((integral_above, sq_integral_above), (integral, sq_integral)),
598 ) in row.zip(
599 integral_row_prev
600 .iter()
601 .zip(sq_integral_row_prev.iter())
602 .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())),
603 ) {
604 let current = u32::cast_from(*src);
605 // Wrap adds to prevent undefined behaviour on overflow. Overflow is
606 // cancelled out when calculating the sum of a region.
607 sum = sum.wrapping_add(current);
608 *integral = sum.wrapping_add(*integral_above);
609 sq_sum = sq_sum.wrapping_add(current * current);
610 *sq_integral = sq_sum.wrapping_add(*sq_integral_above);
611 }
612
613 // The current row also contains all future rows. Replacing the slice with
614 // it moves down a row.
615 integral_slice = integral_row;
616 sq_integral_slice = sq_integral_row;
617 }
618 }
619
sgrproj_stripe_filter<T: Pixel, U: Pixel>( set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>, )620 pub fn sgrproj_stripe_filter<T: Pixel, U: Pixel>(
621 set: u8, xqd: [i8; 2], fi: &FrameInvariants<T>,
622 integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize,
623 cdeffed: &PlaneSlice<U>, out: &mut PlaneRegionMut<U>,
624 ) {
625 let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect();
626 let bdm8 = fi.sequence.bit_depth - 8;
627 let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
628 [[0; IMAGE_WIDTH_MAX + 2]; 2];
629 let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
630 [[0; IMAGE_WIDTH_MAX + 2]; 2];
631 let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
632 let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
633 let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
634 [[0; IMAGE_WIDTH_MAX + 2]; 3];
635 let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
636 [[0; IMAGE_WIDTH_MAX + 2]; 3];
637 let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
638
639 let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
640 let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
641
642 /* prime the intermediate arrays */
643 // One oddness about the radius=2 intermediate array computations that
644 // the spec doesn't make clear: Although the spec defines computation
645 // of every row (of a, b and f), only half of the rows (every-other
646 // row) are actually used.
647 let integral_image = &integral_image_buffer.integral_image;
648 let sq_integral_image = &integral_image_buffer.sq_integral_image;
649 if s_r2 > 0 {
650 sgrproj_box_ab_r2(
651 &mut a_r2[0],
652 &mut b_r2[0],
653 integral_image,
654 sq_integral_image,
655 integral_image_stride,
656 0,
657 stripe_w,
658 s_r2,
659 bdm8,
660 fi.cpu_feature_level,
661 );
662 }
663 if s_r1 > 0 {
664 let integral_image_offset = integral_image_stride + 1;
665 sgrproj_box_ab_r1(
666 &mut a_r1[0],
667 &mut b_r1[0],
668 &integral_image[integral_image_offset..],
669 &sq_integral_image[integral_image_offset..],
670 integral_image_stride,
671 0,
672 stripe_w,
673 s_r1,
674 bdm8,
675 fi.cpu_feature_level,
676 );
677 sgrproj_box_ab_r1(
678 &mut a_r1[1],
679 &mut b_r1[1],
680 &integral_image[integral_image_offset..],
681 &sq_integral_image[integral_image_offset..],
682 integral_image_stride,
683 1,
684 stripe_w,
685 s_r1,
686 bdm8,
687 fi.cpu_feature_level,
688 );
689 }
690
691 /* iterate by row */
692 // Increment by two to handle the use of even rows by r=2 and run a nested
693 // loop to handle increments of one.
694 for y in (0..stripe_h).step_by(2) {
695 // get results to use y and y+1
696 let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 {
697 sgrproj_box_ab_r2(
698 &mut a_r2[(y / 2 + 1) % 2],
699 &mut b_r2[(y / 2 + 1) % 2],
700 integral_image,
701 sq_integral_image,
702 integral_image_stride,
703 y + 2,
704 stripe_w,
705 s_r2,
706 bdm8,
707 fi.cpu_feature_level,
708 );
709 let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
710 let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
711 sgrproj_box_f_r2(
712 &ap0,
713 &bp0,
714 &mut f_r2_0,
715 &mut f_r2_1,
716 y,
717 stripe_w,
718 cdeffed,
719 fi.cpu_feature_level,
720 );
721 [&f_r2_0, &f_r2_1]
722 } else {
723 sgrproj_box_f_r0(
724 &mut f_r2_0,
725 y,
726 stripe_w,
727 cdeffed,
728 fi.cpu_feature_level,
729 );
730 // share results for both rows
731 [&f_r2_0, &f_r2_0]
732 };
733 for dy in 0..(2.min(stripe_h - y)) {
734 let y = y + dy;
735 if s_r1 > 0 {
736 let integral_image_offset = integral_image_stride + 1;
737 sgrproj_box_ab_r1(
738 &mut a_r1[(y + 2) % 3],
739 &mut b_r1[(y + 2) % 3],
740 &integral_image[integral_image_offset..],
741 &sq_integral_image[integral_image_offset..],
742 integral_image_stride,
743 y + 2,
744 stripe_w,
745 s_r1,
746 bdm8,
747 fi.cpu_feature_level,
748 );
749 let ap1: [&[u32]; 3] =
750 [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
751 let bp1: [&[u32]; 3] =
752 [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
753 sgrproj_box_f_r1(
754 &ap1,
755 &bp1,
756 &mut f_r1,
757 y,
758 stripe_w,
759 cdeffed,
760 fi.cpu_feature_level,
761 );
762 } else {
763 sgrproj_box_f_r0(
764 &mut f_r1,
765 y,
766 stripe_w,
767 cdeffed,
768 fi.cpu_feature_level,
769 );
770 }
771
772 /* apply filter */
773 let w0 = xqd[0] as i32;
774 let w1 = xqd[1] as i32;
775 let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1;
776
777 let line = &cdeffed[y];
778
779 #[inline(always)]
apply_filter<U: Pixel>( out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32], stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32, )780 fn apply_filter<U: Pixel>(
781 out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32],
782 stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32,
783 ) {
784 let line_it = line[..stripe_w].iter();
785 let f_r2_ab_it = f_r2_ab[..stripe_w].iter();
786 let f_r1_it = f_r1[..stripe_w].iter();
787 let out_it = out[..stripe_w].iter_mut();
788
789 for ((o, &u), (&f_r2_ab, &f_r1)) in
790 out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it))
791 {
792 let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
793 let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32;
794 let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1))
795 >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
796 *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1));
797 }
798 }
799
800 apply_filter(
801 &mut out[y],
802 line,
803 &f_r1,
804 f_r2_ab[dy],
805 stripe_w,
806 fi.sequence.bit_depth,
807 w0,
808 w1,
809 w2,
810 );
811 }
812 }
813 }
814
815 // Frame inputs below aren't all equal, and will change as work
816 // continues. There's no deblocked reconstruction available at this
817 // point of RDO, so we use the non-deblocked reconstruction, cdef and
818 // input. The input can be a full-sized frame. Cdef input is a partial
819 // frame constructed specifically for RDO.
820
821 // For simplicity, this ignores stripe segmentation (it's possible the
822 // extra complexity isn't worth it and we'll ignore stripes
823 // permanently during RDO, but that's not been tested yet). Data
824 // access inside the cdef frame is monolithic and clipped to the cdef
825 // borders.
826
827 // Input params follow the same rules as sgrproj_stripe_filter.
828 // Inputs are relative to the colocated slice views.
sgrproj_solve<T: Pixel>( set: u8, fi: &FrameInvariants<T>, integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>, cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize, ) -> (i8, i8)829 pub fn sgrproj_solve<T: Pixel>(
830 set: u8, fi: &FrameInvariants<T>,
831 integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>,
832 cdeffed: &PlaneSlice<T>, cdef_w: usize, cdef_h: usize,
833 ) -> (i8, i8) {
834 let bdm8 = fi.sequence.bit_depth - 8;
835
836 let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
837 [[0; IMAGE_WIDTH_MAX + 2]; 2];
838 let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] =
839 [[0; IMAGE_WIDTH_MAX + 2]; 2];
840 let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
841 let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
842 let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
843 [[0; IMAGE_WIDTH_MAX + 2]; 3];
844 let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] =
845 [[0; IMAGE_WIDTH_MAX + 2]; 3];
846 let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX];
847
848 let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0];
849 let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1];
850
851 let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]];
852 let mut c: [f64; 2] = [0., 0.];
853
854 /* prime the intermediate arrays */
855 // One oddness about the radius=2 intermediate array computations that
856 // the spec doesn't make clear: Although the spec defines computation
857 // of every row (of a, b and f), only half of the rows (every-other
858 // row) are actually used.
859 let integral_image = &integral_image_buffer.integral_image;
860 let sq_integral_image = &integral_image_buffer.sq_integral_image;
861 if s_r2 > 0 {
862 sgrproj_box_ab_r2(
863 &mut a_r2[0],
864 &mut b_r2[0],
865 integral_image,
866 sq_integral_image,
867 SOLVE_IMAGE_STRIDE,
868 0,
869 cdef_w,
870 s_r2,
871 bdm8,
872 fi.cpu_feature_level,
873 );
874 }
875 if s_r1 > 0 {
876 let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
877 sgrproj_box_ab_r1(
878 &mut a_r1[0],
879 &mut b_r1[0],
880 &integral_image[integral_image_offset..],
881 &sq_integral_image[integral_image_offset..],
882 SOLVE_IMAGE_STRIDE,
883 0,
884 cdef_w,
885 s_r1,
886 bdm8,
887 fi.cpu_feature_level,
888 );
889 sgrproj_box_ab_r1(
890 &mut a_r1[1],
891 &mut b_r1[1],
892 &integral_image[integral_image_offset..],
893 &sq_integral_image[integral_image_offset..],
894 SOLVE_IMAGE_STRIDE,
895 1,
896 cdef_w,
897 s_r1,
898 bdm8,
899 fi.cpu_feature_level,
900 );
901 }
902
903 /* iterate by row */
904 // Increment by two to handle the use of even rows by r=2 and run a nested
905 // loop to handle increments of one.
906 for y in (0..cdef_h).step_by(2) {
907 // get results to use y and y+1
908 let f_r2_01: [&[u32]; 2] = if s_r2 > 0 {
909 sgrproj_box_ab_r2(
910 &mut a_r2[(y / 2 + 1) % 2],
911 &mut b_r2[(y / 2 + 1) % 2],
912 integral_image,
913 sq_integral_image,
914 SOLVE_IMAGE_STRIDE,
915 y + 2,
916 cdef_w,
917 s_r2,
918 bdm8,
919 fi.cpu_feature_level,
920 );
921 let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]];
922 let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]];
923 sgrproj_box_f_r2(
924 &ap0,
925 &bp0,
926 &mut f_r2_0,
927 &mut f_r2_1,
928 y,
929 cdef_w,
930 cdeffed,
931 fi.cpu_feature_level,
932 );
933 [&f_r2_0, &f_r2_1]
934 } else {
935 sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level);
936 // share results for both rows
937 [&f_r2_0, &f_r2_0]
938 };
939 for dy in 0..(2.min(cdef_h - y)) {
940 let y = y + dy;
941 if s_r1 > 0 {
942 let integral_image_offset = SOLVE_IMAGE_STRIDE + 1;
943 sgrproj_box_ab_r1(
944 &mut a_r1[(y + 2) % 3],
945 &mut b_r1[(y + 2) % 3],
946 &integral_image[integral_image_offset..],
947 &sq_integral_image[integral_image_offset..],
948 SOLVE_IMAGE_STRIDE,
949 y + 2,
950 cdef_w,
951 s_r1,
952 bdm8,
953 fi.cpu_feature_level,
954 );
955 let ap1: [&[u32]; 3] =
956 [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]];
957 let bp1: [&[u32]; 3] =
958 [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]];
959 sgrproj_box_f_r1(
960 &ap1,
961 &bp1,
962 &mut f_r1,
963 y,
964 cdef_w,
965 cdeffed,
966 fi.cpu_feature_level,
967 );
968 } else {
969 sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level);
970 }
971
972 #[inline(always)]
973 fn process_line<T: Pixel>(
974 h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T],
975 f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize,
976 ) {
977 let cdeffed_it = cdeffed[..cdef_w].iter();
978 let input_it = input[..cdef_w].iter();
979 let f_r2_ab_it = f_r2_ab[..cdef_w].iter();
980 let f_r1_it = f_r1[..cdef_w].iter();
981
982 #[derive(Debug, Copy, Clone)]
983 struct Sums {
984 h: [[i64; 2]; 2],
985 c: [i64; 2],
986 }
987
988 let sums: Sums = cdeffed_it
989 .zip(input_it)
990 .zip(f_r2_ab_it.zip(f_r1_it))
991 .map(|((&u, &i), (&f2, &f1))| {
992 let u = i32::cast_from(u) << SGRPROJ_RST_BITS;
993 let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u;
994 let f2 = f2 as i32 - u;
995 let f1 = f1 as i32 - u;
996 (s as i64, f1 as i64, f2 as i64)
997 })
998 .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| {
999 let mut ret: Sums = sums;
1000 ret.h[0][0] += f2 * f2;
1001 ret.h[1][1] += f1 * f1;
1002 ret.h[0][1] += f1 * f2;
1003 ret.c[0] += f2 * s;
1004 ret.c[1] += f1 * s;
1005 ret
1006 });
1007
1008 h[0][0] += sums.h[0][0] as f64;
1009 h[1][1] += sums.h[1][1] as f64;
1010 h[0][1] += sums.h[0][1] as f64;
1011 c[0] += sums.c[0] as f64;
1012 c[1] += sums.c[1] as f64;
1013 }
1014
1015 process_line(
1016 &mut h,
1017 &mut c,
1018 &cdeffed[y],
1019 &input[y],
1020 &f_r1,
1021 f_r2_01[dy],
1022 cdef_w,
1023 );
1024 }
1025 }
1026
1027 // this is lifted almost in-tact from libaom
1028 let n = cdef_w as f64 * cdef_h as f64;
1029 h[0][0] /= n;
1030 h[0][1] /= n;
1031 h[1][1] /= n;
1032 h[1][0] = h[0][1];
1033 c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1034 c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n;
1035 let (xq0, xq1) = if s_r2 == 0 {
1036 // H matrix is now only the scalar h[1][1]
1037 // C vector is now only the scalar c[1]
1038 if h[1][1] == 0. {
1039 (0, 0)
1040 } else {
1041 (0, (c[1] / h[1][1]).round() as i32)
1042 }
1043 } else if s_r1 == 0 {
1044 // H matrix is now only the scalar h[0][0]
1045 // C vector is now only the scalar c[0]
1046 if h[0][0] == 0. {
1047 (0, 0)
1048 } else {
1049 ((c[0] / h[0][0]).round() as i32, 0)
1050 }
1051 } else {
1052 let det = h[0][0] * h[1][1] - h[0][1] * h[1][0];
1053 if det == 0. {
1054 (0, 0)
1055 } else {
1056 // If scaling up dividend would overflow, instead scale down the divisor
1057 let div1 = h[1][1] * c[0] - h[0][1] * c[1];
1058 let div2 = h[0][0] * c[1] - h[1][0] * c[0];
1059 ((div1 / det).round() as i32, (div2 / det).round() as i32)
1060 }
1061 };
1062 {
1063 let xqd0 =
1064 clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32);
1065 let xqd1 = clamp(
1066 (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1,
1067 SGRPROJ_XQD_MIN[1] as i32,
1068 SGRPROJ_XQD_MAX[1] as i32,
1069 );
1070 (xqd0 as i8, xqd1 as i8)
1071 }
1072 }
1073
/// Applies the separable 7-tap Wiener filter to one stripe of a plane,
/// writing filtered pixels into `out`.
///
/// `coeffs` holds the three signaled half-taps for the vertical (`[0]`) and
/// horizontal (`[1]`) kernels; the center tap is derived below so that each
/// 7-tap kernel sums to 128 (`1 << WIENER_BITS`). `stripe_y` may be negative:
/// the first stripe starts 8 luma rows above the frame top. Rows inside the
/// stripe are read from the CDEF'd plane; rows above/below the stripe come
/// from the deblocked (pre-CDEF) plane, clamped to the frame crop.
fn wiener_stripe_filter<T: Pixel>(
  coeffs: [[i8; 3]; 2], fi: &FrameInvariants<T>, crop_w: usize, crop_h: usize,
  stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize,
  cdeffed: &Plane<T>, deblocked: &Plane<T>, out: &mut Plane<T>,
) {
  let bit_depth = fi.sequence.bit_depth;
  // Horizontal/vertical rounding shifts; 12-bit content shifts more
  // horizontally (and less vertically) to keep intermediates in range.
  let round_h = if bit_depth == 12 { 5 } else { 3 };
  let round_v = if bit_depth == 12 { 9 } else { 11 };
  // Intermediate (post-horizontal) values are stored biased by -offset and
  // clamped to [-offset, limit - offset].
  let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1);
  let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1;

  // Widen the signaled i8 taps to i32 for filter arithmetic.
  let mut coeffs_ = [[0; 3]; 2];
  for i in 0..2 {
    for j in 0..3 {
      coeffs_[i][j] = i32::from(coeffs[i][j]);
    }
  }

  // Column buffer of horizontally-filtered values feeding the vertical pass.
  let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7];
  // Full symmetric 7-tap kernels; center tap makes each sum to 128.
  let vfilter: [i32; 7] = [
    coeffs_[0][0],
    coeffs_[0][1],
    coeffs_[0][2],
    128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]),
    coeffs_[0][2],
    coeffs_[0][1],
    coeffs_[0][0],
  ];
  let hfilter: [i32; 7] = [
    coeffs_[1][0],
    coeffs_[1][1],
    coeffs_[1][2],
    128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]),
    coeffs_[1][2],
    coeffs_[1][1],
    coeffs_[1][0],
  ];

  // unlike x, our y can be negative to start as the first stripe
  // starts off the top of the frame by 8 pixels, and can also run off the end of the frame
  let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize;
  let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize;
  // Number of output rows actually written (clipped to the frame crop).
  let end_i = cmp::max(
    0,
    if stripe_h as isize + stripe_y > crop_h as isize {
      crop_h as isize - stripe_y - start_wi as isize
    } else {
      stripe_h as isize - start_wi as isize
    },
  ) as usize;

  let mut out_slice =
    out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize });

  // Process one output column at a time: horizontal pass fills `work`,
  // vertical pass consumes it.
  for xi in stripe_x..stripe_x + stripe_w {
    // Number of filter taps that land inside the frame horizontally.
    let n = cmp::min(7, crop_w as isize + 3 - xi as isize);
    for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 {
      let mut acc = 0;
      // Row source selection: deblocked above/below the stripe (with
      // limited reach), cdeffed inside it; all clamped to the crop.
      let src = if yi < stripe_y {
        let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2);
        deblocked.row(ly)
      } else if yi < stripe_y + stripe_h as isize {
        let ly = clamp(yi, 0, crop_h as isize - 1);
        cdeffed.row(ly)
      } else {
        let ly = cmp::min(
          clamp(yi, 0, crop_h as isize - 1),
          stripe_y + stripe_h as isize + 1,
        );
        deblocked.row(ly)
      };
      // Replicate edge pixels for taps that fall outside the frame.
      let start = i32::cast_from(src[0]);
      let end = i32::cast_from(src[crop_w - 1]);
      for i in 0..3 - xi as isize {
        acc += hfilter[i as usize] * start;
      }

      // In-frame taps: offset between tap index and source column.
      let off = 3 - (xi as isize);
      let s = cmp::max(0, off) as usize;
      let s1 = (s as isize - off) as usize;
      let n1 = (n as isize - off) as usize;

      for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) {
        acc += hf * i32::cast_from(v);
      }

      for i in n..7 {
        acc += hfilter[i as usize] * end;
      }

      acc = (acc + (1 << round_h >> 1)) >> round_h;
      work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset);
    }

    // Vertical pass over the buffered column, writing clamped pixels.
    for (wi, dst) in (start_wi..start_wi + end_i)
      .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i))
    {
      let mut acc = 0;
      for (i, src) in (0..7).zip(work[wi..wi + 7].iter_mut()) {
        acc += vfilter[i] * *src;
      }
      *dst = T::cast_from(clamp(
        (acc + (1 << round_v >> 1)) >> round_v,
        0,
        (1 << bit_depth) - 1,
      ));
    }
  }
}
1183
/// One loop-restoration unit: records the restoration filter (Wiener,
/// SGRPROJ, or none) selected for this unit of the frame.
#[derive(Copy, Clone, Debug, Default)]
pub struct RestorationUnit {
  // Filter choice applied to this unit by `lrf_filter_frame`.
  pub filter: RestorationFilter,
}
1188
/// Row-major grid of restoration units covering one plane of a frame.
/// Indexed by row via `Index`/`IndexMut` (`units[y][x]`).
#[derive(Clone, Debug)]
pub struct FrameRestorationUnits {
  // Flat storage, rows * cols entries, row-major.
  units: Box<[RestorationUnit]>,
  pub cols: usize,
  pub rows: usize,
}
1195
1196 impl FrameRestorationUnits {
new(cols: usize, rows: usize) -> Self1197 pub fn new(cols: usize, rows: usize) -> Self {
1198 Self {
1199 units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(),
1200 cols,
1201 rows,
1202 }
1203 }
1204 }
1205
1206 impl Index<usize> for FrameRestorationUnits {
1207 type Output = [RestorationUnit];
1208 #[inline(always)]
index(&self, index: usize) -> &Self::Output1209 fn index(&self, index: usize) -> &Self::Output {
1210 &self.units[index * self.cols..(index + 1) * self.cols]
1211 }
1212 }
1213
1214 impl IndexMut<usize> for FrameRestorationUnits {
1215 #[inline(always)]
index_mut(&mut self, index: usize) -> &mut Self::Output1216 fn index_mut(&mut self, index: usize) -> &mut Self::Output {
1217 &mut self.units[index * self.cols..(index + 1) * self.cols]
1218 }
1219 }
1220
/// Static loop-restoration configuration for one plane: restoration mode,
/// unit size, and the geometry of the unit grid.
#[derive(Clone, Debug)]
pub struct RestorationPlaneConfig {
  // Restoration mode for the plane; one of the RESTORE_* constants
  // (this crate constructs planes with RESTORE_SWITCHABLE).
  pub lrf_type: u8,
  // Restoration-unit size in pixels, in this plane's (subsampled) resolution.
  pub unit_size: usize,
  // (1 << sb_x_shift) gives the number of superblocks horizontally or
  // vertically in a restoration unit, not accounting for RU stretching
  pub sb_h_shift: usize,
  pub sb_v_shift: usize,
  pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop)
  pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop)
  // stripe height is 64 in all cases except 4:2:0 chroma planes where
  // it is 32. This is independent of all other setup parameters
  pub stripe_height: usize,
  // Number of restoration-unit columns in this plane.
  pub cols: usize,
  // Number of restoration-unit rows in this plane.
  pub rows: usize,
}
1237
/// Loop-restoration state for one plane: the static configuration plus the
/// grid of per-unit filter selections.
#[derive(Clone, Debug)]
pub struct RestorationPlane {
  pub cfg: RestorationPlaneConfig,
  pub units: FrameRestorationUnits,
}
1243
/// A (row, col) position within a plane's restoration-unit grid.
// NOTE(review): fields appear to index restoration units rather than
// pixels — confirm against the tiling code that consumes this type.
#[derive(Clone, Default)]
pub struct RestorationPlaneOffset {
  pub row: usize,
  pub col: usize,
}
1249
1250 impl RestorationPlane {
new( lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize, sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize, rows: usize, ) -> RestorationPlane1251 pub fn new(
1252 lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize,
1253 sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize,
1254 rows: usize,
1255 ) -> RestorationPlane {
1256 let stripe_height = if stripe_decimate != 0 { 32 } else { 64 };
1257 RestorationPlane {
1258 cfg: RestorationPlaneConfig {
1259 lrf_type,
1260 unit_size,
1261 sb_h_shift,
1262 sb_v_shift,
1263 sb_cols,
1264 sb_rows,
1265 stripe_height,
1266 cols,
1267 rows,
1268 },
1269 units: FrameRestorationUnits::new(cols, rows),
1270 }
1271 }
1272
1273 // Stripes are always 64 pixels high in a non-subsampled
1274 // frame, and decimated from 64 pixels in chroma. When
1275 // filtering, they are not co-located on Y with superblocks.
restoration_unit_index_by_stripe( &self, stripenum: usize, rux: usize, ) -> (usize, usize)1276 fn restoration_unit_index_by_stripe(
1277 &self, stripenum: usize, rux: usize,
1278 ) -> (usize, usize) {
1279 (
1280 cmp::min(rux, self.cfg.cols - 1),
1281 cmp::min(
1282 stripenum * self.cfg.stripe_height / self.cfg.unit_size,
1283 self.cfg.rows - 1,
1284 ),
1285 )
1286 }
1287
restoration_unit_by_stripe( &self, stripenum: usize, rux: usize, ) -> &RestorationUnit1288 pub fn restoration_unit_by_stripe(
1289 &self, stripenum: usize, rux: usize,
1290 ) -> &RestorationUnit {
1291 let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux);
1292 &self.units[y][x]
1293 }
1294 }
1295
/// Frame-level loop-restoration state: one `RestorationPlane` per plane
/// (Y, U, V).
#[derive(Clone, Debug)]
pub struct RestorationState {
  pub planes: [RestorationPlane; MAX_PLANES],
}
1300
impl RestorationState {
  /// Chooses the per-plane loop-restoration unit (LRU) sizes for a frame
  /// and allocates the corresponding unit grids.
  ///
  /// Size selection is constrained by a quantizer-driven heuristic, chroma
  /// subsampling, tiling (an LRU must not span tiles in this one-pass
  /// design), and the spec rule that Y and UV LRU sizes may only differ
  /// when chroma is subsampled in both dimensions (4:2:0).
  pub fn new<T: Pixel>(fi: &FrameInvariants<T>, input: &Frame<T>) -> Self {
    let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg;
    // stripe size is decimated in 4:2:0 (and only 4:2:0)
    let stripe_uv_decimate = if xdec > 0 && ydec > 0 { 1 } else { 0 };
    let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 };
    let uv_sb_h_log2 = y_sb_log2 - xdec;
    let uv_sb_v_log2 = y_sb_log2 - ydec;

    let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru
      && fi.sequence.enable_restoration
    {
      assert!(
        fi.width > 1 && fi.height > 1,
        "Width and height must be higher than 1 for LRF setup"
      );

      // Specific content does affect optimal LRU size choice, but the
      // quantizer in use is a surprisingly strong selector.
      let lrf_base_shift = if fi.base_q_idx > 200 {
        0 // big
      } else if fi.base_q_idx > 160 {
        1
      } else {
        2 // small
      };
      let lrf_chroma_shift = if stripe_uv_decimate > 0 {
        // 4:2:0 only
        if lrf_base_shift == 2 {
          1 // smallest chroma LRU is a win at low quant
        } else {
          // Will a down-shifted chroma LRU eliminate stretch in chroma?
          // If so, that's generally a win.
          let lrf_unit_size =
            1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift);
          // "Stretch" = the frame's right/bottom sliver is less than half
          // an LRU, so the last LRU must be stretched to absorb it.
          let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size
            <= lrf_unit_size / 2
            || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2;
          let shifted_stretch = ((fi.width >> xdec) - 1)
            % (lrf_unit_size >> 1)
            <= lrf_unit_size / 4
            || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1)
              <= lrf_unit_size / 4;
          if unshifted_stretch && !shifted_stretch {
            1 // shift to eliminate stretch
          } else {
            0 // don't shift; save the signaling bits
          }
        }
      } else {
        0
      };
      (lrf_base_shift, lrf_base_shift + lrf_chroma_shift)
    } else {
      // Explicit request to tie LRU size to superblock size ==
      // smallest possible LRU size
      let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 };
      (lrf_y_shift, lrf_y_shift + stripe_uv_decimate)
    };

    let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift);
    let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift);

    let tiling = fi.sequence.tiling;
    // Right now we defer to tiling setup: don't choose an LRU size
    // large enough that a tile is not an integer number of LRUs
    // wide/high.
    if tiling.cols > 1 || tiling.rows > 1 {
      // despite suggestions to the contrary, tiles can be
      // non-powers-of-2.
      let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize;
      let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize;
      let tile_aligned_y_unit_size =
        1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros));
      let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros);
      let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros);
      y_unit_size = y_unit_size.min(tile_aligned_y_unit_size);
      uv_unit_size = uv_unit_size
        .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size));

      // But it's actually worse: LRUs can't span tiles (in our
      // one-pass design that is, spec allows it). However, the spec
      // mandates the last LRU stretches forward into any
      // less-than-half-LRU span of superblocks at the right and
      // bottom of a frame. These superblocks may well be in a
      // different tile! Even if LRUs are minimum size (one
      // superblock), when the right or bottom edge of the frame is a
      // superblock that's less than half the width/height of a normal
      // superblock, the LRU is forced by the spec to span into it
      // (and thus a different tile). Tiling is under no such
      // restriction; it could decide the right/left sliver will be in
      // its own tile row/column. We can't disallow the combination
      // here. The tiling code will have to either prevent it or
      // tolerate it. (prayer mechanic == Issue #1629).
    }

    // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes
    // to be the same*. If they differ at this
    // point, it's due to a tiling restriction enforcing a maximum
    // size, so force both to the smaller value.
    //
    // *see sec 5.9.20, "Loop restoration params syntax". The
    // bitstream provides means of coding a different UV LRU size only
    // when chroma is in use and both x and y are subsampled in the
    // chroma planes.
    if ydec == 0 && y_unit_size != uv_unit_size {
      y_unit_size = uv_unit_size.min(y_unit_size);
      uv_unit_size = y_unit_size;
    }

    // derive the rest
    let y_unit_log2 = y_unit_size.ilog() - 1;
    let uv_unit_log2 = uv_unit_size.ilog() - 1;
    // Unit counts round to nearest: a trailing sliver of at most half a
    // unit is absorbed into the last unit. Minimum one unit per dimension.
    let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1);
    let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1);
    let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);
    let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec)
      + (uv_unit_size >> 1))
      / uv_unit_size)
      .max(1);

    RestorationState {
      planes: [
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          y_unit_size,
          y_unit_log2 - y_sb_log2,
          y_unit_log2 - y_sb_log2,
          fi.sb_width,
          fi.sb_height,
          0,
          y_cols,
          y_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
        RestorationPlane::new(
          RESTORE_SWITCHABLE,
          uv_unit_size,
          uv_unit_log2 - uv_sb_h_log2,
          uv_unit_log2 - uv_sb_v_log2,
          fi.sb_width,
          fi.sb_height,
          stripe_uv_decimate,
          uv_cols,
          uv_rows,
        ),
      ],
    }
  }

  /// Applies the selected restoration filters (Wiener / SGRPROJ / none) to
  /// the whole frame, stripe by stripe and restoration-unit by unit.
  ///
  /// `out` enters holding the CDEF'd frame and is filtered in place (the
  /// clone below preserves the unfiltered source); `pre_cdef` supplies
  /// deblocked pixels for rows outside the current stripe.
  #[hawktracer(lrf_filter_frame)]
  pub fn lrf_filter_frame<T: Pixel>(
    &mut self, out: &mut Frame<T>, pre_cdef: &Frame<T>,
    fi: &FrameInvariants<T>,
  ) {
    let cdeffed = out.clone();
    let planes =
      if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES };

    // unlike the other loop filters that operate over the padded
    // frame dimensions, restoration filtering and source pixel
    // accesses are clipped to the original frame dimensions
    // that's why we use fi.width and fi.height instead of PlaneConfig fields

    // number of stripes (counted according to colocated Y luma position)
    let stripe_n = (fi.height + 7) / 64 + 1;

    // Buffers for the stripe filter.
    let mut stripe_filter_buffer =
      IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE);

    for pli in 0..planes {
      let rp = &self.planes[pli];
      let xdec = out.planes[pli].cfg.xdec;
      let ydec = out.planes[pli].cfg.ydec;
      let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec;
      let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec;

      for si in 0..stripe_n {
        // First stripe is 8 luma rows shorter; later stripes start 8 rows
        // early and are clipped to the crop.
        let (stripe_start_y, stripe_size) = if si == 0 {
          (0, (64 - 8) >> ydec)
        } else {
          let start = (si * 64 - 8) >> ydec;
          (
            start as isize,
            // one past, unlike spec
            (64 >> ydec).min(crop_h - start),
          )
        };

        // horizontally, go rdu-by-rdu
        for rux in 0..rp.cfg.cols {
          // stripe x pixel locations must be clipped to frame, last may need to stretch
          let x = rux * rp.cfg.unit_size;
          let size =
            if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size };
          let ru = rp.restoration_unit_by_stripe(si, rux);
          match ru.filter {
            RestorationFilter::Wiener { coeffs } => {
              wiener_stripe_filter(
                coeffs,
                fi,
                crop_w,
                crop_h,
                size,
                stripe_size,
                x,
                stripe_start_y,
                &cdeffed.planes[pli],
                &pre_cdef.planes[pli],
                &mut out.planes[pli],
              );
            }
            RestorationFilter::Sgrproj { set, xqd } => {
              // NOTE(review): SGRPROJ units are skipped outright when CDEF
              // is disabled — presumably RDO never selects them in that
              // configuration; confirm with the mode-decision code.
              if !fi.sequence.enable_cdef {
                continue;
              }

              setup_integral_image(
                &mut stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                crop_w - x,
                (crop_h as isize - stripe_start_y) as usize,
                size,
                stripe_size,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &pre_cdef.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
              );

              sgrproj_stripe_filter(
                set,
                xqd,
                fi,
                &stripe_filter_buffer,
                STRIPE_IMAGE_STRIDE,
                &cdeffed.planes[pli]
                  .slice(PlaneOffset { x: x as isize, y: stripe_start_y }),
                &mut out.planes[pli].region_mut(Area::Rect {
                  x: x as isize,
                  y: stripe_start_y,
                  width: size,
                  height: stripe_size,
                }),
              );
            }
            RestorationFilter::None => {
              // do nothing
            }
          }
        }
      }
    }
  }
}
1570