// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

use crate::cpu_features::CpuFeatureLevel;
use crate::frame::PlaneSlice;
use crate::lrf::*;
use crate::util::Pixel;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::mem;

// Computes an intermediate (a, b) row for stripe_w + 2 columns at row y
// for the radius-1 (3x3) box.
#[inline]
pub fn sgrproj_box_ab_r1(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
  cpu: CpuFeatureLevel,
) {
  // The AVX2 path below only handles 8-bit content (bdm8 == 0).
  if cpu >= CpuFeatureLevel::AVX2 && bdm8 == 0 {
    return unsafe {
      sgrproj_box_ab_r1_avx2(
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        y,
        stripe_w,
        s,
        bdm8,
      );
    };
  }

  rust::sgrproj_box_ab_r1(
    af,
    bf,
    iimg,
    iimg_sq,
    iimg_stride,
    y,
    stripe_w,
    s,
    bdm8,
    cpu,
  );
}

// Computes an intermediate (a, b) row for stripe_w + 2 columns at row y
// for the radius-2 (5x5) box.
#[inline]
pub fn sgrproj_box_ab_r2(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
  cpu: CpuFeatureLevel,
) {
  // The AVX2 path below only handles 8-bit content (bdm8 == 0).
  if cpu >= CpuFeatureLevel::AVX2 && bdm8 == 0 {
    return unsafe {
      sgrproj_box_ab_r2_avx2(
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        y,
        stripe_w,
        s,
        bdm8,
      );
    };
  }

  rust::sgrproj_box_ab_r2(
    af,
    bf,
    iimg,
    iimg_sq,
    iimg_stride,
    y,
    stripe_w,
    s,
    bdm8,
    cpu,
  );
}

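// Computes the unfiltered (r = 0) output row: for w columns at row y, f is
// just the cdeffed pixel value shifted left by SGRPROJ_RST_BITS.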
#[inline]
pub fn sgrproj_box_f_r0<T: Pixel>(
  f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  cpu: CpuFeatureLevel,
) {
  if cpu >= CpuFeatureLevel::AVX2 {
    return unsafe {
      sgrproj_box_f_r0_avx2(f, y, w, cdeffed);
    };
  }

  rust::sgrproj_box_f_r0(f, y, w, cdeffed, cpu);
}

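// Computes a filtered (r = 1) output row from three consecutive (a, b)
// intermediate rows, weighting the 3x3 neighborhood and combining it with
// the cdeffed pixels at row y.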
#[inline]
pub fn sgrproj_box_f_r1<T: Pixel>(
  af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
  cdeffed: &PlaneSlice<T>, cpu: CpuFeatureLevel,
) {
  if cpu >= CpuFeatureLevel::AVX2 {
    return unsafe {
      sgrproj_box_f_r1_avx2(af, bf, f, y, w, cdeffed);
    };
  }

  rust::sgrproj_box_f_r1(af, bf, f, y, w, cdeffed, cpu);
}

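// Computes two filtered (r = 2) output rows (f0 for row y, f1 for row y + 1)
// from a pair of (a, b) intermediate rows, since the radius-2 intermediates
// are only computed on alternate rows.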
#[inline]
pub fn sgrproj_box_f_r2<T: Pixel>(
  af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
  y: usize, w: usize, cdeffed: &PlaneSlice<T>, cpu: CpuFeatureLevel,
) {
  if cpu >= CpuFeatureLevel::AVX2 {
    return unsafe {
      sgrproj_box_f_r2_avx2(af, bf, f0, f1, y, w, cdeffed);
    };
  }

  rust::sgrproj_box_f_r2(af, bf, f0, f1, y, w, cdeffed, cpu);
}

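// Lookup table for x / (x + 1) in Q8 fixed point, i.e. round(256 * x / (x + 1)).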
static X_BY_XPLUS1: [u32; 256] = [
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in selfguided_restoration_internal() for why
  1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240,
  241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248,
  248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250,
  251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256,
];

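// Computes 8 consecutive (a, b) values of an intermediate row, using the
// integral image (iimg) and integral image of squares (iimg_sq) to get the
// sum and sum of squares of each (2r + 1) x (2r + 1) window.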
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_ab_8_avx2(
  r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize,
) {
  let d: usize = r * 2 + 1;
  let n: i32 = (d * d) as i32;
  let one_over_n = if r == 1 { 455 } else { 164 };
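  // 455 and 164 are the reciprocals of n = 9 and n = 25 in
  // SGRPROJ_RECIP_BITS fixed point; b below is divided by n by multiplying
  // with one_over_n and rounding-shifting by SGRPROJ_RECIP_BITS.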

  // Using an integral image, compute the sum of a square region
  #[inline]
  #[target_feature(enable = "avx2")]
  unsafe fn get_integral_square_avx2(
    iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
  ) -> __m256i {
    let iimg = iimg.as_ptr().add(y * stride + x);
    // Cancel out overflow in iimg by using wrapping arithmetic
    _mm256_sub_epi32(
      _mm256_add_epi32(
        _mm256_loadu_si256(iimg as *const _),
        _mm256_loadu_si256(iimg.add(size * stride + size) as *const _),
      ),
      _mm256_add_epi32(
        _mm256_loadu_si256(iimg.add(size * stride) as *const _),
        _mm256_loadu_si256(iimg.add(size) as *const _),
      ),
    )
  }

  let sum = get_integral_square_avx2(iimg, iimg_stride, x, y, d);
  let ssq = get_integral_square_avx2(iimg_sq, iimg_stride, x, y, d);
  let scaled_sum = _mm256_srlv_epi32(
    _mm256_add_epi32(sum, _mm256_set1_epi32(1 << bdm8 as i32 >> 1)),
    _mm256_set1_epi32(bdm8 as i32),
  );
  let scaled_ssq = _mm256_srlv_epi32(
    _mm256_add_epi32(ssq, _mm256_set1_epi32(1 << (2 * bdm8) as i32 >> 1)),
    _mm256_set1_epi32(2 * bdm8 as i32),
  );
  let p = _mm256_max_epi32(
    _mm256_setzero_si256(),
    _mm256_sub_epi32(
      _mm256_mullo_epi32(scaled_ssq, _mm256_set1_epi32(n)),
      _mm256_madd_epi16(scaled_sum, scaled_sum),
    ),
  );
  let z = _mm256_srli_epi32(
    _mm256_add_epi32(
      _mm256_mullo_epi32(p, _mm256_set1_epi32(s as i32)),
      _mm256_set1_epi32(1 << SGRPROJ_MTABLE_BITS as i32 >> 1),
    ),
    SGRPROJ_MTABLE_BITS as i32,
  );
  let a = _mm256_i32gather_epi32(
    X_BY_XPLUS1.as_ptr() as *const _,
    _mm256_min_epi32(z, _mm256_set1_epi32(255)),
    4,
  );
  let b = _mm256_mullo_epi32(
    _mm256_madd_epi16(
      _mm256_sub_epi32(_mm256_set1_epi32(1 << SGRPROJ_SGR_BITS as i32), a),
      sum,
    ),
    _mm256_set1_epi32(one_over_n),
  );
  let b = _mm256_srlv_epi32(
    _mm256_add_epi32(
      b,
      _mm256_set1_epi32(1 << SGRPROJ_RECIP_BITS as i32 >> 1),
    ),
    _mm256_set1_epi32(SGRPROJ_RECIP_BITS as i32),
  );
  _mm256_storeu_si256(af.as_mut_ptr().add(x) as *mut _, a);
  _mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b);
}

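// AVX2 version of the radius-1 (a, b) row computation: processes 8 columns
// per iteration and falls back to the scalar implementation for the tail.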
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
) {
  for x in (0..stripe_w + 2).step_by(8) {
    if x + 8 <= stripe_w + 2 {
      sgrproj_box_ab_8_avx2(
        1,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        s,
        bdm8,
      );
    } else {
      // finish using scalar
      rust::sgrproj_box_ab_internal(
        1,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        stripe_w,
        s,
        bdm8,
      );
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut af_ref: Vec<u32> = vec![0; stripe_w + 2];
    let mut bf_ref: Vec<u32> = vec![0; stripe_w + 2];
    rust::sgrproj_box_ab_internal(
      1,
      &mut af_ref,
      &mut bf_ref,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
    assert_eq!(&af[..stripe_w + 2], &af_ref[..]);
    assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]);
  }
}

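// AVX2 version of the radius-2 (a, b) row computation: identical structure
// to the r1 variant above, but with a 5x5 window.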
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
) {
  for x in (0..stripe_w + 2).step_by(8) {
    if x + 8 <= stripe_w + 2 {
      sgrproj_box_ab_8_avx2(
        2,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        s,
        bdm8,
      );
    } else {
      // finish using scalar
      rust::sgrproj_box_ab_internal(
        2,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        stripe_w,
        s,
        bdm8,
      );
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut af_ref: Vec<u32> = vec![0; stripe_w + 2];
    let mut bf_ref: Vec<u32> = vec![0; stripe_w + 2];
    rust::sgrproj_box_ab_internal(
      2,
      &mut af_ref,
      &mut bf_ref,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
    assert_eq!(&af[..stripe_w + 2], &af_ref[..]);
    assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]);
  }
}

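// Loads 8 cdeffed pixels (u8 or u16 depending on T), widens them to 32 bits
// and stores them shifted left by SGRPROJ_RST_BITS.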
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
  f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice<T>,
) {
  _mm256_storeu_si256(
    f.as_mut_ptr().add(x) as *mut _,
    _mm256_slli_epi32(
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      },
      SGRPROJ_RST_BITS as i32,
    ),
  );
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
  f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
) {
  for x in (0..w).step_by(8) {
    if x + 8 <= w {
      sgrproj_box_f_r0_8_avx2(f, x, y, cdeffed);
    } else {
      // finish using scalar
      rust::sgrproj_box_f_r0_internal(f, x, y, w, cdeffed);
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut f_ref: Vec<u32> = vec![0; w];
    rust::sgrproj_box_f_r0_internal(&mut f_ref, 0, y, w, cdeffed);
    assert_eq!(&f[..w], &f_ref[..]);
  }
}

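// Computes 8 output values of the radius-1 filter: the corner neighbors of
// the 3x3 (a, b) window are weighted 3 and the center cross is weighted 4
// (total weight 32), then f = a * cdeffed + b with a rounding shift.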
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
  af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize,
  cdeffed: &PlaneSlice<T>,
) {
  let three = _mm256_set1_epi32(3);
  let four = _mm256_set1_epi32(4);
  let a0 = af[0].as_ptr();
  let a1 = af[1].as_ptr();
  let a2 = af[2].as_ptr();
  let b0 = bf[0].as_ptr();
  let b1 = bf[1].as_ptr();
  let b2 = bf[2].as_ptr();
  let a = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(a0.add(x) as *const _),
          _mm256_loadu_si256(a0.add(x + 2) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_loadu_si256(a2.add(x) as *const _),
          _mm256_loadu_si256(a2.add(x + 2) as *const _),
        ),
      ),
      three,
    ),
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(a1.add(x) as *const _),
          _mm256_loadu_si256(a0.add(x + 1) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_add_epi32(
            _mm256_loadu_si256(a1.add(x + 1) as *const _),
            _mm256_loadu_si256(a2.add(x + 1) as *const _),
          ),
          _mm256_loadu_si256(a1.add(x + 2) as *const _),
        ),
      ),
      four,
    ),
  );
  let b = _mm256_add_epi32(
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(b0.add(x) as *const _),
          _mm256_loadu_si256(b0.add(x + 2) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_loadu_si256(b2.add(x) as *const _),
          _mm256_loadu_si256(b2.add(x + 2) as *const _),
        ),
      ),
      three,
    ),
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(b1.add(x) as *const _),
          _mm256_loadu_si256(b0.add(x + 1) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_add_epi32(
            _mm256_loadu_si256(b1.add(x + 1) as *const _),
            _mm256_loadu_si256(b2.add(x + 1) as *const _),
          ),
          _mm256_loadu_si256(b1.add(x + 2) as *const _),
        ),
      ),
      four,
    ),
  );
  let v = _mm256_add_epi32(
    _mm256_madd_epi16(
      a,
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      },
    ),
    b,
  );
  const SHIFT: i32 = (5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32;
  _mm256_storeu_si256(
    f.as_mut_ptr().add(x) as *mut _,
    _mm256_srli_epi32(
      _mm256_add_epi32(v, _mm256_set1_epi32(1 << SHIFT >> 1)),
      SHIFT,
    ),
  );
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
  af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
  cdeffed: &PlaneSlice<T>,
) {
  for x in (0..w).step_by(8) {
    if x + 8 <= w {
      sgrproj_box_f_r1_8_avx2(af, bf, f, x, y, cdeffed);
    } else {
      // finish using scalar
      rust::sgrproj_box_f_r1_internal(af, bf, f, x, y, w, cdeffed);
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut f_ref: Vec<u32> = vec![0; w];
    rust::sgrproj_box_f_r1_internal(af, bf, &mut f_ref, 0, y, w, cdeffed);
    assert_eq!(&f[..w], &f_ref[..]);
  }
}

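// Computes 8 output values of the radius-2 filter for two rows at once:
// the (a, b) columns are weighted 5-6-5 horizontally; row y (f0) combines
// both intermediate rows (total weight 32, SHIFT) while row y + 1 (f1) uses
// only the second one (total weight 16, SHIFTO).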
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
  af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
  x: usize, y: usize, cdeffed: &PlaneSlice<T>,
) {
  let five = _mm256_set1_epi32(5);
  let six = _mm256_set1_epi32(6);
  let a0 = af[0].as_ptr();
  let a2 = af[1].as_ptr();
  let b0 = bf[0].as_ptr();
  let b2 = bf[1].as_ptr();
  let a = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_loadu_si256(a0.add(x) as *const _),
        _mm256_loadu_si256(a0.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_madd_epi16(_mm256_loadu_si256(a0.add(x + 1) as *const _), six),
  );
  let b = _mm256_add_epi32(
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_loadu_si256(b0.add(x) as *const _),
        _mm256_loadu_si256(b0.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_mullo_epi32(_mm256_loadu_si256(b0.add(x + 1) as *const _), six),
  );
  let ao = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_loadu_si256(a2.add(x) as *const _),
        _mm256_loadu_si256(a2.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_madd_epi16(_mm256_loadu_si256(a2.add(x + 1) as *const _), six),
  );
  let bo = _mm256_add_epi32(
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_loadu_si256(b2.add(x) as *const _),
        _mm256_loadu_si256(b2.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_mullo_epi32(_mm256_loadu_si256(b2.add(x + 1) as *const _), six),
  );
  let v = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(a, ao),
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      },
    ),
    _mm256_add_epi32(b, bo),
  );
  let vo = _mm256_add_epi32(
    _mm256_madd_epi16(
      ao,
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y + 1).as_ptr() as *const _,
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y + 1).as_ptr() as *const _,
        ))
      },
    ),
    bo,
  );
  const SHIFT: i32 = (5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32;
  _mm256_storeu_si256(
    f0.as_mut_ptr().add(x) as *mut _,
    _mm256_srli_epi32(
      _mm256_add_epi32(v, _mm256_set1_epi32(1 << SHIFT >> 1)),
      SHIFT,
    ),
  );
  const SHIFTO: i32 = (4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32;
  _mm256_storeu_si256(
    f1.as_mut_ptr().add(x) as *mut _,
    _mm256_srli_epi32(
      _mm256_add_epi32(vo, _mm256_set1_epi32(1 << SHIFTO >> 1)),
      SHIFTO,
    ),
  );
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_f_r2_avx2<T: Pixel>(
  af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
  y: usize, w: usize, cdeffed: &PlaneSlice<T>,
) {
  for x in (0..w).step_by(8) {
    if x + 8 <= w {
      sgrproj_box_f_r2_8_avx2(af, bf, f0, f1, x, y, cdeffed);
    } else {
      // finish using scalar
      rust::sgrproj_box_f_r2_internal(af, bf, f0, f1, x, y, w, cdeffed);
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut f0_ref: Vec<u32> = vec![0; w];
    let mut f1_ref: Vec<u32> = vec![0; w];
    rust::sgrproj_box_f_r2_internal(
      af,
      bf,
      &mut f0_ref,
      &mut f1_ref,
      0,
      y,
      w,
      cdeffed,
    );
    assert_eq!(&f0[..w], &f0_ref[..]);
    assert_eq!(&f1[..w], &f1_ref[..]);
  }
}