// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

use crate::cpu_features::CpuFeatureLevel;
use crate::frame::PlaneSlice;
use crate::lrf::*;
use crate::util::Pixel;
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::mem;

// computes an intermediate (ab) row for stripe_w + 2 columns at row y
#[inline]
pub fn sgrproj_box_ab_r1(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
  cpu: CpuFeatureLevel,
) {
  // only use the 8-bit AVX2 assembly when the bit depth is 8 (bdm8 == 0)
  if cpu >= CpuFeatureLevel::AVX2 && bdm8 == 0 {
    return unsafe {
      sgrproj_box_ab_r1_avx2(
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        y,
        stripe_w,
        s,
        bdm8,
      );
    };
  }

  rust::sgrproj_box_ab_r1(
    af,
    bf,
    iimg,
    iimg_sq,
    iimg_stride,
    y,
    stripe_w,
    s,
    bdm8,
    cpu,
  );
}
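
// A hypothetical usage sketch (the names and buffer handling here are
// illustrative, not rav1e's actual call site): af and bf must hold at
// least stripe_w + 2 entries, one per output column plus a one-column
// border on each side. Passing bdm8 == 0 marks 8-bit content and allows
// the AVX2 path above.
#[allow(dead_code)]
fn sgrproj_box_ab_r1_usage_sketch(
  iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize,
  stripe_w: usize, s: u32, cpu: CpuFeatureLevel,
) -> (Vec<u32>, Vec<u32>) {
  let mut af = vec![0u32; stripe_w + 2];
  let mut bf = vec![0u32; stripe_w + 2];
  sgrproj_box_ab_r1(
    &mut af, &mut bf, iimg, iimg_sq, iimg_stride, y, stripe_w, s, 0, cpu,
  );
  (af, bf)
}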

// computes an intermediate (ab) row for stripe_w + 2 columns at row y
#[inline]
pub fn sgrproj_box_ab_r2(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
  cpu: CpuFeatureLevel,
) {
  // only use the 8-bit AVX2 assembly when the bit depth is 8 (bdm8 == 0)
  if cpu >= CpuFeatureLevel::AVX2 && bdm8 == 0 {
    return unsafe {
      sgrproj_box_ab_r2_avx2(
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        y,
        stripe_w,
        s,
        bdm8,
      );
    };
  }

  rust::sgrproj_box_ab_r2(
    af,
    bf,
    iimg,
    iimg_sq,
    iimg_stride,
    y,
    stripe_w,
    s,
    bdm8,
    cpu,
  );
}

#[inline]
pub fn sgrproj_box_f_r0<T: Pixel>(
  f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
  cpu: CpuFeatureLevel,
) {
  if cpu >= CpuFeatureLevel::AVX2 {
    return unsafe {
      sgrproj_box_f_r0_avx2(f, y, w, cdeffed);
    };
  }

  rust::sgrproj_box_f_r0(f, y, w, cdeffed, cpu);
}

#[inline]
pub fn sgrproj_box_f_r1<T: Pixel>(
  af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
  cdeffed: &PlaneSlice<T>, cpu: CpuFeatureLevel,
) {
  if cpu >= CpuFeatureLevel::AVX2 {
    return unsafe {
      sgrproj_box_f_r1_avx2(af, bf, f, y, w, cdeffed);
    };
  }

  rust::sgrproj_box_f_r1(af, bf, f, y, w, cdeffed, cpu);
}

#[inline]
pub fn sgrproj_box_f_r2<T: Pixel>(
  af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
  y: usize, w: usize, cdeffed: &PlaneSlice<T>, cpu: CpuFeatureLevel,
) {
  if cpu >= CpuFeatureLevel::AVX2 {
    return unsafe {
      sgrproj_box_f_r2_avx2(af, bf, f0, f1, y, w, cdeffed);
    };
  }

  rust::sgrproj_box_f_r2(af, bf, f0, f1, y, w, cdeffed, cpu);
}

static X_BY_XPLUS1: [u32; 256] = [
  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
  // instead of 0. See comments in selfguided_restoration_internal() for why
  1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240,
  241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248,
  248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250,
  251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252,
  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
  253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
  254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256,
];
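
// A minimal sanity-check sketch for the table above (an illustrative
// addition, not part of rav1e's test suite): apart from the two
// special-cased endpoints, each entry is x / (x + 1) in Q8 fixed point,
// rounded to nearest.
#[cfg(test)]
mod x_by_xplus1_sketch {
  use super::X_BY_XPLUS1;

  #[test]
  fn table_matches_rounded_division() {
    for x in 1..255u32 {
      let expected = (x * 256 + (x + 1) / 2) / (x + 1);
      assert_eq!(X_BY_XPLUS1[x as usize], expected);
    }
    // 0 maps to 1 (i.e. 1/256) instead of 0, and the final entry
    // saturates to 256 (i.e. 1.0).
    assert_eq!(X_BY_XPLUS1[0], 1);
    assert_eq!(X_BY_XPLUS1[255], 256);
  }
}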

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_ab_8_avx2(
  r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize,
) {
  let d: usize = r * 2 + 1;
  let n: i32 = (d * d) as i32;
  // Q(SGRPROJ_RECIP_BITS) reciprocals of n: round(4096 / 9) for r == 1
  // and round(4096 / 25) for r == 2
  let one_over_n = if r == 1 { 455 } else { 164 };

  // Using an integral image, compute the sum of a square region
  #[inline]
  #[target_feature(enable = "avx2")]
  unsafe fn get_integral_square_avx2(
    iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
  ) -> __m256i {
    let iimg = iimg.as_ptr().add(y * stride + x);
    // Cancel out overflow in iimg by using wrapping arithmetic
    _mm256_sub_epi32(
      _mm256_add_epi32(
        _mm256_loadu_si256(iimg as *const _),
        _mm256_loadu_si256(iimg.add(size * stride + size) as *const _),
      ),
      _mm256_add_epi32(
        _mm256_loadu_si256(iimg.add(size * stride) as *const _),
        _mm256_loadu_si256(iimg.add(size) as *const _),
      ),
    )
  }

  let sum = get_integral_square_avx2(iimg, iimg_stride, x, y, d);
  let ssq = get_integral_square_avx2(iimg_sq, iimg_stride, x, y, d);
  // Scale the sums down to 8-bit precision with rounding; the offsets and
  // shifts are zero when bdm8 == 0
  let scaled_sum = _mm256_srlv_epi32(
    _mm256_add_epi32(sum, _mm256_set1_epi32(1 << bdm8 as i32 >> 1)),
    _mm256_set1_epi32(bdm8 as i32),
  );
  let scaled_ssq = _mm256_srlv_epi32(
    _mm256_add_epi32(ssq, _mm256_set1_epi32(1 << (2 * bdm8) as i32 >> 1)),
    _mm256_set1_epi32(2 * bdm8 as i32),
  );
  // p is proportional to the local variance: n * sum(x^2) - sum(x)^2,
  // clamped to zero. madd_epi16 squares each 32-bit lane here, which is
  // valid because scaled_sum fits in the low 16 bits of each lane.
  let p = _mm256_max_epi32(
    _mm256_setzero_si256(),
    _mm256_sub_epi32(
      _mm256_mullo_epi32(scaled_ssq, _mm256_set1_epi32(n as i32)),
      _mm256_madd_epi16(scaled_sum, scaled_sum),
    ),
  );
  // z = round(p * s >> SGRPROJ_MTABLE_BITS)
  let z = _mm256_srli_epi32(
    _mm256_add_epi32(
      _mm256_mullo_epi32(p, _mm256_set1_epi32(s as i32)),
      _mm256_set1_epi32(1 << SGRPROJ_MTABLE_BITS as i32 >> 1),
    ),
    SGRPROJ_MTABLE_BITS as i32,
  );
  // a = z / (z + 1) in Q(SGRPROJ_SGR_BITS), via table lookup with z
  // clamped to 255
  let a = _mm256_i32gather_epi32(
    X_BY_XPLUS1.as_ptr() as *const _,
    _mm256_min_epi32(z, _mm256_set1_epi32(255)),
    4,
  );
  // b = round((2^SGRPROJ_SGR_BITS - a) * sum * one_over_n
  //           >> SGRPROJ_RECIP_BITS)
  let b = _mm256_mullo_epi32(
    _mm256_madd_epi16(
      _mm256_sub_epi32(_mm256_set1_epi32(1 << SGRPROJ_SGR_BITS as i32), a),
      sum,
    ),
    _mm256_set1_epi32(one_over_n),
  );
  let b = _mm256_srlv_epi32(
    _mm256_add_epi32(
      b,
      _mm256_set1_epi32(1 << SGRPROJ_RECIP_BITS as i32 >> 1),
    ),
    _mm256_set1_epi32(SGRPROJ_RECIP_BITS as i32),
  );
  _mm256_storeu_si256(af.as_mut_ptr().add(x) as *mut _, a);
  _mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b);
}
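
// For reference, a scalar sketch of the box sum that
// get_integral_square_avx2 vectorizes (the production scalar path lives in
// rust::sgrproj_box_ab_internal): an integral image turns any size x size
// box sum into four loads. Wrapping arithmetic cancels out any overflow
// accumulated while building iimg, matching the packed adds/subs above.
#[allow(dead_code)]
fn box_sum_scalar_sketch(
  iimg: &[u32], stride: usize, x: usize, y: usize, size: usize,
) -> u32 {
  iimg[y * stride + x]
    .wrapping_add(iimg[(y + size) * stride + (x + size)])
    .wrapping_sub(iimg[(y + size) * stride + x])
    .wrapping_sub(iimg[y * stride + (x + size)])
}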

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_ab_r1_avx2(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
) {
  for x in (0..stripe_w + 2).step_by(8) {
    if x + 8 <= stripe_w + 2 {
      sgrproj_box_ab_8_avx2(
        1,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        s,
        bdm8,
      );
    } else {
      // finish using scalar
      rust::sgrproj_box_ab_internal(
        1,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        stripe_w,
        s,
        bdm8,
      );
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut af_ref: Vec<u32> = vec![0; stripe_w + 2];
    let mut bf_ref: Vec<u32> = vec![0; stripe_w + 2];
    rust::sgrproj_box_ab_internal(
      1,
      &mut af_ref,
      &mut bf_ref,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
    assert_eq!(&af[..stripe_w + 2], &af_ref[..]);
    assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]);
  }
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_ab_r2_avx2(
  af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32],
  iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize,
) {
  for x in (0..stripe_w + 2).step_by(8) {
    if x + 8 <= stripe_w + 2 {
      sgrproj_box_ab_8_avx2(
        2,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        s,
        bdm8,
      );
    } else {
      // finish using scalar
      rust::sgrproj_box_ab_internal(
        2,
        af,
        bf,
        iimg,
        iimg_sq,
        iimg_stride,
        x,
        y,
        stripe_w,
        s,
        bdm8,
      );
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut af_ref: Vec<u32> = vec![0; stripe_w + 2];
    let mut bf_ref: Vec<u32> = vec![0; stripe_w + 2];
    rust::sgrproj_box_ab_internal(
      2,
      &mut af_ref,
      &mut bf_ref,
      iimg,
      iimg_sq,
      iimg_stride,
      0,
      y,
      stripe_w,
      s,
      bdm8,
    );
    assert_eq!(&af[..stripe_w + 2], &af_ref[..]);
    assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]);
  }
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_f_r0_8_avx2<T: Pixel>(
  f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice<T>,
) {
  // r == 0 performs no box filtering: load 8 pixels (u8 or u16), widen to
  // 32 bits, and scale the source up to SGRPROJ_RST_BITS precision
  _mm256_storeu_si256(
    f.as_mut_ptr().add(x) as *mut _,
    _mm256_slli_epi32(
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      },
      SGRPROJ_RST_BITS as i32,
    ),
  );
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_f_r0_avx2<T: Pixel>(
  f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice<T>,
) {
  for x in (0..w).step_by(8) {
    if x + 8 <= w {
      sgrproj_box_f_r0_8_avx2(f, x, y, cdeffed);
    } else {
      // finish using scalar
      rust::sgrproj_box_f_r0_internal(f, x, y, w, cdeffed);
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut f_ref: Vec<u32> = vec![0; w];
    rust::sgrproj_box_f_r0_internal(&mut f_ref, 0, y, w, cdeffed);
    assert_eq!(&f[..w], &f_ref[..]);
  }
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_f_r1_8_avx2<T: Pixel>(
  af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize,
  cdeffed: &PlaneSlice<T>,
) {
  let three = _mm256_set1_epi32(3);
  let four = _mm256_set1_epi32(4);
  let a0 = af[0].as_ptr();
  let a1 = af[1].as_ptr();
  let a2 = af[2].as_ptr();
  let b0 = bf[0].as_ptr();
  let b1 = bf[1].as_ptr();
  let b2 = bf[2].as_ptr();
  // Weighted 3x3 sum of the (a) coefficients: weight 3 on the four corners
  // and weight 4 on the center cross, 32 = 1 << 5 in total. madd_epi16
  // serves as a cheap 32-bit multiply; the summed values fit in the low
  // 16 bits of each lane.
  let a = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(a0.add(x) as *const _),
          _mm256_loadu_si256(a0.add(x + 2) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_loadu_si256(a2.add(x) as *const _),
          _mm256_loadu_si256(a2.add(x + 2) as *const _),
        ),
      ),
      three,
    ),
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(a1.add(x) as *const _),
          _mm256_loadu_si256(a0.add(x + 1) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_add_epi32(
            _mm256_loadu_si256(a1.add(x + 1) as *const _),
            _mm256_loadu_si256(a2.add(x + 1) as *const _),
          ),
          _mm256_loadu_si256(a1.add(x + 2) as *const _),
        ),
      ),
      four,
    ),
  );
  // The same 3/4-weighted sum of the (b) offsets; these can exceed 16
  // bits, so a full 32-bit multiply is required.
  let b = _mm256_add_epi32(
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(b0.add(x) as *const _),
          _mm256_loadu_si256(b0.add(x + 2) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_loadu_si256(b2.add(x) as *const _),
          _mm256_loadu_si256(b2.add(x + 2) as *const _),
        ),
      ),
      three,
    ),
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_add_epi32(
          _mm256_loadu_si256(b1.add(x) as *const _),
          _mm256_loadu_si256(b0.add(x + 1) as *const _),
        ),
        _mm256_add_epi32(
          _mm256_add_epi32(
            _mm256_loadu_si256(b1.add(x + 1) as *const _),
            _mm256_loadu_si256(b2.add(x + 1) as *const _),
          ),
          _mm256_loadu_si256(b1.add(x + 2) as *const _),
        ),
      ),
      four,
    ),
  );
  // v = a * src + b
  let v = _mm256_add_epi32(
    _mm256_madd_epi16(
      a,
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      },
    ),
    b,
  );
  // Divide by the total kernel weight (1 << 5) and convert from
  // Q(SGRPROJ_SGR_BITS) to Q(SGRPROJ_RST_BITS), with rounding
  const SHIFT: i32 = (5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32;
  _mm256_storeu_si256(
    f.as_mut_ptr().add(x) as *mut _,
    _mm256_srli_epi32(
      _mm256_add_epi32(v, _mm256_set1_epi32(1 << SHIFT >> 1)),
      SHIFT,
    ),
  );
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_f_r1_avx2<T: Pixel>(
  af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize,
  cdeffed: &PlaneSlice<T>,
) {
  for x in (0..w).step_by(8) {
    if x + 8 <= w {
      sgrproj_box_f_r1_8_avx2(af, bf, f, x, y, cdeffed);
    } else {
      // finish using scalar
      rust::sgrproj_box_f_r1_internal(af, bf, f, x, y, w, cdeffed);
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut f_ref: Vec<u32> = vec![0; w];
    rust::sgrproj_box_f_r1_internal(af, bf, &mut f_ref, 0, y, w, cdeffed);
    assert_eq!(&f[..w], &f_ref[..]);
  }
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn sgrproj_box_f_r2_8_avx2<T: Pixel>(
  af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
  x: usize, y: usize, cdeffed: &PlaneSlice<T>,
) {
  let five = _mm256_set1_epi32(5);
  let six = _mm256_set1_epi32(6);
  let a0 = af[0].as_ptr();
  let a2 = af[1].as_ptr();
  let b0 = bf[0].as_ptr();
  let b2 = bf[1].as_ptr();
  // Weighted horizontal sums: weight 5 at the edges and 6 in the middle,
  // 16 = 1 << 4 in total per (ab) row. f0 (row y) blends both rows for a
  // total weight of 1 << 5, while f1 (row y + 1) uses only the second row,
  // hence the smaller rounding shift for f1 below.
  let a = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_loadu_si256(a0.add(x) as *const _),
        _mm256_loadu_si256(a0.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_madd_epi16(_mm256_loadu_si256(a0.add(x + 1) as *const _), six),
  );
  let b = _mm256_add_epi32(
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_loadu_si256(b0.add(x) as *const _),
        _mm256_loadu_si256(b0.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_mullo_epi32(_mm256_loadu_si256(b0.add(x + 1) as *const _), six),
  );
  let ao = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(
        _mm256_loadu_si256(a2.add(x) as *const _),
        _mm256_loadu_si256(a2.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_madd_epi16(_mm256_loadu_si256(a2.add(x + 1) as *const _), six),
  );
  let bo = _mm256_add_epi32(
    _mm256_mullo_epi32(
      _mm256_add_epi32(
        _mm256_loadu_si256(b2.add(x) as *const _),
        _mm256_loadu_si256(b2.add(x + 2) as *const _),
      ),
      five,
    ),
    _mm256_mullo_epi32(_mm256_loadu_si256(b2.add(x + 1) as *const _), six),
  );
  // v = (a + ao) * src + (b + bo) for the even output row
  let v = _mm256_add_epi32(
    _mm256_madd_epi16(
      _mm256_add_epi32(a, ao),
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y).as_ptr() as *const _
        ))
      },
    ),
    _mm256_add_epi32(b, bo),
  );
  // vo = ao * src + bo for the odd output row
  let vo = _mm256_add_epi32(
    _mm256_madd_epi16(
      ao,
      if mem::size_of::<T>() == 1 {
        _mm256_cvtepu8_epi32(_mm_loadl_epi64(
          cdeffed.subslice(x, y + 1).as_ptr() as *const _,
        ))
      } else {
        _mm256_cvtepu16_epi32(_mm_loadu_si128(
          cdeffed.subslice(x, y + 1).as_ptr() as *const _,
        ))
      },
    ),
    bo,
  );
  const SHIFT: i32 = (5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32;
  _mm256_storeu_si256(
    f0.as_mut_ptr().add(x) as *mut _,
    _mm256_srli_epi32(
      _mm256_add_epi32(v, _mm256_set1_epi32(1 << SHIFT >> 1)),
      SHIFT,
    ),
  );
  const SHIFTO: i32 = (4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32;
  _mm256_storeu_si256(
    f1.as_mut_ptr().add(x) as *mut _,
    _mm256_srli_epi32(
      _mm256_add_epi32(vo, _mm256_set1_epi32(1 << SHIFTO >> 1)),
      SHIFTO,
    ),
  );
}

#[target_feature(enable = "avx2")]
pub(crate) unsafe fn sgrproj_box_f_r2_avx2<T: Pixel>(
  af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32],
  y: usize, w: usize, cdeffed: &PlaneSlice<T>,
) {
  for x in (0..w).step_by(8) {
    if x + 8 <= w {
      sgrproj_box_f_r2_8_avx2(af, bf, f0, f1, x, y, cdeffed);
    } else {
      // finish using scalar
      rust::sgrproj_box_f_r2_internal(af, bf, f0, f1, x, y, w, cdeffed);
    }
  }

  #[cfg(feature = "check_asm")]
  {
    let mut f0_ref: Vec<u32> = vec![0; w];
    let mut f1_ref: Vec<u32> = vec![0; w];
    rust::sgrproj_box_f_r2_internal(
      af,
      bf,
      &mut f0_ref,
      &mut f1_ref,
      0,
      y,
      w,
      cdeffed,
    );
    assert_eq!(&f0[..w], &f0_ref[..]);
    assert_eq!(&f1[..w], &f1_ref[..]);
  }
}