/*
* Copyright(c) 2019 Intel Corporation
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
*/

#ifndef EbPictureOperators_Inline_AVX2_h
#define EbPictureOperators_Inline_AVX2_h

#include <immintrin.h>
#include "EbDefinitions.h"
#include "EbMemory_AVX2.h"
#include "EbPictureOperators_SSE2.h"

#ifdef __cplusplus
extern "C" {
#endif

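/* Residual computation (input - pred) for a 4-pixel-wide block: each iteration
 * loads four 4-pixel rows, widens them to 16 bits, subtracts, and stores four
 * rows of int16_t residuals. The loop assumes area_height is a multiple of 4. */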
SIMD_INLINE void residual_kernel4_avx2(const uint8_t *input, const uint32_t input_stride,
                                       const uint8_t *pred, const uint32_t pred_stride,
                                       int16_t *residual, const uint32_t residual_stride,
                                       const uint32_t area_height) {
    const __m256i zero = _mm256_setzero_si256();
    uint32_t      y    = area_height;

    do {
        const __m256i in    = load_u8_4x4_avx2(input, input_stride);
        const __m256i pr    = load_u8_4x4_avx2(pred, pred_stride);
        const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
        const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
        const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
        const __m128i r0    = _mm256_castsi256_si128(re_lo);
        const __m128i r1    = _mm256_extracti128_si256(re_lo, 1);

        store_s16_4x2_sse2(r0, residual + 0 * residual_stride, residual_stride);
        store_s16_4x2_sse2(r1, residual + 2 * residual_stride, residual_stride);

        input += 4 * input_stride;
        pred += 4 * pred_stride;
        residual += 4 * residual_stride;
        y -= 4;
    } while (y);
}

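/* Residual computation for an 8-pixel-wide block: four rows are processed per
 * iteration, with the 8-bit samples widened to 16 bits before the subtraction.
 * The loop assumes area_height is a multiple of 4. */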
SIMD_INLINE void residual_kernel8_avx2(const uint8_t *input, const uint32_t input_stride,
                                       const uint8_t *pred, const uint32_t pred_stride,
                                       int16_t *residual, const uint32_t residual_stride,
                                       const uint32_t area_height) {
    const __m256i zero = _mm256_setzero_si256();
    uint32_t      y    = area_height;

    do {
        const __m256i in    = load_u8_8x4_avx2(input, input_stride);
        const __m256i pr    = load_u8_8x4_avx2(pred, pred_stride);
        const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
        const __m256i in_hi = _mm256_unpackhi_epi8(in, zero);
        const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
        const __m256i pr_hi = _mm256_unpackhi_epi8(pr, zero);
        const __m256i r0    = _mm256_sub_epi16(in_lo, pr_lo);
        const __m256i r1    = _mm256_sub_epi16(in_hi, pr_hi);

        storeu_s16_8x2_avx2(r0, residual + 0 * residual_stride, 2 * residual_stride);
        storeu_s16_8x2_avx2(r1, residual + 1 * residual_stride, 2 * residual_stride);

        input += 4 * input_stride;
        pred += 4 * pred_stride;
        residual += 4 * residual_stride;
        y -= 4;
    } while (y);
}

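/* Residual computation for a 16-pixel-wide block, two rows per iteration. The
 * 64-bit lane permute (0xD8) reorders the packed bytes so that the
 * unpacklo/unpackhi pair yields one contiguous 16-sample row per register.
 * The loop assumes area_height is a multiple of 2. */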
SIMD_INLINE void residual_kernel16_avx2(const uint8_t *input, const uint32_t input_stride,
                                        const uint8_t *pred, const uint32_t pred_stride,
                                        int16_t *residual, const uint32_t residual_stride,
                                        const uint32_t area_height) {
    const __m256i zero = _mm256_setzero_si256();
    uint32_t      y    = area_height;

    do {
        const __m256i in0   = loadu_u8_16x2_avx2(input, input_stride);
        const __m256i pr0   = loadu_u8_16x2_avx2(pred, pred_stride);
        const __m256i in1   = _mm256_permute4x64_epi64(in0, 0xD8);
        const __m256i pr1   = _mm256_permute4x64_epi64(pr0, 0xD8);
        const __m256i in_lo = _mm256_unpacklo_epi8(in1, zero);
        const __m256i in_hi = _mm256_unpackhi_epi8(in1, zero);
        const __m256i pr_lo = _mm256_unpacklo_epi8(pr1, zero);
        const __m256i pr_hi = _mm256_unpackhi_epi8(pr1, zero);
        const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
        const __m256i re_hi = _mm256_sub_epi16(in_hi, pr_hi);

        _mm256_storeu_si256((__m256i *)(residual + 0 * residual_stride), re_lo);
        _mm256_storeu_si256((__m256i *)(residual + 1 * residual_stride), re_hi);
        input += 2 * input_stride;
        pred += 2 * pred_stride;
        residual += 2 * residual_stride;
        y -= 2;
    } while (y);
}

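/* Accumulate the sum of squared differences between input and recon for the
 * low 8 bytes of each 128-bit lane (16 pixels total), adding 32-bit partial
 * sums into *sum. */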
static INLINE void distortion_avx2_intrin(const __m256i input, const __m256i recon,
                                          __m256i *const sum) {
    const __m256i in   = _mm256_unpacklo_epi8(input, _mm256_setzero_si256());
    const __m256i re   = _mm256_unpacklo_epi8(recon, _mm256_setzero_si256());
    const __m256i diff = _mm256_sub_epi16(in, re);
    const __m256i dist = _mm256_madd_epi16(diff, diff);
    *sum               = _mm256_add_epi32(*sum, dist);
}

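/* Sum of squared differences over 16 consecutive 8-bit pixels: the samples are
 * zero-extended to 16 bits, the differences squared and pair-wise summed with
 * madd, and the 32-bit partial sums accumulated into *sum. */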
static INLINE void spatial_full_distortion_kernel16_avx2_intrin(const uint8_t *const input,
                                                                const uint8_t *const recon,
                                                                __m256i *const       sum) {
    const __m128i in8  = _mm_loadu_si128((__m128i *)input);
    const __m128i re8  = _mm_loadu_si128((__m128i *)recon);
    const __m256i in16 = _mm256_cvtepu8_epi16(in8);
    const __m256i re16 = _mm256_cvtepu8_epi16(re8);
    const __m256i diff = _mm256_sub_epi16(in16, re16);
    const __m256i dist = _mm256_madd_epi16(diff, diff);
    *sum               = _mm256_add_epi32(*sum, dist);
}

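/* Sum of squared differences over 4 16-bit pixels: the max/min pair yields
 * |input - recon| without unsigned wrap-around; the squared sums are added
 * into the upper 128-bit lane of *sum. */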
static INLINE void full_distortion_kernel4_avx2_intrin(const uint16_t *const input,
                                                       const uint16_t *const recon,
                                                       __m256i *const        sum) {
    __m128i in   = _mm_loadl_epi64((__m128i *)input);
    __m128i re   = _mm_loadl_epi64((__m128i *)recon);
    __m128i max  = _mm_max_epu16(in, re);
    __m128i min  = _mm_min_epu16(in, re);
    __m128i diff = _mm_sub_epi16(max, min);
    diff         = _mm_madd_epi16(diff, diff);
    __m256i zero = _mm256_setzero_si256();
    zero         = _mm256_inserti128_si256(zero, diff, 1);
    *sum         = _mm256_add_epi32(*sum, zero);
}

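/* Sum of squared differences over 16 16-bit pixels already loaded into
 * registers, accumulated as 32-bit partial sums in *sum. */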
static INLINE void full_distortion_kernel16_avx2_intrin(__m256i in, __m256i re,
                                                        __m256i *const sum) {
    __m256i max  = _mm256_max_epu16(in, re);
    __m256i min  = _mm256_min_epu16(in, re);
    __m256i diff = _mm256_sub_epi16(max, min);

    diff = _mm256_madd_epi16(diff, diff);
    *sum = _mm256_add_epi32(*sum, diff);
}

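/* Fold the 32-bit partial sums into the 64-bit accumulator and reset the
 * 32-bit accumulator, so that long accumulations cannot overflow. */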
static INLINE void sum32_to64(__m256i *const sum32, __m256i *const sum64) {
    // Save the partial sums into a 64-bit accumulator instead of 32-bit, which could overflow.
    *sum64 = _mm256_add_epi64(*sum64, _mm256_unpacklo_epi32(*sum32, _mm256_setzero_si256()));
    *sum64 = _mm256_add_epi64(*sum64, _mm256_unpackhi_epi32(*sum32, _mm256_setzero_si256()));
    *sum32 = _mm256_setzero_si256();
}

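/* Sum of squared differences over 32 8-bit pixels, keeping the low- and
 * high-half contributions in two separate 32-bit accumulators (sum0/sum1)
 * for the leftover-width path. */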
static INLINE void spatial_full_distortion_kernel32_leftover_avx2_intrin(const uint8_t *const input,
                                                                         const uint8_t *const recon,
                                                                         __m256i *const       sum0,
                                                                         __m256i *const sum1) {
    const __m256i in     = _mm256_loadu_si256((__m256i *)input);
    const __m256i re     = _mm256_loadu_si256((__m256i *)recon);
    const __m256i max    = _mm256_max_epu8(in, re);
    const __m256i min    = _mm256_min_epu8(in, re);
    const __m256i diff   = _mm256_sub_epi8(max, min);
    const __m256i diff_l = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
    const __m256i diff_h = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
    const __m256i dist_l = _mm256_madd_epi16(diff_l, diff_l);
    const __m256i dist_h = _mm256_madd_epi16(diff_h, diff_h);
    *sum0                = _mm256_add_epi32(*sum0, dist_l);
    *sum1                = _mm256_add_epi32(*sum1, dist_h);
}

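/* Sum of squared differences over 32 8-bit pixels, accumulated into a single
 * set of 32-bit partial sums in *sum. */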
static INLINE void spatial_full_distortion_kernel32_avx2_intrin(const uint8_t *const input,
                                                                const uint8_t *const recon,
                                                                __m256i *const       sum) {
    const __m256i in     = _mm256_loadu_si256((__m256i *)input);
    const __m256i re     = _mm256_loadu_si256((__m256i *)recon);
    const __m256i max    = _mm256_max_epu8(in, re);
    const __m256i min    = _mm256_min_epu8(in, re);
    const __m256i diff   = _mm256_sub_epi8(max, min);
    const __m256i diff_l = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
    const __m256i diff_h = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
    const __m256i dist_l = _mm256_madd_epi16(diff_l, diff_l);
    const __m256i dist_h = _mm256_madd_epi16(diff_h, diff_h);
    const __m256i dist   = _mm256_add_epi32(dist_l, dist_h);
    *sum                 = _mm256_add_epi32(*sum, dist);
}

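/* Horizontally add the eight 32-bit lanes of src and return the scalar sum. */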
static INLINE int32_t hadd32_avx2_intrin(const __m256i src) {
    const __m128i src_l = _mm256_extracti128_si256(src, 0);
    const __m128i src_h = _mm256_extracti128_si256(src, 1);
    const __m128i sum   = _mm_add_epi32(src_l, src_h);

    return hadd32_sse2_intrin(sum);
}

#ifdef __cplusplus
}
#endif

#endif // EbPictureOperators_Inline_AVX2_h