1 /*
2 * Copyright(c) 2019 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #ifndef EbPictureOperators_Inline_AVX2_h
13 #define EbPictureOperators_Inline_AVX2_h
14
15 #include <immintrin.h>
16 #include "EbDefinitions.h"
17 #include "EbMemory_AVX2.h"
18 #include "EbPictureOperators_SSE2.h"
19
20 #ifdef __cplusplus
21 extern "C" {
22 #endif
23
residual_kernel4_avx2(const uint8_t * input,const uint32_t input_stride,const uint8_t * pred,const uint32_t pred_stride,int16_t * residual,const uint32_t residual_stride,const uint32_t area_height)24 SIMD_INLINE void residual_kernel4_avx2(const uint8_t *input, const uint32_t input_stride,
25 const uint8_t *pred, const uint32_t pred_stride,
26 int16_t *residual, const uint32_t residual_stride,
27 const uint32_t area_height) {
28 const __m256i zero = _mm256_setzero_si256();
29 uint32_t y = area_height;
30
31 do {
32 const __m256i in = load_u8_4x4_avx2(input, input_stride);
33 const __m256i pr = load_u8_4x4_avx2(pred, pred_stride);
34 const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
35 const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
36 const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
37 const __m128i r0 = _mm256_castsi256_si128(re_lo);
38 const __m128i r1 = _mm256_extracti128_si256(re_lo, 1);
39
40 store_s16_4x2_sse2(r0, residual + 0 * residual_stride, residual_stride);
41 store_s16_4x2_sse2(r1, residual + 2 * residual_stride, residual_stride);
42
43 input += 4 * input_stride;
44 pred += 4 * pred_stride;
45 residual += 4 * residual_stride;
46 y -= 4;
47 } while (y);
48 }
49
residual_kernel8_avx2(const uint8_t * input,const uint32_t input_stride,const uint8_t * pred,const uint32_t pred_stride,int16_t * residual,const uint32_t residual_stride,const uint32_t area_height)50 SIMD_INLINE void residual_kernel8_avx2(const uint8_t *input, const uint32_t input_stride,
51 const uint8_t *pred, const uint32_t pred_stride,
52 int16_t *residual, const uint32_t residual_stride,
53 const uint32_t area_height) {
54 const __m256i zero = _mm256_setzero_si256();
55 uint32_t y = area_height;
56
57 do {
58 const __m256i in = load_u8_8x4_avx2(input, input_stride);
59 const __m256i pr = load_u8_8x4_avx2(pred, pred_stride);
60 const __m256i in_lo = _mm256_unpacklo_epi8(in, zero);
61 const __m256i in_hi = _mm256_unpackhi_epi8(in, zero);
62 const __m256i pr_lo = _mm256_unpacklo_epi8(pr, zero);
63 const __m256i pr_hi = _mm256_unpackhi_epi8(pr, zero);
64 const __m256i r0 = _mm256_sub_epi16(in_lo, pr_lo);
65 const __m256i r1 = _mm256_sub_epi16(in_hi, pr_hi);
66
67 storeu_s16_8x2_avx2(r0, residual + 0 * residual_stride, 2 * residual_stride);
68 storeu_s16_8x2_avx2(r1, residual + 1 * residual_stride, 2 * residual_stride);
69
70 input += 4 * input_stride;
71 pred += 4 * pred_stride;
72 residual += 4 * residual_stride;
73 y -= 4;
74 } while (y);
75 }
76
residual_kernel16_avx2(const uint8_t * input,const uint32_t input_stride,const uint8_t * pred,const uint32_t pred_stride,int16_t * residual,const uint32_t residual_stride,const uint32_t area_height)77 SIMD_INLINE void residual_kernel16_avx2(const uint8_t *input, const uint32_t input_stride,
78 const uint8_t *pred, const uint32_t pred_stride,
79 int16_t *residual, const uint32_t residual_stride,
80 const uint32_t area_height) {
81 const __m256i zero = _mm256_setzero_si256();
82 uint32_t y = area_height;
83
84 do {
85 const __m256i in0 = loadu_u8_16x2_avx2(input, input_stride);
86 const __m256i pr0 = loadu_u8_16x2_avx2(pred, pred_stride);
87 const __m256i in1 = _mm256_permute4x64_epi64(in0, 0xD8);
88 const __m256i pr1 = _mm256_permute4x64_epi64(pr0, 0xD8);
89 const __m256i in_lo = _mm256_unpacklo_epi8(in1, zero);
90 const __m256i in_hi = _mm256_unpackhi_epi8(in1, zero);
91 const __m256i pr_lo = _mm256_unpacklo_epi8(pr1, zero);
92 const __m256i pr_hi = _mm256_unpackhi_epi8(pr1, zero);
93 const __m256i re_lo = _mm256_sub_epi16(in_lo, pr_lo);
94 const __m256i re_hi = _mm256_sub_epi16(in_hi, pr_hi);
95
96 _mm256_storeu_si256((__m256i *)(residual + 0 * residual_stride), re_lo);
97 _mm256_storeu_si256((__m256i *)(residual + 1 * residual_stride), re_hi);
98 input += 2 * input_stride;
99 pred += 2 * pred_stride;
100 residual += 2 * residual_stride;
101 y -= 2;
102 } while (y);
103 }
104
distortion_avx2_intrin(const __m256i input,const __m256i recon,__m256i * const sum)105 static INLINE void distortion_avx2_intrin(const __m256i input, const __m256i recon,
106 __m256i *const sum) {
107 const __m256i in = _mm256_unpacklo_epi8(input, _mm256_setzero_si256());
108 const __m256i re = _mm256_unpacklo_epi8(recon, _mm256_setzero_si256());
109 const __m256i diff = _mm256_sub_epi16(in, re);
110 const __m256i dist = _mm256_madd_epi16(diff, diff);
111 *sum = _mm256_add_epi32(*sum, dist);
112 }
113
spatial_full_distortion_kernel16_avx2_intrin(const uint8_t * const input,const uint8_t * const recon,__m256i * const sum)114 static INLINE void spatial_full_distortion_kernel16_avx2_intrin(const uint8_t *const input,
115 const uint8_t *const recon,
116 __m256i *const sum) {
117 const __m128i in8 = _mm_loadu_si128((__m128i *)input);
118 const __m128i re8 = _mm_loadu_si128((__m128i *)recon);
119 const __m256i in16 = _mm256_cvtepu8_epi16(in8);
120 const __m256i re16 = _mm256_cvtepu8_epi16(re8);
121 const __m256i diff = _mm256_sub_epi16(in16, re16);
122 const __m256i dist = _mm256_madd_epi16(diff, diff);
123 *sum = _mm256_add_epi32(*sum, dist);
124 }
125
full_distortion_kernel4_avx2_intrin(const uint16_t * const input,const uint16_t * const recon,__m256i * const sum)126 static INLINE void full_distortion_kernel4_avx2_intrin(const uint16_t *const input,
127 const uint16_t *const recon,
128 __m256i *const sum) {
129 __m128i in = _mm_loadl_epi64((__m128i *)input);
130 __m128i re = _mm_loadl_epi64((__m128i *)recon);
131 __m128i max = _mm_max_epu16(in, re);
132 __m128i min = _mm_min_epu16(in, re);
133 __m128i diff = _mm_sub_epi16(max, min);
134 diff = _mm_madd_epi16(diff, diff);
135 __m256i zero = _mm256_setzero_si256();
136 zero = _mm256_inserti128_si256(zero, diff, 1);
137 *sum = _mm256_add_epi32(*sum, zero);
138 }
139
full_distortion_kernel16_avx2_intrin(__m256i in,__m256i re,__m256i * const sum)140 static INLINE void full_distortion_kernel16_avx2_intrin(__m256i in, __m256i re,
141 __m256i *const sum) {
142 __m256i max = _mm256_max_epu16(in, re);
143 __m256i min = _mm256_min_epu16(in, re);
144 __m256i diff = _mm256_sub_epi16(max, min);
145
146 diff = _mm256_madd_epi16(diff, diff);
147 *sum = _mm256_add_epi32(*sum, diff);
148 }
149
sum32_to64(__m256i * const sum32,__m256i * const sum64)150 static INLINE void sum32_to64(__m256i *const sum32, __m256i *const sum64) {
151 //Save partial sum into large 64bit register instead of 32 bit (which could overflow)
152 *sum64 = _mm256_add_epi64(*sum64, _mm256_unpacklo_epi32(*sum32, _mm256_setzero_si256()));
153 *sum64 = _mm256_add_epi64(*sum64, _mm256_unpackhi_epi32(*sum32, _mm256_setzero_si256()));
154 *sum32 = _mm256_setzero_si256();
155 }
156
spatial_full_distortion_kernel32_leftover_avx2_intrin(const uint8_t * const input,const uint8_t * const recon,__m256i * const sum0,__m256i * const sum1)157 static INLINE void spatial_full_distortion_kernel32_leftover_avx2_intrin(const uint8_t *const input,
158 const uint8_t *const recon,
159 __m256i *const sum0,
160 __m256i *const sum1) {
161 const __m256i in = _mm256_loadu_si256((__m256i *)input);
162 const __m256i re = _mm256_loadu_si256((__m256i *)recon);
163 const __m256i max = _mm256_max_epu8(in, re);
164 const __m256i min = _mm256_min_epu8(in, re);
165 const __m256i diff = _mm256_sub_epi8(max, min);
166 const __m256i diff_l = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
167 const __m256i diff_h = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
168 const __m256i dist_l = _mm256_madd_epi16(diff_l, diff_l);
169 const __m256i dist_h = _mm256_madd_epi16(diff_h, diff_h);
170 *sum0 = _mm256_add_epi32(*sum0, dist_l);
171 *sum1 = _mm256_add_epi32(*sum1, dist_h);
172 }
173
spatial_full_distortion_kernel32_avx2_intrin(const uint8_t * const input,const uint8_t * const recon,__m256i * const sum)174 static INLINE void spatial_full_distortion_kernel32_avx2_intrin(const uint8_t *const input,
175 const uint8_t *const recon,
176 __m256i *const sum) {
177 const __m256i in = _mm256_loadu_si256((__m256i *)input);
178 const __m256i re = _mm256_loadu_si256((__m256i *)recon);
179 const __m256i max = _mm256_max_epu8(in, re);
180 const __m256i min = _mm256_min_epu8(in, re);
181 const __m256i diff = _mm256_sub_epi8(max, min);
182 const __m256i diff_l = _mm256_unpacklo_epi8(diff, _mm256_setzero_si256());
183 const __m256i diff_h = _mm256_unpackhi_epi8(diff, _mm256_setzero_si256());
184 const __m256i dist_l = _mm256_madd_epi16(diff_l, diff_l);
185 const __m256i dist_h = _mm256_madd_epi16(diff_h, diff_h);
186 const __m256i dist = _mm256_add_epi32(dist_l, dist_h);
187 *sum = _mm256_add_epi32(*sum, dist);
188 }
189
hadd32_avx2_intrin(const __m256i src)190 static INLINE int32_t hadd32_avx2_intrin(const __m256i src) {
191 const __m128i src_l = _mm256_extracti128_si256(src, 0);
192 const __m128i src_h = _mm256_extracti128_si256(src, 1);
193 const __m128i sum = _mm_add_epi32(src_l, src_h);
194
195 return hadd32_sse2_intrin(sum);
196 }
197
198 #ifdef __cplusplus
199 }
200 #endif
201
202 #endif // EbPictureOperators_Inline_AVX2_h
203