1 /*
2 * Copyright(c) 2019 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #include "EbDefinitions.h"
13
14 #include <emmintrin.h>
15 #include <immintrin.h>
16
energy_computation_kernel_avx2(const int32_t * const in,__m256i * const sum256)17 static INLINE void energy_computation_kernel_avx2(const int32_t *const in, __m256i *const sum256) {
18 const __m256i zero = _mm256_setzero_si256();
19 const __m256i input = _mm256_loadu_si256((__m256i *)in);
20 const __m256i in_lo = _mm256_unpacklo_epi32(input, zero);
21 const __m256i in_hi = _mm256_unpackhi_epi32(input, zero);
22 const __m256i energy_lo = _mm256_mul_epi32(in_lo, in_lo);
23 const __m256i energy_hi = _mm256_mul_epi32(in_hi, in_hi);
24 *sum256 = _mm256_add_epi64(*sum256, energy_lo);
25 *sum256 = _mm256_add_epi64(*sum256, energy_hi);
26 }
27
hadd64_avx2(const __m256i sum256)28 static INLINE uint64_t hadd64_avx2(const __m256i sum256) {
29 const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
30 const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
31 const __m128i sum128 = _mm_add_epi64(sum256_lo, sum256_hi);
32 const __m128i sum128_hi = _mm_srli_si128(sum128, 8);
33 const __m128i sum = _mm_add_epi64(sum128, sum128_hi);
34
35 return _mm_extract_epi64(sum, 0);
36 }
37
energy_computation_avx2(const int32_t * const in,const uint32_t size)38 static INLINE uint64_t energy_computation_avx2(const int32_t *const in, const uint32_t size) {
39 const __m256i zero = _mm256_setzero_si256();
40 uint32_t i = 0;
41 __m256i sum = zero;
42
43 do {
44 energy_computation_kernel_avx2(in + i, &sum);
45 i += 8;
46 } while (i < size);
47
48 return hadd64_avx2(sum);
49 }
50
energy_computation_64_avx2(const int32_t * in,const uint32_t height)51 static INLINE uint64_t energy_computation_64_avx2(const int32_t *in, const uint32_t height) {
52 const __m256i zero = _mm256_setzero_si256();
53 uint32_t i = height;
54 __m256i sum = zero;
55
56 do {
57 energy_computation_kernel_avx2(in + 0 * 8, &sum);
58 energy_computation_kernel_avx2(in + 1 * 8, &sum);
59 energy_computation_kernel_avx2(in + 2 * 8, &sum);
60 energy_computation_kernel_avx2(in + 3 * 8, &sum);
61 in += 64;
62 } while (--i);
63
64 return hadd64_avx2(sum);
65 }
66
clean_256_bytes_avx2(int32_t * buf,const uint32_t height)67 static INLINE void clean_256_bytes_avx2(int32_t *buf, const uint32_t height) {
68 const __m256i zero = _mm256_setzero_si256();
69 uint32_t h = height;
70
71 do {
72 _mm256_storeu_si256((__m256i *)(buf + 0 * 8), zero);
73 _mm256_storeu_si256((__m256i *)(buf + 1 * 8), zero);
74 _mm256_storeu_si256((__m256i *)(buf + 2 * 8), zero);
75 _mm256_storeu_si256((__m256i *)(buf + 3 * 8), zero);
76 buf += 64;
77 } while (--h);
78 }
79
copy_32_bytes_avx2(const int32_t * src,int32_t * dst)80 static INLINE void copy_32_bytes_avx2(const int32_t *src, int32_t *dst) {
81 const __m256i val = _mm256_loadu_si256((__m256i *)(src + 0 * 8));
82 _mm256_storeu_si256((__m256i *)(dst + 0 * 8), val);
83 }
84
copy_256x_bytes_avx2(const int32_t * src,int32_t * dst,const uint32_t height)85 static INLINE void copy_256x_bytes_avx2(const int32_t *src, int32_t *dst, const uint32_t height) {
86 uint32_t h = height;
87
88 do {
89 copy_32_bytes_avx2(src + 0 * 8, dst + 0 * 8);
90 copy_32_bytes_avx2(src + 1 * 8, dst + 1 * 8);
91 copy_32_bytes_avx2(src + 2 * 8, dst + 2 * 8);
92 copy_32_bytes_avx2(src + 3 * 8, dst + 3 * 8);
93 src += 64;
94 dst += 32;
95 } while (--h);
96 }
97
uint64_t svt_handle_transform16x64_avx2(int32_t *output) {
    // The bottom 16x32 half of the 16x64 block is discarded: return its
    // energy (sum of squares) and zero it out.
    int32_t *const bottom            = output + 16 * 32;
    const uint64_t three_quad_energy = energy_computation_avx2(bottom, 16 * 32);

    memset(bottom, 0, 16 * 32 * sizeof(*output));

    return three_quad_energy;
}
107
uint64_t svt_handle_transform32x64_avx2(int32_t *output) {
    // The bottom 32x32 half of the 32x64 block is discarded: return its
    // energy (sum of squares) and zero it out.
    int32_t *const bottom            = output + 32 * 32;
    const uint64_t three_quad_energy = energy_computation_avx2(bottom, 32 * 32);

    memset(bottom, 0, 32 * 32 * sizeof(*output));

    return three_quad_energy;
}
117
uint64_t svt_handle_transform64x16_avx2(int32_t *output) {
    // The right 32x16 half of the 64x16 block is discarded: return its
    // energy, zero it, and repack the surviving left 32 coefficients of each
    // row into contiguous 32-wide rows.
    const uint64_t three_quad_energy = energy_computation_64_avx2(output + 32, 16);

    clean_256_bytes_avx2(output + 32, 16);

    // Row 0 is already in place; repack the remaining 15 rows.
    copy_256x_bytes_avx2(output + 64, output + 32, 15);

    return three_quad_energy;
}
130
uint64_t svt_handle_transform64x32_avx2(int32_t *output) {
    // The right 32x32 half of the 64x32 block is discarded: return its
    // energy, zero it, and repack the surviving left 32 coefficients of each
    // row into contiguous 32-wide rows.
    const uint64_t three_quad_energy = energy_computation_64_avx2(output + 32, 32);

    clean_256_bytes_avx2(output + 32, 32);

    // Row 0 is already in place; repack the remaining 31 rows.
    copy_256x_bytes_avx2(output + 64, output + 32, 31);

    return three_quad_energy;
}
143
uint64_t svt_handle_transform64x64_avx2(int32_t *output) {
    // Only the top-left 32x32 quadrant is kept: accumulate the energy of the
    // other three quadrants, zero them, then repack the kept coefficients.
    uint64_t three_quad_energy = energy_computation_64_avx2(output + 32, 32); // top-right 32x32
    three_quad_energy += energy_computation_avx2(output + 32 * 64, 64 * 32); // bottom 64x32

    // Zero top-right 32x32 area.
    clean_256_bytes_avx2(output + 32, 32);
    // Zero the bottom 64x32 area.
    memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));

    // Row 0 is already in place; repack the remaining 31 rows.
    copy_256x_bytes_avx2(output + 64, output + 32, 31);

    return three_quad_energy;
}
uint64_t handle_transform16x64_N2_N4_avx2(int32_t *output) {
    // No-op for the N2/N4 path: leaves `output` untouched and reports zero
    // discarded energy.
    (void)output; // unused
    return 0;
}
167
uint64_t handle_transform32x64_N2_N4_avx2(int32_t *output) {
    // No-op for the N2/N4 path: leaves `output` untouched and reports zero
    // discarded energy.
    (void)output; // unused
    return 0;
}
172
uint64_t handle_transform64x16_N2_N4_avx2(int32_t *output) {
    // N2/N4 path for 64x16: only repack — move the left 32 coefficients of
    // rows 1..15 into contiguous 32-wide rows (row 0 is already in place).
    copy_256x_bytes_avx2(output + 64, output + 32, 15);
    return 0; // no discarded energy is reported on this path
}
178
uint64_t handle_transform64x32_N2_N4_avx2(int32_t *output) {
    // N2/N4 path for 64x32: only repack — move the left 32 coefficients of
    // rows 1..31 into contiguous 32-wide rows (row 0 is already in place).
    copy_256x_bytes_avx2(output + 64, output + 32, 31);
    return 0; // no discarded energy is reported on this path
}
184
uint64_t handle_transform64x64_N2_N4_avx2(int32_t *output) {
    // N2/N4 path for 64x64: only repack — move the left 32 coefficients of
    // rows 1..31 into contiguous 32-wide rows (row 0 is already in place).
    copy_256x_bytes_avx2(output + 64, output + 32, 31);
    return 0; // no discarded energy is reported on this path
}
190