/*
* Copyright(c) 2019 Intel Corporation
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
*/

#include "EbDefinitions.h"

#include <emmintrin.h>
#include <immintrin.h>
#include <string.h> // for memset()

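// Square eight consecutive 32-bit coefficients and accumulate the results into
// the four 64-bit lanes of *sum256. The interleave with zero widens each
// coefficient into its own 64-bit lane; _mm256_mul_epi32 sign-extends the low
// dword of each lane before multiplying, so negative coefficients square
// correctly.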
static INLINE void energy_computation_kernel_avx2(const int32_t *const in, __m256i *const sum256) {
    const __m256i zero      = _mm256_setzero_si256();
    const __m256i input     = _mm256_loadu_si256((__m256i *)in);
    const __m256i in_lo     = _mm256_unpacklo_epi32(input, zero);
    const __m256i in_hi     = _mm256_unpackhi_epi32(input, zero);
    const __m256i energy_lo = _mm256_mul_epi32(in_lo, in_lo);
    const __m256i energy_hi = _mm256_mul_epi32(in_hi, in_hi);
    *sum256                 = _mm256_add_epi64(*sum256, energy_lo);
    *sum256                 = _mm256_add_epi64(*sum256, energy_hi);
}

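// Horizontally add the four 64-bit lanes of sum256 into a single scalar.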
static INLINE uint64_t hadd64_avx2(const __m256i sum256) {
    const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
    const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
    const __m128i sum128    = _mm_add_epi64(sum256_lo, sum256_hi);
    const __m128i sum128_hi = _mm_srli_si128(sum128, 8);
    const __m128i sum       = _mm_add_epi64(sum128, sum128_hi);

    return _mm_extract_epi64(sum, 0);
}

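// Sum of squares over a flat buffer. `size` must be a positive multiple of 8,
// since the do/while loop consumes 8 coefficients per iteration with no tail
// handling. A scalar sketch of the behavior (for clarity, not part of the build):
//     uint64_t energy = 0;
//     for (uint32_t i = 0; i < size; i++)
//         energy += (int64_t)in[i] * in[i];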
static INLINE uint64_t energy_computation_avx2(const int32_t *const in, const uint32_t size) {
    const __m256i zero = _mm256_setzero_si256();
    uint32_t      i    = 0;
    __m256i       sum  = zero;

    do {
        energy_computation_kernel_avx2(in + i, &sum);
        i += 8;
    } while (i < size);

    return hadd64_avx2(sum);
}

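// Sum of squares over the first 32 coefficients of each row while stepping a
// 64-coefficient row stride, i.e. over one 32-wide half of a 64-wide block,
// `height` rows tall.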
static INLINE uint64_t energy_computation_64_avx2(const int32_t *in, const uint32_t height) {
    const __m256i zero = _mm256_setzero_si256();
    uint32_t      i    = height;
    __m256i       sum  = zero;

    do {
        energy_computation_kernel_avx2(in + 0 * 8, &sum);
        energy_computation_kernel_avx2(in + 1 * 8, &sum);
        energy_computation_kernel_avx2(in + 2 * 8, &sum);
        energy_computation_kernel_avx2(in + 3 * 8, &sum);
        in += 64;
    } while (--i);

    return hadd64_avx2(sum);
}

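// Zero the first 32 int32_t (128 bytes) of each row while advancing by a
// 64-int32_t (256-byte) row stride, i.e. clear one 32-wide half of a 64-wide
// block, `height` rows tall.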
static INLINE void clean_256_bytes_avx2(int32_t *buf, const uint32_t height) {
    const __m256i zero = _mm256_setzero_si256();
    uint32_t      h    = height;

    do {
        _mm256_storeu_si256((__m256i *)(buf + 0 * 8), zero);
        _mm256_storeu_si256((__m256i *)(buf + 1 * 8), zero);
        _mm256_storeu_si256((__m256i *)(buf + 2 * 8), zero);
        _mm256_storeu_si256((__m256i *)(buf + 3 * 8), zero);
        buf += 64;
    } while (--h);
}

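// Copy one 32-byte (8 x int32_t) vector from src to dst.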
static INLINE void copy_32_bytes_avx2(const int32_t *src, int32_t *dst) {
    const __m256i val = _mm256_loadu_si256((__m256i *)(src + 0 * 8));
    _mm256_storeu_si256((__m256i *)(dst + 0 * 8), val);
}

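// Re-pack rows: copy 32 int32_t per row from a 64-coefficient row stride (src)
// into a contiguous 32-coefficient row stride (dst), compacting a 64-wide
// layout into a 32-wide one.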
static INLINE void copy_256x_bytes_avx2(const int32_t *src, int32_t *dst, const uint32_t height) {
    uint32_t h = height;

    do {
        copy_32_bytes_avx2(src + 0 * 8, dst + 0 * 8);
        copy_32_bytes_avx2(src + 1 * 8, dst + 1 * 8);
        copy_32_bytes_avx2(src + 2 * 8, dst + 2 * 8);
        copy_32_bytes_avx2(src + 3 * 8, dst + 3 * 8);
        src += 64;
        dst += 32;
    } while (--h);
}

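// AV1 transforms with a 64-sample dimension keep at most the low-frequency
// 32x32 (or 32-wide / 32-tall) portion of the coefficients. The handlers below
// zero the discarded high-frequency regions, compact the kept coefficients
// where the stride changes, and return the energy of the zeroed area
// (three_quad_energy) so the caller can account for it.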
uint64_t svt_handle_transform16x64_avx2(int32_t *output) {
    // Energy of the bottom 16x32 area.
    const uint64_t three_quad_energy = energy_computation_avx2(output + 16 * 32, 16 * 32);

    // Zero out the bottom 16x32 area.
    memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));

    return three_quad_energy;
}

uint64_t svt_handle_transform32x64_avx2(int32_t *output) {
    // Energy of the bottom 32x32 area.
    const uint64_t three_quad_energy = energy_computation_avx2(output + 32 * 32, 32 * 32);

    // Zero out the bottom 32x32 area.
    memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));

    return three_quad_energy;
}

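// For the 64-wide cases the kept left half is compacted from a 64-coefficient
// row stride down to 32. Row 0 is already in place at output[0..31], so only
// the remaining height - 1 rows are moved (hence the 15 / 31 row counts below).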
uint64_t svt_handle_transform64x16_avx2(int32_t *output) {
    // Energy of the top-right 32x16 area.
    const uint64_t three_quad_energy = energy_computation_64_avx2(output + 32, 16);

    // Zero out the right 32x16 area.
    clean_256_bytes_avx2(output + 32, 16);

    // Re-pack non-zero coeffs in the first 32x16 indices.
    copy_256x_bytes_avx2(output + 64, output + 32, 15);

    return three_quad_energy;
}

uint64_t svt_handle_transform64x32_avx2(int32_t *output) {
    // Energy of the top-right 32x32 area.
    const uint64_t three_quad_energy = energy_computation_64_avx2(output + 32, 32);

    // Zero out the right 32x32 area.
    clean_256_bytes_avx2(output + 32, 32);

    // Re-pack non-zero coeffs in the first 32x32 indices.
    copy_256x_bytes_avx2(output + 64, output + 32, 31);

    return three_quad_energy;
}

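// The 64x64 case discards three quadrants: the top-right 32x32 area and the
// entire bottom 64x32 half.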
uint64_t svt_handle_transform64x64_avx2(int32_t *output) {
    uint64_t three_quad_energy;

    // Energy of the top-right 32x32 area.
    three_quad_energy = energy_computation_64_avx2(output + 32, 32);
    // Energy of the bottom 64x32 area.
    three_quad_energy += energy_computation_avx2(output + 32 * 64, 64 * 32);

    // Zero out the top-right 32x32 area.
    clean_256_bytes_avx2(output + 32, 32);

    // Zero out the bottom 64x32 area.
    memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));

    // Re-pack non-zero coeffs in the first 32x32 indices.
    copy_256x_bytes_avx2(output + 64, output + 32, 31);

    return three_quad_energy;
}
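
// N2/N4 variants: in the partial (N/2, N/4) transform paths only the
// low-frequency coefficients are expected to be populated, so there is no
// discarded energy to report and these handlers return 0; the 64-wide shapes
// still need their kept coefficients re-packed into a 32-coefficient stride.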
uint64_t handle_transform16x64_N2_N4_avx2(int32_t *output) {
    (void)output;
    return 0;
}

uint64_t handle_transform32x64_N2_N4_avx2(int32_t *output) {
    (void)output;
    return 0;
}

uint64_t handle_transform64x16_N2_N4_avx2(int32_t *output) {
    // Re-pack non-zero coeffs in the first 32x16 indices.
    copy_256x_bytes_avx2(output + 64, output + 32, 15);
    return 0;
}

uint64_t handle_transform64x32_N2_N4_avx2(int32_t *output) {
    // Re-pack non-zero coeffs in the first 32x32 indices.
    copy_256x_bytes_avx2(output + 64, output + 32, 31);
    return 0;
}

uint64_t handle_transform64x64_N2_N4_avx2(int32_t *output) {
    // Re-pack non-zero coeffs in the first 32x32 indices.
    copy_256x_bytes_avx2(output + 64, output + 32, 31);
    return 0;
}