1 /*
2 * Copyright (C) 2019-2021 Robin Gareus <robin@gareus.org>
3 * Copyright (C) 2020 Ayan Shafqat <ayan.x.shafqat@gmail.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <math.h>
20 #include <stdint.h>
21
22 #ifdef __APPLE__
23 #include <Accelerate/Accelerate.h>
24 #define _HAVE_DSP_COMPUTE_PEAK
25
26 static float
dsp_compute_peak(const float * buf,uint32_t n_samples,float current)27 dsp_compute_peak (const float* buf, uint32_t n_samples, float current)
28 {
29 float tmp = 0.0f;
30 vDSP_maxmgv (buf, (vDSP_Stride)1, &tmp, n_samples);
31 return fmaxf (current, tmp);
32 }
33
34 #elif (defined __aarch64__) || (defined __arm__)
35
36 #include <arm_acle.h>
37 #include <arm_neon.h>
38
39 #define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
40 #define _HAVE_DSP_COMPUTE_PEAK
41
/*
 * NEON implementation: return max (current, max (fabsf (src[i]))) for
 * i in [0, nframes).
 *
 * Strategy: scalar-ish head until `src` is 16-byte aligned, then aligned
 * vector loads of 8/4/2 samples, a per-sample tail, and finally a
 * horizontal max reduction of the 4-lane accumulator.
 *
 * NOTE(review): unlike the other variants in this file this one is not
 * `static` — confirm whether external linkage is intentional.
 */
float
dsp_compute_peak (const float* src, uint32_t nframes, float current)
{
	float32x4_t vc0; /* running max, one candidate per lane */

	// Broadcast single value to all elements of the register
	vc0 = vdupq_n_f32 (current);

	// While pointer is not aligned, process one sample at a time
	while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) {
		float32x4_t x0;

		x0 = vld1q_dup_f32 (src); // duplicate *src into all 4 lanes
		x0 = vabsq_f32 (x0);
		vc0 = vmaxq_f32 (vc0, x0);

		++src;
		--nframes;
	}

	// SIMD portion with aligned buffers
	do {
		// 8 samples per iteration (two 4-lane loads)
		while (nframes >= 8) {
			float32x4_t x0, x1;

			x0 = vld1q_f32 (src + 0);
			x1 = vld1q_f32 (src + 4);

			x0 = vabsq_f32 (x0);
			x1 = vabsq_f32 (x1);

			vc0 = vmaxq_f32 (vc0, x0);
			vc0 = vmaxq_f32 (vc0, x1);

			src += 8;
			nframes -= 8;
		}

		// 4 samples at a time
		while (nframes >= 4) {
			float32x4_t x0;

			x0 = vld1q_f32 (src);

			x0 = vabsq_f32 (x0);
			vc0 = vmaxq_f32 (vc0, x0);

			src += 4;
			nframes -= 4;
		}

		// 2 samples: widen the 2-lane result by duplication so it can be
		// folded into the 4-lane accumulator (duplicates cannot change the max)
		while (nframes >= 2) {
			float32x2_t x0;
			float32x4_t y0;

			x0 = vld1_f32 (src);        // Load two elements
			x0 = vabs_f32 (x0);         // Compute ABS value
			y0 = vcombine_f32 (x0, x0); // Combine two vectors

			vc0 = vmaxq_f32 (vc0, y0);

			src += 2;
			nframes -= 2;
		}
	} while (0);

	// Do remaining samples one frame at a time
	while (nframes > 0) {
		float32x4_t x0;

		x0 = vld1q_dup_f32 (src); // duplicate *src into all 4 lanes
		x0 = vabsq_f32 (x0);
		vc0 = vmaxq_f32 (vc0, x0);

		++src;
		--nframes;
	}

	// Compute the max in register: pairwise-max twice reduces 4 lanes to 1
	do {
		float32x2_t vlo = vget_low_f32 (vc0);
		float32x2_t vhi = vget_high_f32 (vc0);
		float32x2_t max0 = vpmax_f32 (vlo, vhi);
		float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0]
		current = vget_lane_f32 (max1, 0);
	} while (0);

	return current;
}
130
131 #undef IS_ALIGNED_TO
132
133 #elif (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) // ARCH_X86
134
135 #include <immintrin.h>
136 #include <xmmintrin.h>
137
138 #ifdef __AVX__ // TODO runtime detect AVX
139 #define _HAVE_DSP_COMPUTE_PEAK
140
141 #warning using AVX, this limits available architectures
142
static float
dsp_compute_peak (const float* src, uint32_t n_samples, float current)
{
	/* AVX implementation: return max (current, max (fabsf (src[i]))) for
	 * i in [0, n_samples).
	 *
	 * Strategy: scalar head until `src` is 32-byte aligned, aligned AVX
	 * loads 16/8 samples at a time, scalar tail, then a horizontal max
	 * reduction of the YMM accumulator.
	 */

	/* Only the sign bit set; andnot clears it, i.e. computes fabsf. */
	const __m256 ABS_MASK = _mm256_set1_ps (-0.0F);

	/* Broadcast the current max value to all elements of the YMM register.
	 * (Was `¤t`, a mis-encoded `&current`, which did not compile.) */
	__m256 vmax = _mm256_broadcast_ss (&current);

	/* Compute single max of unaligned portion until alignment is reached.
	 * Broadcast the scalar to all lanes: _mm256_castps128_ps256 leaves the
	 * upper 128 bits undefined, so the previous cast-based load could fold
	 * garbage into the max. */
	while ((((intptr_t)src) % 32 != 0) && n_samples > 0) {
		__m256 vsrc;

		vsrc = _mm256_set1_ps (*src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		++src;
		--n_samples;
	}

	/* Process the aligned portion 16 samples at a time */
	while (n_samples >= 16) {
		/* Prefetch one cache line (64 bytes) ahead; both branches now use
		 * the same distance, and _MM_HINT_NTA (== 0) is valid C. */
#ifdef _WIN32
		_mm_prefetch (((char*)src + (16 * sizeof (float))), _MM_HINT_NTA);
#else
		__builtin_prefetch ((const char*)src + (16 * sizeof (float)), 0, 0);
#endif
		__m256 vsrc1, vsrc2;
		vsrc1 = _mm256_load_ps (src + 0);
		vsrc2 = _mm256_load_ps (src + 8);

		vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1);
		vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2);

		vmax = _mm256_max_ps (vmax, vsrc1);
		vmax = _mm256_max_ps (vmax, vsrc2);

		src += 16;
		n_samples -= 16;
	}

	/* Process the remaining samples 8 at a time */
	while (n_samples >= 8) {
		__m256 vsrc;

		vsrc = _mm256_load_ps (src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		src += 8;
		n_samples -= 8;
	}

	/* Remaining < 8 samples, one at a time (broadcast, see head loop). */
	while (n_samples > 0) {
		__m256 vsrc;

		vsrc = _mm256_set1_ps (*src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		++src;
		--n_samples;
	}

	/* Horizontal max: reduce within each 128-bit half, then across halves. */
	__m256 tmp;
	tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp = _mm256_permute2f128_ps (vmax, vmax, 1);
	vmax = _mm256_max_ps (tmp, vmax);

	/* zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions */
	_mm256_zeroupper ();

#if defined(__GNUC__) && (__GNUC__ < 5)
	return *((float*)&vmax);
#elif defined(__GNUC__) && (__GNUC__ < 8)
	return vmax[0];
#else
	return _mm256_cvtss_f32 (vmax);
#endif
}
229
230 #elif defined __SSE2__
231 #define _HAVE_DSP_COMPUTE_PEAK
232
static float
dsp_compute_peak (const float* src, uint32_t n_samples, float current)
{
	/* SSE implementation: return max (current, max (fabsf (src[i]))) for
	 * i in [0, n_samples).
	 *
	 * Strategy: scalar head until `src` is 16-byte aligned, a 4x-unrolled
	 * aligned loop (16 samples == 64 bytes == one cache line), a 4-sample
	 * loop, a scalar tail, then a horizontal max reduction.
	 */

	/* Only the sign bit set; andnot clears it, i.e. computes fabsf. */
	const __m128 ABS_MASK = _mm_set1_ps (-0.0F);

	__m128 vmax;
	__m128 temp;

	vmax = _mm_set1_ps (current);

	/* Compute single max of unaligned portion until alignment is reached */
	while (((intptr_t)src) % 16 != 0 && n_samples > 0) {
		temp = _mm_set1_ps (*src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		++src;
		--n_samples;
	}

	/* use 64 byte prefetch for quadruple quads; both branches now prefetch
	 * the same distance, and _MM_HINT_NTA (== 0) is valid C. */
	while (n_samples >= 16) {
#ifdef _WIN32
		_mm_prefetch (((char*)src + (16 * sizeof (float))), _MM_HINT_NTA);
#else
		__builtin_prefetch ((const char*)src + (16 * sizeof (float)), 0, 0);
#endif
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		n_samples -= 16;
	}

	/* Remaining aligned quads */
	while (n_samples >= 4) {
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		n_samples -= 4;
	}

	/* Remaining < 4 samples, one at a time */
	while (n_samples > 0) {
		temp = _mm_set1_ps (*src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		++src;
		--n_samples;
	}

	/* Horizontal max of the 4 lanes */
	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm_max_ps (temp, vmax);
	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm_max_ps (temp, vmax);

	/* _mm_cvtss_f32 extracts lane 0 portably (standard SSE intrinsic),
	 * replacing the GNU-only `vmax[0]` vector subscript. */
	return _mm_cvtss_f32 (vmax);
}
307 #endif // SSE
308
309 #endif
310
311 #ifndef _HAVE_DSP_COMPUTE_PEAK
312 #warning dsp_compute_peak is not accelerated on this architecture
313
static float
dsp_compute_peak (const float* const buf, uint32_t n_samples, float current)
{
	/* Portable scalar fallback: scan the buffer and keep the largest
	 * magnitude seen, seeded with the caller's running peak. */
	uint32_t i = 0;
	while (i < n_samples) {
		const float mag = fabsf (buf[i]);
		current = (mag > current) ? mag : current;
		++i;
	}
	return current;
}
325 #endif
326