/*
 * Copyright (C) 2019-2021 Robin Gareus <robin@gareus.org>
 * Copyright (C) 2020 Ayan Shafqat <ayan.x.shafqat@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <math.h>
#include <stdint.h>

#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#define _HAVE_DSP_COMPUTE_PEAK

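/*
 * On macOS, use the Accelerate framework: vDSP_maxmgv scans n_samples
 * contiguous floats (stride 1) and writes the maximum magnitude, i.e.
 * max(|buf[i]|), which is then folded into the running peak.
 */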
static float
dsp_compute_peak (const float* buf, uint32_t n_samples, float current)
{
	float tmp = 0.0f;
	vDSP_maxmgv (buf, (vDSP_Stride)1, &tmp, n_samples);
	return fmaxf (current, tmp);
}

#elif (defined __aarch64__) || (defined __arm__)

#include <arm_acle.h>
#include <arm_neon.h>

#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
#define _HAVE_DSP_COMPUTE_PEAK

static float
dsp_compute_peak (const float* src, uint32_t nframes, float current)
{
	float32x4_t vc0;

	// Broadcast single value to all elements of the register
	vc0 = vdupq_n_f32 (current);

	// While pointer is not aligned, process one sample at a time
	while (!IS_ALIGNED_TO (src, sizeof (float32x4_t)) && (nframes > 0)) {
		float32x4_t x0;

		x0  = vld1q_dup_f32 (src);
		x0  = vabsq_f32 (x0);
		vc0 = vmaxq_f32 (vc0, x0);

		++src;
		--nframes;
	}

	// SIMD portion with aligned buffers
	do {
		while (nframes >= 8) {
			float32x4_t x0, x1;

			x0 = vld1q_f32 (src + 0);
			x1 = vld1q_f32 (src + 4);

			x0 = vabsq_f32 (x0);
			x1 = vabsq_f32 (x1);

			vc0 = vmaxq_f32 (vc0, x0);
			vc0 = vmaxq_f32 (vc0, x1);

			src += 8;
			nframes -= 8;
		}

		while (nframes >= 4) {
			float32x4_t x0;

			x0 = vld1q_f32 (src);

			x0  = vabsq_f32 (x0);
			vc0 = vmaxq_f32 (vc0, x0);

			src += 4;
			nframes -= 4;
		}

		while (nframes >= 2) {
			float32x2_t x0;
			float32x4_t y0;

			x0 = vld1_f32 (src);        // Load two elements
			x0 = vabs_f32 (x0);         // Compute ABS value
			y0 = vcombine_f32 (x0, x0); // Combine two vectors

			vc0 = vmaxq_f32 (vc0, y0);

			src += 2;
			nframes -= 2;
		}
	} while (0);

	// Do remaining samples one frame at a time
	while (nframes > 0) {
		float32x4_t x0;

		x0  = vld1q_dup_f32 (src);
		x0  = vabsq_f32 (x0);
		vc0 = vmaxq_f32 (vc0, x0);

		++src;
		--nframes;
	}

	// Compute the max in register
	do {
		float32x2_t vlo  = vget_low_f32 (vc0);
		float32x2_t vhi  = vget_high_f32 (vc0);
		float32x2_t max0 = vpmax_f32 (vlo, vhi);
		float32x2_t max1 = vpmax_f32 (max0, max0); // Max is now at max1[0]
		current          = vget_lane_f32 (max1, 0);
	} while (0);

	return current;
}

#undef IS_ALIGNED_TO

#elif (defined __x86_64__) || (defined __i386__) || (defined _M_X64) || (defined _M_IX86) // ARCH_X86

#include <immintrin.h>
#include <xmmintrin.h>

#ifdef __AVX__ // TODO runtime detect AVX
#define _HAVE_DSP_COMPUTE_PEAK

#warning using AVX, this limits available architectures

static float
dsp_compute_peak (const float* src, uint32_t n_samples, float current)
{
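	/* ABS_MASK is -0.0f in every lane, i.e. only the sign bit set.
	 * _mm256_andnot_ps (ABS_MASK, x) clears the sign bit of each element,
	 * which computes fabsf() for all eight floats at once. */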
	const __m256 ABS_MASK = _mm256_set1_ps (-0.0F);

	// Broadcast the current max value to all elements of the YMM register
	__m256 vmax = _mm256_broadcast_ss (&current);

	// Compute a single max of the unaligned portion until alignment is reached
	while ((((intptr_t)src) % 32 != 0) && n_samples > 0) {
		__m256 vsrc;

		// Broadcast the sample to all lanes; a plain 128-bit cast would leave
		// the upper lanes undefined and could corrupt the running maximum
		vsrc = _mm256_broadcast_ss (src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		++src;
		--n_samples;
	}

	// Process the aligned portion 16 samples at a time
	while (n_samples >= 16) {
#ifdef _WIN32
		_mm_prefetch (((char*)src + (16 * sizeof (float))), _MM_HINT_NTA);
#else
		__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
#endif
		__m256 vsrc1, vsrc2;
		vsrc1 = _mm256_load_ps (src + 0);
		vsrc2 = _mm256_load_ps (src + 8);

		vsrc1 = _mm256_andnot_ps (ABS_MASK, vsrc1);
		vsrc2 = _mm256_andnot_ps (ABS_MASK, vsrc2);

		vmax = _mm256_max_ps (vmax, vsrc1);
		vmax = _mm256_max_ps (vmax, vsrc2);

		src += 16;
		n_samples -= 16;
	}

	// Process the remaining samples 8 at a time
	while (n_samples >= 8) {
		__m256 vsrc;

		vsrc = _mm256_load_ps (src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		src += 8;
		n_samples -= 8;
	}

	// Process any remaining samples (fewer than 8) one at a time
	while (n_samples > 0) {
		__m256 vsrc;

		// Broadcast the sample to all lanes (see the note in the head loop)
		vsrc = _mm256_broadcast_ss (src);
		vsrc = _mm256_andnot_ps (ABS_MASK, vsrc);
		vmax = _mm256_max_ps (vmax, vsrc);

		++src;
		--n_samples;
	}

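	/* Horizontal reduction of vmax: the first shuffle/max pair folds
	 * neighbouring elements within each 128-bit lane, the second folds the
	 * remaining pairs, and the cross-lane permute folds the two 128-bit
	 * halves, leaving the overall maximum in every element (and thus in
	 * element 0). */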
	__m256 tmp;
	tmp  = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp  = _mm256_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm256_max_ps (tmp, vmax);
	tmp  = _mm256_permute2f128_ps (vmax, vmax, 1);
	vmax = _mm256_max_ps (tmp, vmax);

	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
	_mm256_zeroupper ();

#if defined(__GNUC__) && (__GNUC__ < 5)
	return *((float*)&vmax);
#elif defined(__GNUC__) && (__GNUC__ < 8)
	return vmax[0];
#else
	return _mm256_cvtss_f32 (vmax);
#endif
}

#elif defined __SSE2__
#define _HAVE_DSP_COMPUTE_PEAK

static float
dsp_compute_peak (const float* src, uint32_t n_samples, float current)
{
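	/* As in the AVX variant, ABS_MASK has only the sign bit set in each lane;
	 * _mm_andnot_ps (ABS_MASK, x) clears the sign bits, i.e. fabsf() on four
	 * floats at once. */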
	const __m128 ABS_MASK = _mm_set1_ps (-0.0F);

	__m128 vmax;
	__m128 temp;

	vmax = _mm_set1_ps (current);

	// Compute single max of unaligned portion until alignment is reached
	while (((intptr_t)src) % 16 != 0 && n_samples > 0) {
		temp = _mm_set1_ps (*src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		++src;
		--n_samples;
	}

	// Process 16 samples (four quads) at a time, prefetching ahead of the loads
	while (n_samples >= 16) {
#ifdef _WIN32
		_mm_prefetch (((char*)src + (16 * sizeof (float))), _MM_HINT_NTA);
#else
		__builtin_prefetch (src + (16 * sizeof (float)), 0, 0);
#endif
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		n_samples -= 16;
	}

	// Process the remaining aligned samples 4 at a time
	while (n_samples >= 4) {
		temp = _mm_load_ps (src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		src += 4;
		n_samples -= 4;
	}

	// Process the rest (fewer than 4 samples) one at a time
	while (n_samples > 0) {
		temp = _mm_set1_ps (*src);
		temp = _mm_andnot_ps (ABS_MASK, temp);
		vmax = _mm_max_ps (vmax, temp);
		++src;
		--n_samples;
	}

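	/* Horizontal reduction of vmax: the first shuffle/max pair compares
	 * neighbouring elements, the second compares the two remaining halves,
	 * leaving the overall maximum in every element (and thus in element 0). */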
	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (2, 3, 0, 1));
	vmax = _mm_max_ps (temp, vmax);
	temp = _mm_shuffle_ps (vmax, vmax, _MM_SHUFFLE (1, 0, 3, 2));
	vmax = _mm_max_ps (temp, vmax);

#if defined(__GNUC__) && (__GNUC__ < 5)
	return *((float*)&vmax);
#else
	return vmax[0];
#endif
}
#endif // SSE

#endif

#ifndef _HAVE_DSP_COMPUTE_PEAK
#warning dsp_compute_peak is not accelerated on this architecture

static float
dsp_compute_peak (const float* const buf, uint32_t n_samples, float current)
{
	for (uint32_t i = 0; i < n_samples; ++i) {
		const float x = fabsf (buf[i]);
		if (x > current) {
			current = x;
		}
	}
	return current;
}
#endif
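
#ifdef DSP_COMPUTE_PEAK_EXAMPLE
/* Minimal usage sketch, not part of the library proper: the
 * DSP_COMPUTE_PEAK_EXAMPLE guard and the buffer contents below are
 * illustrative assumptions, added only to show how dsp_compute_peak()
 * is meant to be called, with the running peak carried across buffers. */
#include <stdio.h>

int
main (void)
{
	const float block_a[6] = { 0.10f, -0.90f, 0.25f, -0.30f, 0.70f, -0.05f };
	const float block_b[4] = { 0.40f, -0.95f, 0.15f, 0.60f };

	float peak = 0.0f;
	peak = dsp_compute_peak (block_a, 6, peak); /* peak == 0.90f */
	peak = dsp_compute_peak (block_b, 4, peak); /* peak == 0.95f */

	printf ("peak = %f\n", peak);
	return 0;
}
#endif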