#include <limits.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sched.h>
#include <time.h>
7 
/*
 * rdtscll - read the CPU time-stamp counter into the 64-bit lvalue `val`.
 *
 * RDTSC returns the counter split across EDX:EAX.  The two halves are
 * captured in separate 32-bit outputs and recombined, because the "=A"
 * constraint only denotes the EDX:EAX pair on 32-bit x86; on x86-64 it
 * selects a single register and half of the counter would be lost.
 */
#define rdtscll(val) \
	do { \
		unsigned int rdtsc_lo_, rdtsc_hi_; \
		__asm__ __volatile__("rdtsc" : "=a" (rdtsc_lo_), "=d" (rdtsc_hi_)); \
		(val) = ((unsigned long long)rdtsc_hi_ << 32) | rdtsc_lo_; \
	} while (0)
10 
/*
 * Branch-prediction hints built on GCC's __builtin_expect: likely(x) tells
 * the compiler that x is expected to equal 1, unlikely(x) that it is expected
 * to equal 0.  Both evaluate to the value of x itself.
 */
#define likely(x)       __builtin_expect((x),1)
#define unlikely(x)     __builtin_expect((x),0)
13 
/*
 * Signed sample types: s16 is used for the source and destination samples,
 * s32 for the wider sum buffer.  The names assume short is 16 bits and int
 * is 32 bits on the target.
 */
typedef short int s16;
typedef int s32;
16 
/* Build-time switch: change "#if 0" to "#if 1" to define CONFIG_SMP. */
#if 0
#define CONFIG_SMP
#endif

/*
 * LOCK_PREFIX is pasted in front of the read-modify-write instructions in
 * the inline assembly of this file.  It expands to the x86 "lock" prefix
 * only when CONFIG_SMP is defined; otherwise it is empty.
 */
#ifdef CONFIG_SMP
#define LOCK_PREFIX "lock ; "
#else
#define LOCK_PREFIX ""
#endif
26 
/*
 * Dummy aggregate used only to build the "m" operand of the cmpxchg asm:
 * __xg(x) casts an arbitrary pointer to it before dereferencing.
 * NOTE(review): presumably the oversized type keeps the compiler from making
 * assumptions about how much memory the instruction touches - confirm.
 */
struct __xchg_dummy { unsigned long a[100]; };
#define __xg(x) ((struct __xchg_dummy *)(x))
29 
/*
 * __cmpxchg - compare-and-exchange on a 1, 2 or 4 byte object.
 *
 * @ptr:  address of the object to operate on
 * @old:  value the object is expected to hold (loaded into the accumulator)
 * @new:  value to store if the object currently equals @old
 * @size: width of the object in bytes; only 1, 2 and 4 are handled
 *
 * Executes the x86 CMPXCHG instruction of the matching width, prefixed with
 * LOCK_PREFIX.  Returns the accumulator after the instruction, i.e. the value
 * that was found at @ptr; the exchange took place when that equals @old.
 * For any other @size nothing is executed and @old is returned.
 */
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
				      unsigned long new, int size)
{
	unsigned long prev;
	switch (size) {
	case 1:
		/* %b1 selects the byte sub-register of the "q" operand. */
		__asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
				     : "=a"(prev)
				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
				     : "memory");
		return prev;
	case 2:
		/* %w1 selects the 16-bit sub-register of the "q" operand. */
		__asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
				     : "=a"(prev)
				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
				     : "memory");
		return prev;
	case 4:
		__asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
				     : "=a"(prev)
				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
				     : "memory");
		return prev;
	}
	/* Unsupported width: report "unchanged" without touching memory. */
	return old;
}
56 
/*
 * cmpxchg - typed front end for __cmpxchg().
 *
 * The operand width is taken from sizeof(*(ptr)); o and n are converted to
 * unsigned long for the call and the returned previous value is cast back to
 * the type ptr points to.
 */
#define cmpxchg(ptr,o,n)\
	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
				       (unsigned long)(n),sizeof(*(ptr))))
60 
/*
 * atomic_add - add v to *dst with a single "addl" instruction.
 *
 * The instruction is prefixed with LOCK_PREFIX, so it carries the x86 "lock"
 * prefix only in CONFIG_SMP builds.  *dst is listed both as the memory
 * output and as a memory input because the instruction reads and writes it.
 */
static inline void atomic_add(volatile int *dst, int v)
{
	__asm__ __volatile__(
		LOCK_PREFIX "addl %1,%0"
		:"=m" (*dst)
		:"ir" (v), "m" (*dst));
}
68 
detect_cpu_clock()69 static double detect_cpu_clock()
70 {
71 	struct timespec tm_begin, tm_end;
72 	unsigned long long tsc_begin, tsc_end;
73 
74 	/* Warm cache */
75 	clock_gettime(CLOCK_MONOTONIC, &tm_begin);
76 
77 	rdtscll(tsc_begin);
78 	clock_gettime(CLOCK_MONOTONIC, &tm_begin);
79 
80 	usleep(1000000);
81 
82 	rdtscll(tsc_end);
83 	clock_gettime(CLOCK_MONOTONIC, &tm_end);
84 
85 	return (tsc_end - tsc_begin) / (tm_end.tv_sec - tm_begin.tv_sec + (tm_end.tv_nsec - tm_begin.tv_nsec) / 1e9);
86 }
87 
/*
 * mix_areas_srv - accumulate `size` source samples into the sum buffer.
 *
 * Each sample read from src is added to the current sum element with
 * atomic_add().  src_step and sum_step are distances in bytes between
 * consecutive elements; they are converted to element strides first.
 */
void mix_areas_srv(unsigned int size,
		   const s16 *src,
		   volatile s32 *sum,
		   unsigned int src_step, unsigned int sum_step)
{
	const unsigned int src_stride = src_step / sizeof(*src);
	const unsigned int sum_stride = sum_step / sizeof(*sum);
	unsigned int remaining;

	for (remaining = size; remaining > 0; remaining--) {
		atomic_add(sum, *src);
		src += src_stride;
		sum += sum_stride;
	}
}
101 
/*
 * saturate - copy `size` accumulated sums to the destination, clamping each
 * one to the range [-0x8000, 0x7fff] before it is stored.
 *
 * dst_step and sum_step are distances in bytes between consecutive
 * elements; they are converted to element strides first.
 */
void saturate(unsigned int size,
              s16 *dst, const s32 *sum,
              unsigned int dst_step, unsigned int sum_step)
{
	const unsigned int dst_stride = dst_step / sizeof(*dst);
	const unsigned int sum_stride = sum_step / sizeof(*sum);
	unsigned int i;

	for (i = 0; i < size; i++) {
		s32 clipped = *sum;

		/* Out-of-range values are flagged as the rare case. */
		if (unlikely(clipped < -0x8000))
			clipped = -0x8000;
		else if (unlikely(clipped > 0x7fff))
			clipped = 0x7fff;

		*dst = clipped;
		dst += dst_stride;
		sum += sum_stride;
	}
}
120 
/*
 * mix_areas0 - mix `size` source samples directly into the destination.
 *
 * For every element the destination sample and the source sample are added
 * in 32-bit arithmetic, the result is clamped to [-0x8000, 0x7fff] and
 * written back to the destination.  The sum pointer is advanced in step
 * with the others but never dereferenced.  All *_step arguments are
 * distances in bytes and are converted to element strides first.
 */
void mix_areas0(unsigned int size,
		volatile s16 *dst, s16 *src,
		volatile s32 *sum,
		unsigned int dst_step,
		unsigned int src_step,
		unsigned int sum_step)
{
	const unsigned int dst_stride = dst_step / sizeof(*dst);
	const unsigned int src_stride = src_step / sizeof(*src);
	const unsigned int sum_stride = sum_step / sizeof(*sum);
	unsigned int left;

	for (left = size; left > 0; left--) {
		s32 mixed = *dst + *src;

		if (unlikely(mixed < -0x8000))
			mixed = -0x8000;
		else if (unlikely(mixed > 0x7fff))
			mixed = 0x7fff;

		/* Exactly one store to the volatile destination per element. */
		*dst = mixed;

		dst += dst_stride;
		src += src_stride;
		sum += sum_stride;
	}
}
144 
/*
 * Configuration for the header included below.  The MIX_AREAS_* macros give
 * the names mix_areas1, mix_areas1_mmx, mix_areas1_32, mix_areas1_24 and
 * mix_areas1_24_cmov; XADD/XSUB give the add and subtract mnemonics.
 * NOTE(review): presumably the header defines one function per MIX_AREAS_*
 * name and emits XADD/XSUB in its inline assembly - confirm against
 * pcm_dmix_i386.h.
 */
#define MIX_AREAS_16 mix_areas1
#define MIX_AREAS_16_MMX mix_areas1_mmx
#define MIX_AREAS_32 mix_areas1_32
#define MIX_AREAS_24 mix_areas1_24
#define MIX_AREAS_24_CMOV mix_areas1_24_cmov
#define XADD "addl"
#define XSUB "subl"
#include "../src/pcm/pcm_dmix_i386.h"
/*
 * This file never calls the 32- and 24-bit variants; taking their addresses
 * in pointers marked unused references them without running them
 * (presumably to silence unused-function warnings).
 */
static void *ptr_mix_areas1_32 __attribute__((unused)) = &mix_areas1_32;
static void *ptr_mix_areas1_24 __attribute__((unused)) = &mix_areas1_24;
static void *ptr_mix_areas1_24_cmov __attribute__((unused)) = &mix_areas1_24_cmov;
156 
/*
 * mix_areas2 - mix `size` source samples into dst through the sum buffer,
 * written in C on top of cmpxchg() and atomic_add().
 *
 * @size:     number of samples to process
 * @dst:      saturated 16-bit output, advanced by dst_step bytes per sample
 * @src:      input samples, advanced by src_step bytes per sample
 * @sum:      32-bit accumulators, always advanced by one element per sample
 * @dst_step: distance in bytes between destination samples
 * @src_step: distance in bytes between source samples
 *
 * The statement order below is significant: dst and sum are volatile and the
 * code re-reads them to cope with concurrent modification.
 */
void mix_areas2(unsigned int size,
		volatile s16 *dst, const s16 *src,
		volatile s32 *sum,
		unsigned int dst_step,
		unsigned int src_step)
{
	/* Convert byte distances into element strides. */
	dst_step /= sizeof(*dst);
	src_step /= sizeof(*src);
	while (size-- > 0) {
		s32 sample = *src;
		s32 old_sample = *sum;
		/*
		 * If *dst was 0 it is swapped to 1 and the previously read sum
		 * is subtracted from the contribution, so the add below leaves
		 * the accumulator at the source value when nothing else changed
		 * it in between.
		 * NOTE(review): presumably a zero dst marks a slot nobody has
		 * mixed into yet - confirm.
		 */
		if (cmpxchg(dst, 0, 1) == 0)
			sample -= old_sample;
		atomic_add(sum, sample);
		/*
		 * Write the clamped sum to dst, then re-read *sum; repeat while
		 * it differs from the value that was just written out.
		 */
		do {
			sample = *sum;
			if (unlikely(sample < -0x8000))
				*dst = -0x8000;
			else if (unlikely(sample > 0x7fff))
				*dst = 0x7fff;
			else
				*dst = sample;
		} while (unlikely(sample != *sum));
		sum++;
		dst += dst_step;
		src += src_step;
	}
}
185 
/*
 * setscheduler - try to move the calling process to the SCHED_RR policy at
 * the maximum priority that policy allows.
 *
 * The outcome is only reported on stdout; failures are not propagated to the
 * caller.
 */
void setscheduler(void)
{
	struct sched_param params;
	int rc;

	rc = sched_getparam(0, &params);
	if (rc < 0) {
		printf("Scheduler getparam failed...\n");
		return;
	}

	params.sched_priority = sched_get_priority_max(SCHED_RR);

	rc = sched_setscheduler(0, SCHED_RR, &params);
	if (rc != 0) {
		printf("!!!Scheduler set to Round Robin with priority %i FAILED!!!\n", params.sched_priority);
		return;
	}

	printf("Scheduler set to Round Robin with priority %i...\n", params.sched_priority);
	fflush(stdout);
}
202 
/*
 * Size in bytes of the scratch buffer that init() allocates and writes
 * before every timed run (default 1 MiB).  main() replaces it with
 * argv[4] * 1024 when a fourth argument is given.
 */
int cache_size = 1024*1024;
204 
/*
 * init - reset the mixing buffers and churn a scratch buffer before a timed
 * run.
 *
 * @dst:  destination buffer, `size` elements, cleared to zero
 * @sum:  sum buffer, `size` elements, cleared to zero
 * @size: element count of both buffers; nothing is cleared when it is <= 0
 *
 * After clearing, a temporary buffer of cache_size bytes is allocated, every
 * byte of it is written three times and the buffer is released again.
 * NOTE(review): presumably this is meant to push dst/sum out of the CPU cache
 * so every run starts cold - confirm.  The scratch pointer is volatile so
 * the compiler cannot discard the stores as dead writes to memory that is
 * freed right afterwards.
 *
 * Terminates the program if the scratch buffer cannot be allocated.
 */
void init(s16 *dst, s32 *sum, int size)
{
	int count;
	volatile char *a;

	if (size > 0) {
		memset(sum, 0, (size_t)size * sizeof(*sum));
		memset(dst, 0, (size_t)size * sizeof(*dst));
	}

	/* Nothing to churn; also avoids a zero or negative allocation size. */
	if (cache_size <= 0)
		return;

	a = malloc(cache_size);
	if (a == NULL) {
		perror("malloc");
		exit(EXIT_FAILURE);
	}
	for (count = cache_size - 1; count >= 0; count--) {
		a[count] = count & 0xff;
		a[count] ^= 0x55;
		a[count] ^= 0xaa;
	}
	free((void *)a);
}
222 
main(int argc,char ** argv)223 int main(int argc, char **argv)
224 {
225 	int size = 2048, n = 4, max = 32267;
226 	int LOOP = 100;
227 	int i, t;
228 	unsigned long long begin, end, diff, diffS, diff0, diff1, diff1_mmx, diff2;
229         double cpu_clock = detect_cpu_clock();
230 	s16 *dst = malloc(sizeof(*dst) * size);
231 	s32 *sum = calloc(size, sizeof(*sum));
232 	s16 **srcs = malloc(sizeof(*srcs) * n);
233 
234 	setscheduler();
235 #ifndef CONFIG_SMP
236         printf("CPU clock: %fMhz (UP)\n\n", cpu_clock / 10e5);
237 #else
238         printf("CPU clock: %fMhz (SMP)\n\n", cpu_clock / 10e5);
239 #endif
240 	if (argc > 3) {
241 		size = atoi(argv[1]);
242 		n = atoi(argv[2]);
243 		max = atoi(argv[3]);
244 	}
245 	if (argc > 4)
246 		cache_size = atoi(argv[4]) * 1024;
247 	for (i = 0; i < n; i++) {
248 		int k;
249 		s16 *s;
250 		srcs[i] = s = malloc(sizeof(s16) * size);
251 		for (k = 0; k < size; ++k, ++s) {
252 			*s = (rand() % (max * 2)) - max;
253 		}
254 	}
255 
256 	for (t = 0, diffS = -1; t < LOOP; t++) {
257 		init(dst, sum, size);
258 		rdtscll(begin);
259 		for (i = 0; i < n; i++) {
260 			mix_areas_srv(size, srcs[i], sum, 2, 4);
261 		}
262 		saturate(size, dst, sum, 2, 4);
263 		rdtscll(end);
264 		diff = end - begin;
265 		if (diff < diffS)
266 			diffS = diff;
267 		printf("mix_areas_srv : %llu               \r", diff); fflush(stdout);
268 	}
269 
270 	for (t = 0, diff0 = -1; t < LOOP; t++) {
271 		init(dst, sum, size);
272 		rdtscll(begin);
273 		for (i = 0; i < n; i++) {
274 			mix_areas0(size, dst, srcs[i], sum, 2, 2, 4);
275 		}
276 		rdtscll(end);
277 		diff = end - begin;
278 		if (diff < diff0)
279 			diff0 = diff;
280 		printf("mix_areas0    : %llu               \r", diff); fflush(stdout);
281 	}
282 
283 	for (t = 0, diff1 = -1; t < LOOP; t++) {
284 		init(dst, sum, size);
285 		rdtscll(begin);
286 		for (i = 0; i < n; i++) {
287 			mix_areas1(size, dst, srcs[i], sum, 2, 2, 4);
288 		}
289 		rdtscll(end);
290 		diff = end - begin;
291 		if (diff < diff1)
292 			diff1 = diff;
293 		printf("mix_areas1    : %llu              \r", diff); fflush(stdout);
294 	}
295 
296 	for (t = 0, diff1_mmx = -1; t < LOOP; t++) {
297 		init(dst, sum, size);
298 		rdtscll(begin);
299 		for (i = 0; i < n; i++) {
300 			mix_areas1_mmx(size, dst, srcs[i], sum, 2, 2, 4);
301 		}
302 		rdtscll(end);
303 		diff = end - begin;
304 		if (diff < diff1_mmx)
305 			diff1_mmx = diff;
306 		printf("mix_areas1_mmx: %llu              \r", diff); fflush(stdout);
307 	}
308 
309 	for (t = 0, diff2 = -1; t < LOOP; t++) {
310 		init(dst, sum, size);
311 		rdtscll(begin);
312 		for (i = 0; i < n; i++) {
313 			mix_areas2(size, dst, srcs[i], sum, 2, 2);
314 		}
315 		rdtscll(end);
316 		diff = end - begin;
317 		if (diff < diff2)
318 			diff2 = diff;
319 		printf("mix_areas2    : %llu              \r", diff); fflush(stdout);
320 	}
321 
322 	printf("                                                                           \r");
323 	printf("Summary (the best times):\n");
324 	printf("mix_areas_srv  : %8llu %f%%\n", diffS, 100*2*44100.0*diffS/(size*n*cpu_clock));
325 	printf("mix_areas0     : %8llu %f%%\n", diff0, 100*2*44100.0*diff0/(size*n*cpu_clock));
326 	printf("mix_areas1     : %8llu %f%%\n", diff1, 100*2*44100.0*diff1/(size*n*cpu_clock));
327 	printf("mix_areas1_mmx : %8llu %f%%\n", diff1_mmx, 100*2*44100.0*diff1_mmx/(size*n*cpu_clock));
328 	printf("mix_areas2     : %8llu %f%%\n", diff2, 100*2*44100.0*diff2/(size*n*cpu_clock));
329 
330 	printf("\n");
331 	printf("areas1/srv ratio     : %f\n", (double)diff1 / diffS);
332 	printf("areas1_mmx/srv ratio : %f\n", (double)diff1_mmx / diffS);
333 
334 	return 0;
335 }
336