1 /*
2  * Ones' complement checksum test & benchmark
3  *
4  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5  * See https://llvm.org/LICENSE.txt for license information.
6  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7  */
8 
9 #define _GNU_SOURCE
10 #include <inttypes.h>
11 #include <stdbool.h>
12 #include <stdint.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <sys/mman.h>
17 #include <time.h>
18 #include <unistd.h>
19 #include "../include/networking.h"
20 
21 #if WANT_ASSERT
22 #undef NDEBUG
23 #include <assert.h>
24 #define Assert(exp) assert(exp)
25 #else
26 #define Assert(exp) (void) (exp)
27 #endif
28 
29 #ifdef __GNUC__
30 #define may_alias __attribute__((__may_alias__))
31 #else
32 #define may_alias
33 #endif
34 
35 #define CACHE_LINE 64
36 #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
37 
38 /* Reference implementation - do not modify! */
39 static uint16_t
checksum_simple(const void * ptr,uint32_t nbytes)40 checksum_simple(const void *ptr, uint32_t nbytes)
41 {
42     const uint16_t *may_alias hptr = ptr;
43     uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */
44 
45     /* Sum all halfwords, assume misaligned accesses are handled in HW */
46     for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
47     {
48 	sum += *hptr++;
49     }
50 
51     /* Add any trailing odd byte */
52     if ((nbytes & 0x01) != 0)
53     {
54 	sum += *(uint8_t *) hptr;
55     }
56 
57     /* Fold 64-bit sum to 32 bits */
58     sum = (sum & 0xffffffff) + (sum >> 32);
59     sum = (sum & 0xffffffff) + (sum >> 32);
60     Assert(sum == (uint32_t) sum);
61 
62     /* Fold 32-bit sum to 16 bits */
63     sum = (sum & 0xffff) + (sum >> 16);
64     sum = (sum & 0xffff) + (sum >> 16);
65     Assert(sum == (uint16_t) sum);
66 
67     return (uint16_t) sum;
68 }
69 
70 static struct
71 {
72     uint16_t (*cksum_fp)(const void *, uint32_t);
73     const char *name;
74 } implementations[] =
75 {
76     { checksum_simple, "simple"},
77     { __chksum, "scalar"},
78 #if __arm__
79     { __chksum_arm_simd, "simd" },
80 #elif __aarch64__
81     { __chksum_aarch64_simd, "simd" },
82 #endif
83     { NULL, NULL}
84 };
85 
86 static int
find_impl(const char * name)87 find_impl(const char *name)
88 {
89     for (int i = 0; implementations[i].name != NULL; i++)
90     {
91 	if (strcmp(implementations[i].name, name) == 0)
92 	{
93 	    return i;
94 	}
95     }
96     return -1;
97 }
98 
99 static uint16_t (*CKSUM_FP)(const void *, uint32_t);
100 static volatile uint16_t SINK;
101 
102 static bool
verify(const void * data,uint32_t offset,uint32_t size)103 verify(const void *data, uint32_t offset, uint32_t size)
104 {
105 
106     uint16_t csum_expected = checksum_simple(data, size);
107     uint16_t csum_actual = CKSUM_FP(data, size);
108     if (csum_actual != csum_expected)
109     {
110 	fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
111 		"actual %04x expected %04x (valid)",
112 		offset, size, csum_actual, csum_expected);
113 	if (size < 65536)
114 	{
115 	    /* Fatal error */
116 	    exit(EXIT_FAILURE);
117 	}
118 	/* Else some implementations only support sizes up to 2^16 */
119 	return false;
120     }
121     return true;
122 }
123 
124 static uint64_t
clock_get_ns(void)125 clock_get_ns(void)
126 {
127     struct timespec ts;
128     clock_gettime(CLOCK_MONOTONIC, &ts);
129     return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
130 }
131 
132 static void
benchmark(const uint8_t * base,size_t poolsize,uint32_t blksize,uint32_t numops,uint64_t cpufreq)133 benchmark(const uint8_t *base,
134 	  size_t poolsize,
135 	  uint32_t blksize,
136 	  uint32_t numops,
137 	  uint64_t cpufreq)
138 {
139     printf("%11u ", (unsigned int) blksize); fflush(stdout);
140 
141     uint64_t start = clock_get_ns();
142     for (uint32_t i = 0; i < numops; i ++)
143     {
144 	/* Read a random value from the pool */
145 	uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
146 	/* Generate a random starting address */
147 	const void *data = &base[random % (poolsize - blksize)];
148 	SINK = CKSUM_FP(data, blksize);
149     }
150     uint64_t end = clock_get_ns();
151 
152 #define MEGABYTE 1000000 /* Decimal megabyte (MB) */
153     uint64_t elapsed_ns = end - start;
154     uint64_t elapsed_ms = elapsed_ns / 1000000;
155     uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
156     uint64_t accbytes = (uint64_t) numops * blksize;
157     printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
158     unsigned int cyc_per_blk = cpufreq / blks_per_s;
159     printf("%11u ", cyc_per_blk);
160     if (blksize != 0)
161     {
162 	unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
163 	printf("%7u.%03u ",
164 		cyc_per_byte / 1000, cyc_per_byte % 1000);
165     }
166     printf("\n");
167 }
168 
main(int argc,char * argv[])169 int main(int argc, char *argv[])
170 {
171     int c;
172     bool DUMP = false;
173     uint32_t IMPL = 0;/* Simple implementation */
174     uint64_t CPUFREQ = 0;
175     uint32_t BLKSIZE = 0;
176     uint32_t NUMOPS = 1000000;
177     uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */
178 
179     setvbuf(stdout, NULL, _IOLBF, 160);
180     while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
181     {
182 	switch (c)
183 	{
184 	    case 'b' :
185 		{
186 		    int blksize = atoi(optarg);
187 		    if (blksize < 1 || blksize > POOLSIZE / 2)
188 		    {
189 			fprintf(stderr, "Invalid block size %d\n", blksize);
190 			exit(EXIT_FAILURE);
191 		    }
192 		    BLKSIZE = (unsigned) blksize;
193 		    break;
194 		}
195 	    case 'd' :
196 		DUMP = true;
197 		break;
198 	    case 'f' :
199 		{
200 		    int64_t cpufreq = atoll(optarg);
201 		    if (cpufreq < 1)
202 		    {
203 			fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
204 				cpufreq);
205 			exit(EXIT_FAILURE);
206 		    }
207 		    CPUFREQ = cpufreq;
208 		    break;
209 		}
210 	    case 'i' :
211 		{
212 		    int impl = find_impl(optarg);
213 		    if (impl < 0)
214 		    {
215 			fprintf(stderr, "Invalid implementation %s\n", optarg);
216 			goto usage;
217 		    }
218 		    IMPL = (unsigned) impl;
219 		    break;
220 		}
221 	    case 'n' :
222 		{
223 		    int numops = atoi(optarg);
224 		    if (numops < 1)
225 		    {
226 			fprintf(stderr, "Invalid number of operations %d\n", numops);
227 			exit(EXIT_FAILURE);
228 		    }
229 		    NUMOPS = (unsigned) numops;
230 		    break;
231 		}
232 	    case 'p' :
233 		{
234 		    int poolsize = atoi(optarg);
235 		    if (poolsize < 4096)
236 		    {
237 			fprintf(stderr, "Invalid pool size %d\n", poolsize);
238 			exit(EXIT_FAILURE);
239 		    }
240 		    char c = optarg[strlen(optarg) - 1];
241 		    if (c == 'M')
242 		    {
243 			POOLSIZE = (unsigned) poolsize * 1024 * 1024;
244 		    }
245 		    else if (c == 'K')
246 		    {
247 			POOLSIZE = (unsigned) poolsize * 1024;
248 		    }
249 		    else
250 		    {
251 			POOLSIZE = (unsigned) poolsize;
252 		    }
253 		    break;
254 		}
255 	    default :
256 usage :
257 		fprintf(stderr, "Usage: checksum <options>\n"
258 			"-b <blksize>    Block size\n"
259 			"-d              Dump first 96 bytes of data\n"
260 			"-f <cpufreq>    CPU frequency (Hz)\n"
261 			"-i <impl>       Implementation\n"
262 			"-n <numops>     Number of operations\n"
263 			"-p <poolsize>   Pool size (K or M suffix)\n"
264 		       );
265 		printf("Implementations:");
266 		for (int i = 0; implementations[i].name != NULL; i++)
267 		{
268 		    printf(" %s", implementations[i].name);
269 		}
270 		printf("\n");
271 		exit(EXIT_FAILURE);
272 	}
273     }
274     if (optind > argc)
275     {
276 	goto usage;
277     }
278 
279     CKSUM_FP = implementations[IMPL].cksum_fp;
280     POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
281     uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
282 			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
283     if (base == MAP_FAILED)
284     {
285 	perror("aligned_alloc"), exit(EXIT_FAILURE);
286     }
287     for (size_t i = 0; i < POOLSIZE / 4; i++)
288     {
289 	((uint32_t *) base)[i] = rand();
290     }
291 
292     printf("Implementation: %s\n", implementations[IMPL].name);
293     printf("numops %u, poolsize ", NUMOPS);
294     if (POOLSIZE % (1024 * 1024) == 0)
295     {
296 	printf("%uMiB", POOLSIZE / (1024 * 1024));
297     }
298     else if (POOLSIZE % 1024 == 0)
299     {
300 	printf("%uKiB", POOLSIZE / 1024);
301     }
302     else
303     {
304 	printf("%uB", POOLSIZE);
305     }
306     printf(", blocksize %u, CPU frequency %juMHz\n",
307 	   BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
308 #if WANT_ASSERT
309     printf("Warning: assertions are enabled\n");
310 #endif
311 
312     if (DUMP)
313     {
314 	/* Print out first 96 bytes of data for human debugging */
315 	for (int i = 0; i < 96; i++)
316 	{
317 	    if (i % 8 == 0)
318 		printf("%2u:", i);
319 	    printf(" %02x", base[i]);
320 	    if (i % 8 == 7)
321 		printf("\n");
322 	}
323     }
324 
325     /* Verify that chosen algorithm handles all combinations of offsets and sizes */
326     printf("Verifying..."); fflush(stdout);
327     bool success = true;
328     /* Check all (relevant) combinations of size and offset */
329     for (int size = 0; size <= 256; size++)
330     {
331 	for (int offset = 0; offset < 255; offset++)
332 	{
333 	    /* Check at start of mapped memory */
334 	    success &= verify(&base[offset], offset, size);
335 	    /* Check at end of mapped memory */
336 	    uint8_t *p = base + POOLSIZE - (size + offset);
337 	    success &= verify(p, (uintptr_t) p % 64, size);
338 	}
339     }
340     /* Check increasingly larger sizes */
341     for (size_t size = 1; size < POOLSIZE; size *= 2)
342     {
343 	success &= verify(base, 0, size);
344     }
345     /* Check the full size, this can detect accumulator overflows */
346     success &= verify(base, 0, POOLSIZE);
347     printf("%s\n", success ? "OK" : "failure");
348 
349     /* Print throughput in decimal megabyte (1000000B) per second */
350     if (CPUFREQ != 0)
351     {
352 	printf("%11s %11s %11s %11s\n",
353 	       "block size", "MB/s", "cycles/blk", "cycles/byte");
354     }
355     else
356     {
357 	printf("%11s %11s %11s %11s\n",
358 	       "block size", "MB/s", "ns/blk", "ns/byte");
359 	CPUFREQ = 1000000000;
360     }
361     if (BLKSIZE != 0)
362     {
363 	benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
364     }
365     else
366     {
367 	static const uint16_t sizes[] =
368 	    { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
369 	for (int i = 0; sizes[i] != 0; i++)
370 	{
371 	    uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
372 	    benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
373 	}
374     }
375 
376     if (munmap(base, POOLSIZE) != 0)
377     {
378 	perror("munmap"), exit(EXIT_FAILURE);
379     }
380 
381     return success ? EXIT_SUCCESS : EXIT_FAILURE;
382 }
383