1 /*
2  * Ones' complement checksum test & benchmark
3  *
4  * Copyright (c) 2016-2020, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #define _GNU_SOURCE
9 #include <inttypes.h>
10 #include <stdbool.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <sys/mman.h>
16 #include <time.h>
17 #include <unistd.h>
18 #include "../include/networking.h"
19 
20 #if WANT_ASSERT
21 #undef NDEBUG
22 #include <assert.h>
23 #define Assert(exp) assert(exp)
24 #else
25 #define Assert(exp) (void) (exp)
26 #endif
27 
28 #ifdef __GNUC__
29 #define may_alias __attribute__((__may_alias__))
30 #else
31 #define may_alias
32 #endif
33 
34 #define CACHE_LINE 64
35 #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
36 
37 /* Reference implementation - do not modify! */
38 static uint16_t
39 checksum_simple(const void *ptr, uint32_t nbytes)
40 {
41     const uint16_t *may_alias hptr = ptr;
42     uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */
43 
44     /* Sum all halfwords, assume misaligned accesses are handled in HW */
45     for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
46     {
47 	sum += *hptr++;
48     }
49 
50     /* Add any trailing odd byte */
51     if ((nbytes & 0x01) != 0)
52     {
53 	sum += *(uint8_t *) hptr;
54     }
55 
56     /* Fold 64-bit sum to 32 bits */
57     sum = (sum & 0xffffffff) + (sum >> 32);
58     sum = (sum & 0xffffffff) + (sum >> 32);
59     Assert(sum == (uint32_t) sum);
60 
61     /* Fold 32-bit sum to 16 bits */
62     sum = (sum & 0xffff) + (sum >> 16);
63     sum = (sum & 0xffff) + (sum >> 16);
64     Assert(sum == (uint16_t) sum);
65 
66     return (uint16_t) sum;
67 }
68 
69 static struct
70 {
71     uint16_t (*cksum_fp)(const void *, uint32_t);
72     const char *name;
73 } implementations[] =
74 {
75     { checksum_simple, "simple"},
76     { __chksum, "scalar"},
77 #if __arm__
78     { __chksum_arm_simd, "simd" },
79 #elif __aarch64__
80     { __chksum_aarch64_simd, "simd" },
81 #endif
82     { NULL, NULL}
83 };
84 
85 static int
86 find_impl(const char *name)
87 {
88     for (int i = 0; implementations[i].name != NULL; i++)
89     {
90 	if (strcmp(implementations[i].name, name) == 0)
91 	{
92 	    return i;
93 	}
94     }
95     return -1;
96 }
97 
98 static uint16_t (*CKSUM_FP)(const void *, uint32_t);
99 static volatile uint16_t SINK;
100 
101 static bool
102 verify(const void *data, uint32_t offset, uint32_t size)
103 {
104 
105     uint16_t csum_expected = checksum_simple(data, size);
106     uint16_t csum_actual = CKSUM_FP(data, size);
107     if (csum_actual != csum_expected)
108     {
109 	fprintf(stderr, "\nInvalid checksum for offset %u size %u: "
110 		"actual %04x expected %04x (valid)",
111 		offset, size, csum_actual, csum_expected);
112 	if (size < 65536)
113 	{
114 	    /* Fatal error */
115 	    exit(EXIT_FAILURE);
116 	}
117 	/* Else some implementations only support sizes up to 2^16 */
118 	return false;
119     }
120     return true;
121 }
122 
123 static uint64_t
124 clock_get_ns(void)
125 {
126     struct timespec ts;
127     clock_gettime(CLOCK_MONOTONIC, &ts);
128     return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
129 }
130 
131 static void
132 benchmark(const uint8_t *base,
133 	  size_t poolsize,
134 	  uint32_t blksize,
135 	  uint32_t numops,
136 	  uint64_t cpufreq)
137 {
138     printf("%11u ", (unsigned int) blksize); fflush(stdout);
139 
140     uint64_t start = clock_get_ns();
141     for (uint32_t i = 0; i < numops; i ++)
142     {
143 	/* Read a random value from the pool */
144 	uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
145 	/* Generate a random starting address */
146 	const void *data = &base[random % (poolsize - blksize)];
147 	SINK = CKSUM_FP(data, blksize);
148     }
149     uint64_t end = clock_get_ns();
150 
151 #define MEGABYTE 1000000 /* Decimal megabyte (MB) */
152     uint64_t elapsed_ns = end - start;
153     uint64_t elapsed_ms = elapsed_ns / 1000000;
154     uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
155     uint64_t accbytes = (uint64_t) numops * blksize;
156     printf("%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
157     unsigned int cyc_per_blk = cpufreq / blks_per_s;
158     printf("%11u ", cyc_per_blk);
159     if (blksize != 0)
160     {
161 	unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
162 	printf("%7u.%03u ",
163 		cyc_per_byte / 1000, cyc_per_byte % 1000);
164     }
165     printf("\n");
166 }
167 
168 int main(int argc, char *argv[])
169 {
170     int c;
171     bool DUMP = false;
172     uint32_t IMPL = 0;/* Simple implementation */
173     uint64_t CPUFREQ = 0;
174     uint32_t BLKSIZE = 0;
175     uint32_t NUMOPS = 1000000;
176     uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */
177 
178     setvbuf(stdout, NULL, _IOLBF, 160);
179     while ((c = getopt(argc, argv, "b:df:i:n:p:")) != -1)
180     {
181 	switch (c)
182 	{
183 	    case 'b' :
184 		{
185 		    int blksize = atoi(optarg);
186 		    if (blksize < 1 || blksize > POOLSIZE / 2)
187 		    {
188 			fprintf(stderr, "Invalid block size %d\n", blksize);
189 			exit(EXIT_FAILURE);
190 		    }
191 		    BLKSIZE = (unsigned) blksize;
192 		    break;
193 		}
194 	    case 'd' :
195 		DUMP = true;
196 		break;
197 	    case 'f' :
198 		{
199 		    int64_t cpufreq = atoll(optarg);
200 		    if (cpufreq < 1)
201 		    {
202 			fprintf(stderr, "Invalid CPU frequency %"PRId64"\n",
203 				cpufreq);
204 			exit(EXIT_FAILURE);
205 		    }
206 		    CPUFREQ = cpufreq;
207 		    break;
208 		}
209 	    case 'i' :
210 		{
211 		    int impl = find_impl(optarg);
212 		    if (impl < 0)
213 		    {
214 			fprintf(stderr, "Invalid implementation %s\n", optarg);
215 			goto usage;
216 		    }
217 		    IMPL = (unsigned) impl;
218 		    break;
219 		}
220 	    case 'n' :
221 		{
222 		    int numops = atoi(optarg);
223 		    if (numops < 1)
224 		    {
225 			fprintf(stderr, "Invalid number of operations %d\n", numops);
226 			exit(EXIT_FAILURE);
227 		    }
228 		    NUMOPS = (unsigned) numops;
229 		    break;
230 		}
231 	    case 'p' :
232 		{
233 		    int poolsize = atoi(optarg);
234 		    if (poolsize < 4096)
235 		    {
236 			fprintf(stderr, "Invalid pool size %d\n", poolsize);
237 			exit(EXIT_FAILURE);
238 		    }
239 		    char c = optarg[strlen(optarg) - 1];
240 		    if (c == 'M')
241 		    {
242 			POOLSIZE = (unsigned) poolsize * 1024 * 1024;
243 		    }
244 		    else if (c == 'K')
245 		    {
246 			POOLSIZE = (unsigned) poolsize * 1024;
247 		    }
248 		    else
249 		    {
250 			POOLSIZE = (unsigned) poolsize;
251 		    }
252 		    break;
253 		}
254 	    default :
255 usage :
256 		fprintf(stderr, "Usage: checksum <options>\n"
257 			"-b <blksize>    Block size\n"
258 			"-d              Dump first 96 bytes of data\n"
259 			"-f <cpufreq>    CPU frequency (Hz)\n"
260 			"-i <impl>       Implementation\n"
261 			"-n <numops>     Number of operations\n"
262 			"-p <poolsize>   Pool size (K or M suffix)\n"
263 		       );
264 		printf("Implementations:");
265 		for (int i = 0; implementations[i].name != NULL; i++)
266 		{
267 		    printf(" %s", implementations[i].name);
268 		}
269 		printf("\n");
270 		exit(EXIT_FAILURE);
271 	}
272     }
273     if (optind > argc)
274     {
275 	goto usage;
276     }
277 
278     CKSUM_FP = implementations[IMPL].cksum_fp;
279     POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
280     uint8_t *base = mmap(0, POOLSIZE, PROT_READ|PROT_WRITE,
281 			MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
282     if (base == MAP_FAILED)
283     {
284 	perror("aligned_alloc"), exit(EXIT_FAILURE);
285     }
286     for (size_t i = 0; i < POOLSIZE / 4; i++)
287     {
288 	((uint32_t *) base)[i] = rand();
289     }
290 
291     printf("Implementation: %s\n", implementations[IMPL].name);
292     printf("numops %u, poolsize ", NUMOPS);
293     if (POOLSIZE % (1024 * 1024) == 0)
294     {
295 	printf("%uMiB", POOLSIZE / (1024 * 1024));
296     }
297     else if (POOLSIZE % 1024 == 0)
298     {
299 	printf("%uKiB", POOLSIZE / 1024);
300     }
301     else
302     {
303 	printf("%uB", POOLSIZE);
304     }
305     printf(", blocksize %u, CPU frequency %juMHz\n",
306 	   BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
307 #if WANT_ASSERT
308     printf("Warning: assertions are enabled\n");
309 #endif
310 
311     if (DUMP)
312     {
313 	/* Print out first 96 bytes of data for human debugging */
314 	for (int i = 0; i < 96; i++)
315 	{
316 	    if (i % 8 == 0)
317 		printf("%2u:", i);
318 	    printf(" %02x", base[i]);
319 	    if (i % 8 == 7)
320 		printf("\n");
321 	}
322     }
323 
324     /* Verify that chosen algorithm handles all combinations of offsets and sizes */
325     printf("Verifying..."); fflush(stdout);
326     bool success = true;
327     /* Check all (relevant) combinations of size and offset */
328     for (int size = 0; size <= 256; size++)
329     {
330 	for (int offset = 0; offset < 255; offset++)
331 	{
332 	    /* Check at start of mapped memory */
333 	    success &= verify(&base[offset], offset, size);
334 	    /* Check at end of mapped memory */
335 	    uint8_t *p = base + POOLSIZE - (size + offset);
336 	    success &= verify(p, (uintptr_t) p % 64, size);
337 	}
338     }
339     /* Check increasingly larger sizes */
340     for (size_t size = 1; size < POOLSIZE; size *= 2)
341     {
342 	success &= verify(base, 0, size);
343     }
344     /* Check the full size, this can detect accumulator overflows */
345     success &= verify(base, 0, POOLSIZE);
346     printf("%s\n", success ? "OK" : "failure");
347 
348     /* Print throughput in decimal megabyte (1000000B) per second */
349     if (CPUFREQ != 0)
350     {
351 	printf("%11s %11s %11s %11s\n",
352 	       "block size", "MB/s", "cycles/blk", "cycles/byte");
353     }
354     else
355     {
356 	printf("%11s %11s %11s %11s\n",
357 	       "block size", "MB/s", "ns/blk", "ns/byte");
358 	CPUFREQ = 1000000000;
359     }
360     if (BLKSIZE != 0)
361     {
362 	benchmark(base, POOLSIZE, BLKSIZE, NUMOPS, CPUFREQ);
363     }
364     else
365     {
366 	static const uint16_t sizes[] =
367 	    { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
368 	for (int i = 0; sizes[i] != 0; i++)
369 	{
370 	    uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
371 	    benchmark(base, POOLSIZE, sizes[i], numops, CPUFREQ);
372 	}
373     }
374 
375     if (munmap(base, POOLSIZE) != 0)
376     {
377 	perror("munmap"), exit(EXIT_FAILURE);
378     }
379 
380     return success ? EXIT_SUCCESS : EXIT_FAILURE;
381 }
382