#include <stdio.h>
#include <errno.h>
#include <assert.h>
#include <stdlib.h>
#include <stddef.h>
#include <signal.h>
#include <inttypes.h>
#include <math.h>

#ifdef CONFIG_LIBAIO
#include <libaio.h>
#endif

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <linux/fs.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <sched.h>

#include "../arch/arch.h"
#include "../lib/types.h"
#include "../lib/roundup.h"
#include "../lib/rand.h"
#include "../minmax.h"
#include "../os/linux/io_uring.h"

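/*
 * Mirrors of the kernel's SQ and CQ ring headers. Each field points into
 * the rings mmap'ed from the io_uring fd, so loads and stores through
 * these pointers are shared with the kernel.
 */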
struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

#define DEPTH			128
#define BATCH_SUBMIT		32
#define BATCH_COMPLETE		32
#define BS			4096

#define MAX_FDS			16

static unsigned sq_ring_mask, cq_ring_mask;

struct file {
	unsigned long max_blocks;
	unsigned long max_size;
	unsigned long cur_off;
	unsigned pending_ios;
	int real_fd;
	int fixed_fd;
	int fileno;
};

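/*
 * fio-style logarithmic latency histogram: 2^PLAT_BITS (64) linear buckets
 * per power-of-two group, PLAT_GROUP_NR (29) groups, 1856 buckets total.
 * This bounds the relative bucketing error at roughly 1/64 (~1.56%).
 */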
#define PLAT_BITS		6
#define PLAT_VAL		(1 << PLAT_BITS)
#define PLAT_GROUP_NR		29
#define PLAT_NR			(PLAT_GROUP_NR * PLAT_VAL)

struct submitter {
	pthread_t thread;
	int ring_fd;
	int index;
	struct io_sq_ring sq_ring;
	struct io_uring_sqe *sqes;
	struct io_cq_ring cq_ring;
	int inflight;
	int tid;
	unsigned long reaps;
	unsigned long done;
	unsigned long calls;
	volatile int finish;

	__s32 *fds;

	struct taus258_state rand_state;

	unsigned long *clock_batch;
	int clock_index;
	unsigned long *plat;

#ifdef CONFIG_LIBAIO
	io_context_t aio_ctx;
#endif

	struct file files[MAX_FDS];
	unsigned nr_files;
	unsigned cur_file;
	struct iovec iovecs[];
};

static struct submitter *submitter;
static volatile int finish;
static int stats_running;
static unsigned long max_iops;

static int depth = DEPTH;
static int batch_submit = BATCH_SUBMIT;
static int batch_complete = BATCH_COMPLETE;
static int bs = BS;
static int polled = 1;		/* use IO polling */
static int fixedbufs = 1;	/* use fixed user buffers */
static int dma_map;		/* pre-map DMA buffers */
static int register_files = 1;	/* use fixed files */
static int buffered = 0;	/* use buffered IO, not O_DIRECT */
static int sq_thread_poll = 0;	/* use kernel submission/poller thread */
static int sq_thread_cpu = -1;	/* pin above thread to this CPU */
static int do_nop = 0;		/* no-op SQ ring commands */
static int nthreads = 1;
static int stats = 0;		/* generate IO stats */
static int aio = 0;		/* use libaio */
static int runtime = 0;		/* runtime */
static int random_io = 1;	/* random or sequential IO */

static unsigned long tsc_rate;

#define TSC_RATE_FILE	"tsc-rate"

static int vectored = 1;

static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
			80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
static int plist_len = 17;

#ifndef IORING_REGISTER_MAP_BUFFERS
#define IORING_REGISTER_MAP_BUFFERS	20
struct io_uring_map_buffers {
	__s32	fd;
	__u32	buf_start;
	__u32	buf_end;
	__u32	flags;
	__u64	rsvd[2];
};
#endif

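/*
 * Convert CPU clock cycles to nanoseconds: nsec = cycles * 10^9 / tsc_rate.
 * Without a known TSC rate the raw cycle count is returned unscaled.
 */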
static unsigned long cycles_to_nsec(unsigned long cycles)
{
	uint64_t val;

	if (!tsc_rate)
		return cycles;

	val = cycles * 1000000000ULL;
	return val / tsc_rate;
}

static unsigned long plat_idx_to_val(unsigned int idx)
{
	unsigned int error_bits;
	unsigned long k, base;

	assert(idx < PLAT_NR);

	/* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
	 * all bits of the sample as index */
	if (idx < (PLAT_VAL << 1))
		return cycles_to_nsec(idx);

	/* Find the group and compute the minimum value of that group */
	error_bits = (idx >> PLAT_BITS) - 1;
	base = ((unsigned long) 1) << (error_bits + PLAT_BITS);

	/* Find its bucket number of the group */
	k = idx % PLAT_VAL;

	/* Return the mean of the range of the bucket */
	return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits)));
}

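/*
 * Walk the histogram buckets in ascending order, accumulating counts until
 * the running sum crosses each requested percentile of the nr samples; the
 * bucket where the crossing happens supplies that percentile's value.
 */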
unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
				   unsigned long **output,
				   unsigned long *maxv, unsigned long *minv)
{
	unsigned long sum = 0;
	unsigned int len = plist_len, i, j = 0;
	unsigned long *ovals = NULL;
	bool is_last;

	*minv = -1UL;
	*maxv = 0;

	ovals = malloc(len * sizeof(*ovals));
	if (!ovals)
		return 0;

	/*
	 * Calculate bucket values, note down max and min values
	 */
	is_last = false;
	for (i = 0; i < PLAT_NR && !is_last; i++) {
		sum += io_u_plat[i];
		while (sum >= ((long double) plist[j] / 100.0 * nr)) {
			assert(plist[j] <= 100.0);

			ovals[j] = plat_idx_to_val(i);
			if (ovals[j] < *minv)
				*minv = ovals[j];
			if (ovals[j] > *maxv)
				*maxv = ovals[j];

			is_last = (j == len - 1) != 0;
			if (is_last)
				break;

			j++;
		}
	}

	if (!is_last)
		fprintf(stderr, "error calculating latency percentiles\n");

	*output = ovals;
	return len;
}

static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
				  unsigned int precision)
{
	unsigned int divisor, len, i, j = 0;
	unsigned long minv, maxv;
	unsigned long *ovals;
	int per_line, scale_down, time_width;
	bool is_last;
	char fmt[32];

	len = calc_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv);
	if (!len || !ovals)
		goto out;

	if (!tsc_rate) {
		scale_down = 0;
		divisor = 1;
		printf("    percentiles (tsc ticks):\n     |");
	} else if (minv > 2000 && maxv > 99999) {
		scale_down = 1;
		divisor = 1000;
		printf("    percentiles (usec):\n     |");
	} else {
		scale_down = 0;
		divisor = 1;
		printf("    percentiles (nsec):\n     |");
	}

	time_width = max(5, (int) (log10(maxv / divisor) + 1));
	snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
			precision, time_width);
	/* fmt will be something like " %5.2fth=[%4llu]%c" */
	per_line = (80 - 7) / (precision + 10 + time_width);

	for (j = 0; j < len; j++) {
		/* for formatting */
		if (j != 0 && (j % per_line) == 0)
			printf("     |");

		/* end of the list */
		is_last = (j == len - 1) != 0;

		for (i = 0; i < scale_down; i++)
			ovals[j] = (ovals[j] + 999) / 1000;

		printf(fmt, plist[j], ovals[j], is_last ? '\n' : ',');

		if (is_last)
			break;

		if ((j % per_line) == per_line - 1)	/* for formatting */
			printf("\n");
	}

out:
	free(ovals);
}

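/*
 * Map a raw sample to its histogram bucket. Example: val = 1000 has
 * msb = 9, so error_bits = 3, base = (3 + 1) << 6 = 256, and
 * offset = (1000 >> 3) & 63 = 61, giving idx = 317. plat_idx_to_val(317)
 * recovers ~1004, within the 2^3 = 8 cycle width of that bucket.
 */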
static unsigned int plat_val_to_idx(unsigned long val)
{
	unsigned int msb, error_bits, base, offset, idx;

	/* Find MSB starting from bit 0 */
	if (val == 0)
		msb = 0;
	else
		msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;

	/*
	 * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
	 * all bits of the sample as index
	 */
	if (msb <= PLAT_BITS)
		return val;

	/* Compute the number of error bits to discard */
	error_bits = msb - PLAT_BITS;

	/* Compute the number of buckets before the group */
	base = (error_bits + 1) << PLAT_BITS;

	/*
	 * Discard the error bits and apply the mask to find the
	 * index for the buckets in the group
	 */
	offset = (PLAT_VAL - 1) & (val >> error_bits);

	/* Make sure the index does not exceed (array size - 1) */
	idx = (base + offset) < (PLAT_NR - 1) ?
		(base + offset) : (PLAT_NR - 1);

	return idx;
}

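/*
 * Account nr completions against the batch timestamp stored at clock_index;
 * the cycles elapsed since submission select the histogram bucket.
 */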
static void add_stat(struct submitter *s, int clock_index, int nr)
{
#ifdef ARCH_HAVE_CPU_CLOCK
	unsigned long cycles;
	unsigned int pidx;

	if (!s->finish && clock_index) {
		cycles = get_cpu_clock();
		cycles -= s->clock_batch[clock_index];
		pidx = plat_val_to_idx(cycles);
		s->plat[pidx] += nr;
	}
#endif
}

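/*
 * IORING_REGISTER_MAP_BUFFERS pre-maps the registered buffers for DMA.
 * Note that this opcode comes from an experimental kernel patch set and,
 * as far as I know, is not part of mainline io_uring; hence the fallback
 * definition above.
 */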
static int io_uring_map_buffers(struct submitter *s)
{
	struct io_uring_map_buffers map = {
		.fd		= s->files[0].real_fd,
		.buf_end	= depth,
	};

	if (do_nop)
		return 0;
	if (s->nr_files > 1) {
		fprintf(stderr, "Can't map buffers with multiple files\n");
		return -1;
	}

	return syscall(__NR_io_uring_register, s->ring_fd,
			IORING_REGISTER_MAP_BUFFERS, &map, 1);
}

static int io_uring_register_buffers(struct submitter *s)
{
	if (do_nop)
		return 0;

	return syscall(__NR_io_uring_register, s->ring_fd,
			IORING_REGISTER_BUFFERS, s->iovecs, depth);
}

static int io_uring_register_files(struct submitter *s)
{
	int i;

	if (do_nop)
		return 0;

	s->fds = calloc(s->nr_files, sizeof(__s32));
	for (i = 0; i < s->nr_files; i++) {
		s->fds[i] = s->files[i].real_fd;
		s->files[i].fixed_fd = i;
	}

	return syscall(__NR_io_uring_register, s->ring_fd,
			IORING_REGISTER_FILES, s->fds, s->nr_files);
}

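/*
 * This tool issues the io_uring syscalls directly rather than going
 * through liburing, so ring setup, the mmap layout, and memory ordering
 * are all handled by hand below.
 */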
static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	/*
	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
	 * than that.
	 */
	p->flags |= IORING_SETUP_CQSIZE;
	p->cq_entries = entries;

	return syscall(__NR_io_uring_setup, entries, p);
}

static void io_uring_probe(int fd)
{
	struct io_uring_probe *p;
	int ret;

	p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	if (!p)
		return;

	memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	ret = syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE, p, 256);
	if (ret < 0)
		goto out;

	if (IORING_OP_READ > p->ops_len)
		goto out;

	if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED))
		vectored = 0;
out:
	free(p);
}

static int io_uring_enter(struct submitter *s, unsigned int to_submit,
			  unsigned int min_complete, unsigned int flags)
{
	return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete,
			flags, NULL, 0);
}

#ifndef CONFIG_HAVE_GETTID
static int gettid(void)
{
	return syscall(__NR_gettid);
}
#endif

static unsigned file_depth(struct submitter *s)
{
	return (depth + s->nr_files - 1) / s->nr_files;
}

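/*
 * Fill one SQE. Reads go through IORING_OP_READ_FIXED when buffers are
 * registered, plain IORING_OP_READ when the kernel probe reported support
 * for it, and IORING_OP_READV otherwise. The file number is stashed in the
 * low 32 bits of user_data, the stats clock index in the high 32 bits.
 */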
static void init_io(struct submitter *s, unsigned index)
{
	struct io_uring_sqe *sqe = &s->sqes[index];
	unsigned long offset;
	struct file *f;
	long r;

	if (do_nop) {
		sqe->opcode = IORING_OP_NOP;
		return;
	}

	if (s->nr_files == 1) {
		f = &s->files[0];
	} else {
		f = &s->files[s->cur_file];
		if (f->pending_ios >= file_depth(s)) {
			s->cur_file++;
			if (s->cur_file == s->nr_files)
				s->cur_file = 0;
			f = &s->files[s->cur_file];
		}
	}
	f->pending_ios++;

	if (random_io) {
		r = __rand64(&s->rand_state);
		offset = (r % (f->max_blocks - 1)) * bs;
	} else {
		offset = f->cur_off;
		f->cur_off += bs;
		if (f->cur_off + bs > f->max_size)
			f->cur_off = 0;
	}

	if (register_files) {
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->fd = f->fixed_fd;
	} else {
		sqe->flags = 0;
		sqe->fd = f->real_fd;
	}
	if (fixedbufs) {
		sqe->opcode = IORING_OP_READ_FIXED;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = bs;
		sqe->buf_index = index;
	} else if (!vectored) {
		sqe->opcode = IORING_OP_READ;
		sqe->addr = (unsigned long) s->iovecs[index].iov_base;
		sqe->len = bs;
		sqe->buf_index = 0;
	} else {
		sqe->opcode = IORING_OP_READV;
		sqe->addr = (unsigned long) &s->iovecs[index];
		sqe->len = 1;
		sqe->buf_index = 0;
	}
	sqe->ioprio = 0;
	sqe->off = offset;
	sqe->user_data = (unsigned long) f->fileno;
	if (stats && stats_running)
		sqe->user_data |= ((uint64_t)s->clock_index << 32);
}

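/*
 * Stage up to max_ios SQEs, stopping early if the SQ ring would wrap into
 * unconsumed entries. A single release store publishes the new tail so the
 * kernel observes fully initialized SQEs.
 */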
static int prep_more_ios_uring(struct submitter *s, int max_ios)
{
	struct io_sq_ring *ring = &s->sq_ring;
	unsigned index, tail, next_tail, prepped = 0;

	next_tail = tail = *ring->tail;
	do {
		next_tail++;
		if (next_tail == atomic_load_acquire(ring->head))
			break;

		index = tail & sq_ring_mask;
		init_io(s, index);
		ring->array[index] = index;
		prepped++;
		tail = next_tail;
	} while (prepped < max_ios);

	if (prepped)
		atomic_store_release(ring->tail, tail);
	return prepped;
}

static int get_file_size(struct file *f)
{
	struct stat st;

	if (fstat(f->real_fd, &st) < 0)
		return -1;
	if (S_ISBLK(st.st_mode)) {
		unsigned long long bytes;

		if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
			return -1;

		f->max_blocks = bytes / bs;
		f->max_size = bytes;
		return 0;
	} else if (S_ISREG(st.st_mode)) {
		f->max_blocks = st.st_size / bs;
		f->max_size = st.st_size;
		return 0;
	}

	return -1;
}

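/*
 * Drain the CQ ring. Completions sharing a clock index are batched into a
 * single add_stat() call, and a short read (res != bs) flags the submitter
 * to finish.
 */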
static int reap_events_uring(struct submitter *s)
{
	struct io_cq_ring *ring = &s->cq_ring;
	struct io_uring_cqe *cqe;
	unsigned head, reaped = 0;
	int last_idx = -1, stat_nr = 0;

	head = *ring->head;
	do {
		struct file *f;

		read_barrier();
		if (head == atomic_load_acquire(ring->tail))
			break;
		cqe = &ring->cqes[head & cq_ring_mask];
		if (!do_nop) {
			int fileno = cqe->user_data & 0xffffffff;

			f = &s->files[fileno];
			f->pending_ios--;
			if (cqe->res != bs) {
				printf("io: unexpected ret=%d\n", cqe->res);
				if (polled && cqe->res == -EOPNOTSUPP)
					printf("Your filesystem/driver/kernel doesn't support polled IO\n");
				return -1;
			}
		}
		if (stats) {
			int clock_index = cqe->user_data >> 32;

			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		head++;
	} while (1);

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	if (reaped) {
		s->inflight -= reaped;
		atomic_store_release(ring->head, head);
	}
	return reaped;
}

static int submitter_init(struct submitter *s)
{
	int i, nr_batch;

	s->tid = gettid();
	printf("submitter=%d, tid=%d\n", s->index, s->tid);

	__init_rand64(&s->rand_state, pthread_self());
	srand48(pthread_self());

	for (i = 0; i < MAX_FDS; i++)
		s->files[i].fileno = i;

	if (stats) {
		nr_batch = roundup_pow2(depth / batch_submit);
		if (nr_batch < 2)
			nr_batch = 2;
		s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
		s->clock_index = 1;

		s->plat = calloc(PLAT_NR, sizeof(unsigned long));
	} else {
		s->clock_batch = NULL;
		s->plat = NULL;
		nr_batch = 0;
	}

	return nr_batch;
}

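/*
 * Legacy libaio path, used for comparison when -a1 is given. It mirrors
 * the io_uring flow: prep a batch of iocbs, io_submit() them, then block
 * in io_getevents() for completions.
 */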
#ifdef CONFIG_LIBAIO
static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
{
	unsigned long offset, data;
	struct file *f;
	unsigned index;
	long r;

	index = 0;
	while (index < max_ios) {
		struct iocb *iocb = &iocbs[index];

		if (s->nr_files == 1) {
			f = &s->files[0];
		} else {
			f = &s->files[s->cur_file];
			if (f->pending_ios >= file_depth(s)) {
				s->cur_file++;
				if (s->cur_file == s->nr_files)
					s->cur_file = 0;
				f = &s->files[s->cur_file];
			}
		}
		f->pending_ios++;

		r = lrand48();
		offset = (r % (f->max_blocks - 1)) * bs;
		io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base,
				s->iovecs[index].iov_len, offset);

		data = f->fileno;
		if (stats && stats_running)
			data |= ((unsigned long) s->clock_index << 32);
		iocb->data = (void *) (uintptr_t) data;
		index++;
	}
	return index;
}

static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
{
	int last_idx = -1, stat_nr = 0;
	int reaped = 0;

	while (evs) {
		unsigned long data = (uintptr_t) events[reaped].data;
		struct file *f = &s->files[data & 0xffffffff];

		f->pending_ios--;
		if (events[reaped].res != bs) {
			printf("io: unexpected ret=%ld\n", events[reaped].res);
			return -1;
		}
		if (stats) {
			int clock_index = data >> 32;

			if (last_idx != clock_index) {
				if (last_idx != -1) {
					add_stat(s, last_idx, stat_nr);
					stat_nr = 0;
				}
				last_idx = clock_index;
			}
			stat_nr++;
		}
		reaped++;
		evs--;
	}

	if (stat_nr)
		add_stat(s, last_idx, stat_nr);

	s->inflight -= reaped;
	s->done += reaped;
	return reaped;
}

static void *submitter_aio_fn(void *data)
{
	struct submitter *s = data;
	int i, ret, prepped, nr_batch;
	struct iocb **iocbsptr;
	struct iocb *iocbs;
	struct io_event *events;

	nr_batch = submitter_init(s);

	iocbsptr = calloc(depth, sizeof(struct iocb *));
	iocbs = calloc(depth, sizeof(struct iocb));
	events = calloc(depth, sizeof(struct io_event));

	for (i = 0; i < depth; i++)
		iocbsptr[i] = &iocbs[i];

	prepped = 0;
	do {
		int to_wait, to_submit, to_prep;

		if (!prepped && s->inflight < depth) {
			to_prep = min(depth - s->inflight, batch_submit);
			prepped = prep_more_ios_aio(s, to_prep, iocbs);
#ifdef ARCH_HAVE_CPU_CLOCK
			if (prepped && stats) {
				s->clock_batch[s->clock_index] = get_cpu_clock();
				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
			}
#endif
		}
		s->inflight += prepped;
		to_submit = prepped;

		if (to_submit && (s->inflight + to_submit <= depth))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, batch_complete);

		ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
		s->calls++;
		if (ret < 0) {
			perror("io_submit");
			break;
		} else if (ret != to_submit) {
			printf("submitted %d, wanted %d\n", ret, to_submit);
			break;
		}
		prepped = 0;

		while (to_wait) {
			int r;

			s->calls++;
			r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL);
			if (r < 0) {
				perror("io_getevents");
				break;
			} else if (r != to_wait) {
				printf("r=%d, wait=%d\n", r, to_wait);
				break;
			}
			r = reap_events_aio(s, events, r);
			s->reaps += r;
			to_wait -= r;
		}
	} while (!s->finish);

	free(iocbsptr);
	free(iocbs);
	free(events);
	finish = 1;
	return NULL;
}
#endif

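/*
 * Per-thread io_uring submission loop: top up the ring in batches of
 * batch_submit, then reap either inline (IOPOLL) or via
 * IORING_ENTER_GETEVENTS, waiting for at most batch_complete completions.
 * With SQPOLL the kernel thread submits for us and io_uring_enter() is
 * only needed to wake it up.
 */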
static void *submitter_uring_fn(void *data)
{
	struct submitter *s = data;
	struct io_sq_ring *ring = &s->sq_ring;
	int ret, prepped, nr_batch;

	nr_batch = submitter_init(s);

	prepped = 0;
	do {
		int to_wait, to_submit, this_reap, to_prep;
		unsigned ring_flags = 0;

		if (!prepped && s->inflight < depth) {
			to_prep = min(depth - s->inflight, batch_submit);
			prepped = prep_more_ios_uring(s, to_prep);
#ifdef ARCH_HAVE_CPU_CLOCK
			if (prepped && stats) {
				s->clock_batch[s->clock_index] = get_cpu_clock();
				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
			}
#endif
		}
		s->inflight += prepped;
submit_more:
		to_submit = prepped;
submit:
		if (to_submit && (s->inflight + to_submit <= depth))
			to_wait = 0;
		else
			to_wait = min(s->inflight + to_submit, batch_complete);

		/*
		 * Only need to call io_uring_enter if we're not using SQ thread
		 * poll, or if IORING_SQ_NEED_WAKEUP is set.
		 */
		if (sq_thread_poll)
			ring_flags = atomic_load_acquire(ring->flags);
		if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) {
			unsigned flags = 0;

			if (to_wait)
				flags = IORING_ENTER_GETEVENTS;
			if (ring_flags & IORING_SQ_NEED_WAKEUP)
				flags |= IORING_ENTER_SQ_WAKEUP;
			ret = io_uring_enter(s, to_submit, to_wait, flags);
			s->calls++;
		} else {
			/* for SQPOLL, we submitted it all effectively */
			ret = to_submit;
		}

		/*
		 * For non SQ thread poll, we already got the events we needed
		 * through the io_uring_enter() above. For SQ thread poll, we
		 * need to loop here until we find enough events.
		 */
		this_reap = 0;
		do {
			int r;

			r = reap_events_uring(s);
			if (r == -1) {
				s->finish = 1;
				break;
			} else if (r > 0)
				this_reap += r;
		} while (sq_thread_poll && this_reap < to_wait);
		s->reaps += this_reap;

		if (ret >= 0) {
			if (!ret) {
				to_submit = 0;
				if (s->inflight)
					goto submit;
				continue;
			} else if (ret < to_submit) {
				int diff = to_submit - ret;

				s->done += ret;
				prepped -= diff;
				goto submit_more;
			}
			s->done += ret;
			prepped = 0;
			continue;
		} else if (ret < 0) {
			if (errno == EAGAIN) {
				if (s->finish)
					break;
				if (this_reap)
					goto submit;
				to_submit = 0;
				goto submit;
			}
			printf("io_submit: %s\n", strerror(errno));
			break;
		}
	} while (!s->finish);

	finish = 1;
	return NULL;
}

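/*
 * The submitters are laid out back to back in one allocation, but the
 * trailing flexible iovec array gives each element a runtime-dependent
 * size, so index manually instead of with pointer arithmetic on the type.
 */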
static struct submitter *get_submitter(int offset)
{
	void *ret;

	ret = submitter;
	if (offset)
		ret += offset * (sizeof(*submitter) + depth * sizeof(struct iovec));
	return ret;
}

static void do_finish(const char *reason)
{
	int j;

	printf("Exiting on %s\n", reason);
	for (j = 0; j < nthreads; j++) {
		struct submitter *s = get_submitter(j);
		s->finish = 1;
	}
	if (max_iops > 100000)
		printf("Maximum IOPS=%luK\n", max_iops / 1000);
	else if (max_iops)
		printf("Maximum IOPS=%lu\n", max_iops);
	finish = 1;
}

static void sig_int(int sig)
{
	do_finish("signal");
}

static void arm_sig_int(void)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_handler = sig_int;
	act.sa_flags = SA_RESTART;
	sigaction(SIGINT, &act, NULL);

	/* Windows uses SIGBREAK as a quit signal from other applications */
#ifdef WIN32
	sigaction(SIGBREAK, &act, NULL);
#endif
}

static int setup_aio(struct submitter *s)
{
#ifdef CONFIG_LIBAIO
	if (polled) {
		fprintf(stderr, "aio does not support polled IO\n");
		polled = 0;
	}
	if (sq_thread_poll) {
		fprintf(stderr, "aio does not support SQPOLL IO\n");
		sq_thread_poll = 0;
	}
	if (do_nop) {
		fprintf(stderr, "aio does not support nop IO\n");
		do_nop = 0;
	}
	if (fixedbufs || register_files) {
		fprintf(stderr, "aio does not support registered files or buffers\n");
		fixedbufs = register_files = 0;
	}

	return io_queue_init(depth, &s->aio_ctx);
#else
	fprintf(stderr, "Legacy AIO not available on this system/build\n");
	errno = EINVAL;
	return -1;
#endif
}

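/*
 * Create the ring and mmap its three regions: the SQ ring header/array,
 * the SQE array, and the CQ ring with its CQEs. Buffer and file
 * registration happen here too, before any IO is issued.
 */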
static int setup_ring(struct submitter *s)
{
	struct io_sq_ring *sring = &s->sq_ring;
	struct io_cq_ring *cring = &s->cq_ring;
	struct io_uring_params p;
	int ret, fd;
	void *ptr;

	memset(&p, 0, sizeof(p));

	if (polled && !do_nop)
		p.flags |= IORING_SETUP_IOPOLL;
	if (sq_thread_poll) {
		p.flags |= IORING_SETUP_SQPOLL;
		if (sq_thread_cpu != -1) {
			p.flags |= IORING_SETUP_SQ_AFF;
			p.sq_thread_cpu = sq_thread_cpu;
		}
	}

	fd = io_uring_setup(depth, &p);
	if (fd < 0) {
		perror("io_uring_setup");
		return 1;
	}
	s->ring_fd = fd;

	io_uring_probe(fd);

	if (fixedbufs) {
		struct rlimit rlim;

		rlim.rlim_cur = RLIM_INFINITY;
		rlim.rlim_max = RLIM_INFINITY;
		/* ignore potential error, not needed on newer kernels */
		setrlimit(RLIMIT_MEMLOCK, &rlim);

		ret = io_uring_register_buffers(s);
		if (ret < 0) {
			perror("io_uring_register_buffers");
			return 1;
		}

		if (dma_map) {
			ret = io_uring_map_buffers(s);
			if (ret < 0) {
				perror("io_uring_map_buffers");
				return 1;
			}
		}
	}

	if (register_files) {
		ret = io_uring_register_files(s);
		if (ret < 0) {
			perror("io_uring_register_files");
			return 1;
		}
	}

	ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
			IORING_OFF_SQ_RING);
	sring->head = ptr + p.sq_off.head;
	sring->tail = ptr + p.sq_off.tail;
	sring->ring_mask = ptr + p.sq_off.ring_mask;
	sring->ring_entries = ptr + p.sq_off.ring_entries;
	sring->flags = ptr + p.sq_off.flags;
	sring->array = ptr + p.sq_off.array;
	sq_ring_mask = *sring->ring_mask;

	s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
			IORING_OFF_SQES);

	ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
			IORING_OFF_CQ_RING);
	cring->head = ptr + p.cq_off.head;
	cring->tail = ptr + p.cq_off.tail;
	cring->ring_mask = ptr + p.cq_off.ring_mask;
	cring->ring_entries = ptr + p.cq_off.ring_entries;
	cring->cqes = ptr + p.cq_off.cqes;
	cq_ring_mask = *cring->ring_mask;
	return 0;
}

static void file_depths(char *buf)
{
	bool prev = false;
	char *p;
	int i, j;

	buf[0] = '\0';
	p = buf;
	for (j = 0; j < nthreads; j++) {
		struct submitter *s = get_submitter(j);

		for (i = 0; i < s->nr_files; i++) {
			struct file *f = &s->files[i];

			if (prev)
				p += sprintf(p, " %d", f->pending_ios);
			else
				p += sprintf(p, "%d", f->pending_ios);
			prev = true;
		}
	}
}

static void usage(char *argv, int status)
{
	char runtime_str[16];

	snprintf(runtime_str, sizeof(runtime_str), "%d", runtime);
	printf("%s [options] -- [filenames]\n"
		" -d <int>  : IO Depth, default %d\n"
		" -s <int>  : Batch submit, default %d\n"
		" -c <int>  : Batch complete, default %d\n"
		" -b <int>  : Block size, default %d\n"
		" -p <bool> : Polled IO, default %d\n"
		" -B <bool> : Fixed buffers, default %d\n"
		" -D <bool> : DMA map fixed buffers, default %d\n"
		" -F <bool> : Register files, default %d\n"
		" -n <int>  : Number of threads, default %d\n"
		" -O <bool> : Use O_DIRECT, default %d\n"
		" -N <bool> : Perform just no-op requests, default %d\n"
		" -t <bool> : Track IO latencies, default %d\n"
		" -T <int>  : TSC rate in HZ\n"
		" -r <int>  : Runtime in seconds, default %s\n"
		" -R <bool> : Use random IO, default %d\n"
		" -a <bool> : Use legacy aio, default %d\n",
		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio);
	exit(status);
}

static void read_tsc_rate(void)
{
	char buffer[32];
	int fd, ret;

	if (tsc_rate)
		return;

	fd = open(TSC_RATE_FILE, O_RDONLY);
	if (fd < 0)
		return;

	/* zero fill so strtoul() always sees a NUL-terminated string */
	memset(buffer, 0, sizeof(buffer));
	ret = read(fd, buffer, sizeof(buffer) - 1);
	if (ret < 0) {
		close(fd);
		return;
	}

	tsc_rate = strtoul(buffer, NULL, 10);
	printf("Using TSC rate %luHz\n", tsc_rate);
	close(fd);
}

static void write_tsc_rate(void)
{
	char buffer[32];
	struct stat sb;
	int fd, ret;

	if (!stat(TSC_RATE_FILE, &sb))
		return;

	fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644);
	if (fd < 0)
		return;

	memset(buffer, 0, sizeof(buffer));
	sprintf(buffer, "%lu", tsc_rate);
	ret = write(fd, buffer, strlen(buffer));
	if (ret < 0)
		perror("write");
	close(fd);
}

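/*
 * Example invocation (illustrative binary and device path): polled,
 * fixed-buffer random reads at QD128 against a local NVMe namespace
 * for 10 seconds:
 *
 *	t/io_uring -p1 -B1 -d128 -r10 /dev/nvme0n1
 *
 * Multiple filenames spread the submitter threads across devices/files.
 */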
int main(int argc, char *argv[])
{
	struct submitter *s;
	unsigned long done, calls, reap;
	int err, i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
	struct file f;
	char *fdepths;
	void *ret;

	if (!do_nop && argc < 2)
		usage(argv[0], 1);

	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:h?")) != -1) {
		switch (opt) {
		case 'a':
			aio = !!atoi(optarg);
			break;
		case 'd':
			depth = atoi(optarg);
			break;
		case 's':
			batch_submit = atoi(optarg);
			if (!batch_submit)
				batch_submit = 1;
			break;
		case 'c':
			batch_complete = atoi(optarg);
			if (!batch_complete)
				batch_complete = 1;
			break;
		case 'b':
			bs = atoi(optarg);
			break;
		case 'p':
			polled = !!atoi(optarg);
			break;
		case 'B':
			fixedbufs = !!atoi(optarg);
			break;
		case 'F':
			register_files = !!atoi(optarg);
			break;
		case 'n':
			nthreads = atoi(optarg);
			if (!nthreads) {
				printf("Threads must be non-zero\n");
				usage(argv[0], 1);
			}
			break;
		case 'N':
			do_nop = !!atoi(optarg);
			break;
		case 'O':
			buffered = !atoi(optarg);
			break;
		case 't':
#ifndef ARCH_HAVE_CPU_CLOCK
			fprintf(stderr, "Stats not supported on this CPU\n");
			return 1;
#endif
			stats = !!atoi(optarg);
			break;
		case 'T':
#ifndef ARCH_HAVE_CPU_CLOCK
			fprintf(stderr, "Stats not supported on this CPU\n");
			return 1;
#endif
			tsc_rate = strtoul(optarg, NULL, 10);
			write_tsc_rate();
			break;
		case 'r':
			runtime = atoi(optarg);
			break;
		case 'D':
			dma_map = !!atoi(optarg);
			break;
		case 'R':
			random_io = !!atoi(optarg);
			break;
		case 'h':
		case '?':
		default:
			usage(argv[0], 0);
			break;
		}
	}

	if (stats)
		read_tsc_rate();

	if (batch_complete > depth)
		batch_complete = depth;
	if (batch_submit > depth)
		batch_submit = depth;
	if (!fixedbufs && dma_map)
		dma_map = 0;

	submitter = calloc(nthreads, sizeof(*submitter) +
				depth * sizeof(struct iovec));
	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		s->index = j;
		s->done = s->calls = s->reaps = 0;
	}

	flags = O_RDONLY | O_NOATIME;
	if (!buffered)
		flags |= O_DIRECT;

	j = 0;
	i = optind;
	nfiles = argc - i;
	if (!do_nop) {
		if (!nfiles) {
			printf("No files specified\n");
			usage(argv[0], 1);
		}
		threads_per_f = nthreads / nfiles;
		/* make sure each thread gets assigned files */
		if (threads_per_f == 0) {
			threads_per_f = 1;
		} else {
			threads_rem = nthreads - threads_per_f * nfiles;
		}
	}
	while (!do_nop && i < argc) {
		int k, limit;

		memset(&f, 0, sizeof(f));

		fd = open(argv[i], flags);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		f.real_fd = fd;
		if (get_file_size(&f)) {
			printf("failed getting size of device/file\n");
			return 1;
		}
		if (f.max_blocks <= 1) {
			printf("Zero file/device size?\n");
			return 1;
		}
		f.max_blocks--;

		limit = threads_per_f;
		limit += threads_rem > 0 ? 1 : 0;
		for (k = 0; k < limit; k++) {
			s = get_submitter((j + k) % nthreads);

			if (s->nr_files == MAX_FDS) {
				printf("Max number of files (%d) reached\n", MAX_FDS);
				break;
			}

			memcpy(&s->files[s->nr_files], &f, sizeof(f));

			printf("Added file %s (submitter %d)\n", argv[i], s->index);
			s->nr_files++;
		}
		threads_rem--;
		i++;
		j += limit;
	}

	arm_sig_int();

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		for (i = 0; i < depth; i++) {
			void *buf;

			if (posix_memalign(&buf, bs, bs)) {
				printf("failed alloc\n");
				return 1;
			}
			s->iovecs[i].iov_base = buf;
			s->iovecs[i].iov_len = bs;
		}
	}

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);

		if (!aio)
			err = setup_ring(s);
		else
			err = setup_aio(s);
		if (err) {
			printf("ring setup failed: %s, %d\n", strerror(errno), err);
			return 1;
		}
	}
	s = get_submitter(0);
	printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
	if (!aio)
		printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
	else
		printf("Engine=aio\n");

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		if (!aio)
			pthread_create(&s->thread, NULL, submitter_uring_fn, s);
#ifdef CONFIG_LIBAIO
		else
			pthread_create(&s->thread, NULL, submitter_aio_fn, s);
#endif
	}

	fdepths = malloc(8 * s->nr_files * nthreads);
	reap = calls = done = 0;
	do {
		unsigned long this_done = 0;
		unsigned long this_reap = 0;
		unsigned long this_call = 0;
		unsigned long rpc = 0, ipc = 0;
		unsigned long iops, bw;

		sleep(1);
		if (runtime && !--runtime)
			do_finish("timeout");

		/* don't print partial run, if interrupted by signal */
		if (finish)
			break;

		/* one second in to the run, enable stats */
		if (stats)
			stats_running = 1;

		for (j = 0; j < nthreads; j++) {
			s = get_submitter(j);
			this_done += s->done;
			this_call += s->calls;
			this_reap += s->reaps;
		}
		if (this_call - calls) {
			rpc = (this_done - done) / (this_call - calls);
			ipc = (this_reap - reap) / (this_call - calls);
		} else
			rpc = ipc = -1;
		file_depths(fdepths);
		iops = this_done - done;
		if (bs > 1048576)
			bw = iops * (bs / 1048576);
		else
			bw = iops / (1048576 / bs);
		if (iops > 100000)
			printf("IOPS=%luK, ", iops / 1000);
		else
			printf("IOPS=%lu, ", iops);
		max_iops = max(max_iops, iops);
		if (!do_nop)
			printf("BW=%luMiB/s, ", bw);
		printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths);
		done = this_done;
		calls = this_call;
		reap = this_reap;
	} while (!finish);

	for (j = 0; j < nthreads; j++) {
		s = get_submitter(j);
		pthread_join(s->thread, &ret);
		close(s->ring_fd);

		if (stats) {
			unsigned long nr;

			printf("%d: Latency percentiles:\n", s->tid);
			for (i = 0, nr = 0; i < PLAT_NR; i++)
				nr += s->plat[i];
			show_clat_percentiles(s->plat, nr, 4);
			free(s->clock_batch);
			free(s->plat);
		}
	}

	free(fdepths);
	free(submitter);
	return 0;
}