/*
 * Copyright (c) 2012,2013,2018 Intel Corporation
 * Author: Andi Kleen
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that: (1) source code distributions
 * retain the above copyright notice and this paragraph in its entirety, (2)
 * distributions including binary code include the above copyright notice and
 * this paragraph in its entirety in the documentation or other materials
 * provided with the distribution
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#include "config.h"

/* Ring 3 RDPMC support */
#include <unistd.h>
#include <stdio.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/fcntl.h>
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/syscall.h>

#include <rdma/fi_errno.h>


/**
 * DOC: Ring 3 counting for CPU performance counters
 *
 * This library allows accessing CPU performance counters from ring 3
 * using the perf_events subsystem. This is useful to measure specific
 * parts of programs (e.g. excluding initialization code).
 *
 * Requires a Linux 3.3+ kernel.
 */

#include "linux/rdpmc.h"

typedef unsigned long long u64;
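
/*
 * On x86, loads are not reordered against other loads, so a plain
 * compiler barrier is sufficient to order the seqlock-style reads
 * in rdpmc_read() below.
 */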
#define rmb() asm volatile("" ::: "memory")

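/*
 * glibc does not provide a wrapper for the perf_event_open(2) system
 * call, so invoke it directly through syscall(2).
 */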
static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
			   int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/**
 * rdpmc_open - initialize a simple ring 3 readable performance counter
 * @counter: Raw event descriptor (UUEE UU unit mask EE event)
 * @ctx:     Pointer to struct &rdpmc_ctx that is initialized
 *
 * The counter will be set up to count CPU events excluding the kernel.
 * Must be called for each thread using the counter.
 * The caller must make sure the counter is suitable for the running CPU.
 * Only works on 3.3+ kernels.
 * Must be closed with rdpmc_close().
 */
int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx)
{
	struct perf_event_attr attr = {
		.type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE,
		.size = PERF_ATTR_SIZE_VER0,
		.config = counter,
		.sample_type = PERF_SAMPLE_READ,
		.exclude_kernel = 1,
	};
	return rdpmc_open_attr(&attr, ctx, NULL);
}
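
/*
 * Example (illustrative sketch, not part of the library): timing a code
 * region in cycles with the simple API above.  PERF_COUNT_HW_CPU_CYCLES
 * is 0, so rdpmc_open() treats it as a generic hardware event.
 *
 *	struct rdpmc_ctx ctx;
 *	unsigned long long start, cycles;
 *
 *	if (rdpmc_open(PERF_COUNT_HW_CPU_CYCLES, &ctx) == 0) {
 *		start = rdpmc_read(&ctx);
 *		... code under measurement ...
 *		cycles = rdpmc_read(&ctx) - start;
 *		rdpmc_close(&ctx);
 *	}
 */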

/**
 * rdpmc_open_attr - initialize a raw ring 3 readable performance counter
 * @attr: perf struct &perf_event_attr for the counter
 * @ctx:  Pointer to struct &rdpmc_ctx that is initialized.
 * @leader_ctx: context of group leader or NULL
 *
 * This allows more flexible setup with a custom &perf_event_attr.
 * For simple uses rdpmc_open() should be used instead.
 * Must be called for each thread using the counter.
 * Must be closed with rdpmc_close().
 */
int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx,
		    struct rdpmc_ctx *leader_ctx)
{
	ctx->fd = perf_event_open(attr, 0, -1,
			  leader_ctx ? leader_ctx->fd : -1, 0);
	if (ctx->fd < 0) {
		perror("perf_event_open");
		return -1;
	}
	ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED,
			ctx->fd, 0);
	if (ctx->buf == MAP_FAILED) {
		close(ctx->fd);
		perror("mmap on perf fd");
		return -1;
	}
	/*
	 * An index of 0 means the kernel did not make this event readable
	 * with RDPMC from user space (no hardware counter assigned or
	 * user-space rdpmc disabled), so fail early.
	 */
	if (ctx->buf->index == 0) {
		munmap(ctx->buf, sysconf(_SC_PAGESIZE));
		close(ctx->fd);
		return -1;
	}
	return 0;
}

/**
 * rdpmc_close - free a ring 3 readable performance counter
 * @ctx: Pointer to &rdpmc_ctx context.
 *
 * Must be called by each thread for each context it initialized.
 */
void rdpmc_close(struct rdpmc_ctx *ctx)
{
	close(ctx->fd);
	munmap(ctx->buf, sysconf(_SC_PAGESIZE));
}

/**
 * rdpmc_read - read a ring 3 readable performance counter
 * @ctx: Pointer to initialized &rdpmc_ctx structure.
 *
 * Read the current value of a running performance counter.
 * This should only be called from the same thread/process that opened
 * the context. For new threads please create a new context.
 */
unsigned long long rdpmc_read(struct rdpmc_ctx *ctx)
{
	u64 val;
	unsigned seq;
	u64 offset;
	typeof (ctx->buf) buf = ctx->buf;
	unsigned index;

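	/*
	 * Seqlock-style read: the kernel bumps buf->lock whenever it
	 * updates this page, so retry if it changed while we were
	 * reading index and offset.
	 */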
	do {
		seq = buf->lock;
		rmb();
		index = buf->index;
		offset = buf->offset;
		if (index == 0) /* rdpmc not allowed */
			return offset;
		val = __builtin_ia32_rdpmc(index - 1);
		rmb();
	} while (buf->lock != seq);
	return val + offset;
}

static uint64_t rdpmc_hw_id(uint32_t cntr_id)
{
	switch (cntr_id) {
	case OFI_PMC_CPU_CYCLES:
		return PERF_COUNT_HW_CPU_CYCLES;
	case OFI_PMC_CPU_INSTR:
		return PERF_COUNT_HW_INSTRUCTIONS;
	default:
		return ~0;
	}
}

static uint64_t rdpmc_cache_id(uint32_t cntr_id, uint32_t flags)
{
	/* TODO: map cache counter ids to PERF_TYPE_HW_CACHE configs */
	return ~0;
}

static uint64_t rdpmc_sw_id(uint32_t cntr_id)
{
	switch (cntr_id) {
	case OFI_PMC_OS_PAGE_FAULT:
		return PERF_COUNT_SW_PAGE_FAULTS;
	default:
		return ~0;
	}
}

int ofi_pmu_open(struct ofi_perf_ctx **ctx, enum ofi_perf_domain domain,
		 uint32_t cntr_id, uint32_t flags)
{
	struct perf_event_attr attr = {
		.size = PERF_ATTR_SIZE_VER0,
		.sample_type = PERF_SAMPLE_READ,
		.exclude_kernel = 1,
	};
	int ret;

	*ctx = calloc(1, sizeof **ctx);
	if (!*ctx)
		return -FI_ENOMEM;

	switch (domain) {
	case OFI_PMU_CPU:
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = rdpmc_hw_id(cntr_id);
		break;
	case OFI_PMU_CACHE:
		attr.type = PERF_TYPE_HW_CACHE;
		attr.config = rdpmc_cache_id(cntr_id, flags);
		break;
	case OFI_PMU_OS:
		attr.type = PERF_TYPE_SOFTWARE;
		attr.config = rdpmc_sw_id(cntr_id);
		break;
	default:
		ret = -FI_ENOSYS;
		goto err;
	}

	if (attr.config == ~0) {
		ret = -FI_ENOSYS;
		goto err;
	}

	ret = rdpmc_open_attr(&attr, &(*ctx)->ctx, NULL);
	if (ret) {
		ret = -errno;
		goto err;
	}
	return 0;

err:
	/* Do not leak the context on any failure path */
	free(*ctx);
	*ctx = NULL;
	return ret;
}

inline uint64_t ofi_pmu_read(struct ofi_perf_ctx *ctx)
{
	return rdpmc_read(&ctx->ctx);
}

inline void ofi_pmu_close(struct ofi_perf_ctx *ctx)
{
	rdpmc_close(&ctx->ctx);
	free(ctx);
}
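
/*
 * Example (illustrative sketch, not part of this file's API): counting
 * retired instructions around a code region via the ofi_pmu wrappers.
 *
 *	struct ofi_perf_ctx *perf;
 *	uint64_t start, instrs;
 *
 *	if (!ofi_pmu_open(&perf, OFI_PMU_CPU, OFI_PMC_CPU_INSTR, 0)) {
 *		start = ofi_pmu_read(perf);
 *		... code under measurement ...
 *		instrs = ofi_pmu_read(perf) - start;
 *		ofi_pmu_close(perf);
 *	}
 */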