1 /*
2 * Copyright © 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use, copy,
8 * modify, merge, publish, distribute, sublicense, and/or sell copies
9 * of the Software, and to permit persons to whom the Software is
10 * furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including
13 * the next paragraph) shall be included in all copies or substantial
14 * portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #ifdef __linux__
30 #include <linux/perf_event.h>
31 #endif
32 #include <sys/syscall.h>
33 #include "libhsakmt.h"
34 #include "pmc_table.h"
35 #include "linux/kfd_ioctl.h"
36 #include <unistd.h>
37 #include <sys/ioctl.h>
38 #include <errno.h>
39 #include <sys/mman.h>
40 #include <fcntl.h>
41 #include <semaphore.h>
42
43 #define BITS_PER_BYTE CHAR_BIT
44
45 #define HSA_PERF_MAGIC4CC 0x54415348
46
/* Run state of a registered trace (held in struct perf_trace.state). */
enum perf_trace_state {
	PERF_TRACE_STATE__STOPPED = 0,	/* counters disabled */
	PERF_TRACE_STATE__STARTED	/* counters enabled and counting */
};
51
/* Per-HW-block bookkeeping inside a trace.
 * counter_id and perf_event_fd point into the single allocation made by
 * hsaKmtPmcRegisterTrace; each is an array of num_counters entries.
 */
struct perf_trace_block {
	enum perf_block_id block_id;	/* which HW block this entry describes */
	uint32_t num_counters;		/* entries in counter_id/perf_event_fd */
	uint64_t *counter_id;		/* counter IDs requested by the caller */
	int *perf_event_fd;		/* one perf_event fd per counter (0 = closed) */
};
58
59 struct perf_trace {
60 uint32_t magic4cc;
61 uint32_t gpu_id;
62 enum perf_trace_state state;
63 uint32_t num_blocks;
64 void *buf;
65 uint64_t buf_size;
66 struct perf_trace_block blocks[0];
67 };
68
/* Direction of a shared-slot accounting update (update_block_slots). */
enum perf_trace_action {
	PERF_TRACE_ACTION__ACQUIRE = 0,	/* reserve slots for this process */
	PERF_TRACE_ACTION__RELEASE	/* return previously reserved slots */
};
73
/* Layout of the POSIX shared-memory region used to coordinate privileged
 * counter slots across processes. Guarded by the named semaphore `sem`.
 */
struct perf_shared_table {
	uint32_t magic4cc;		/* HSA_PERF_MAGIC4CC once initialized */
	uint32_t iommu_slots_left;	/* free IOMMUv2 counter slots, system wide */
};
78
/* Record returned by read(2) on a perf_event fd opened with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | _RUNNING
 * (see open_perf_event_fd): value, time enabled, time running.
 */
struct perf_counts_values {
	union {
		struct {
			u64 val;	/* raw counter value */
			u64 ena;	/* time the event was enabled */
			u64 run;	/* time the event was actually running */
		};
		u64 values[3];
	};
};
89
/* Per-node counter property cache: allocated in init_counter_props,
 * entries filled lazily by hsaKmtPmcGetCounterProperties. */
static HsaCounterProperties **counter_props;
static unsigned int counter_props_count;
/* Named shared memory + semaphore for cross-process slot accounting. */
static const char shmem_name[] = "/hsakmt_shared_mem";
static int shmem_fd;
static const char sem_name[] = "hsakmt_semaphore";
static sem_t *sem = SEM_FAILED;
struct perf_shared_table *shared_table;
97
/* readn - read exactly n bytes from fd unless EOF or a hard error hits.
 * Retries reads interrupted by signals (EINTR).
 * Return: n on full read, the number of bytes read before EOF, or
 * -errno on a read error.
 */
static ssize_t readn(int fd, void *buf, size_t n)
{
	unsigned char *dst = buf;
	size_t remaining = n;

	while (remaining) {
		ssize_t got = read(fd, dst, remaining);

		if (got == 0)		/* EOF: report the partial count */
			return n - remaining;
		if (got < 0) {
			if (errno == EINTR)
				continue;	/* interrupted, retry */
			return -errno;
		}
		remaining -= got;
		dst += got;
	}
	return n;
}
118
init_shared_region(void)119 static HSAKMT_STATUS init_shared_region(void)
120 {
121 sem = sem_open(sem_name, O_CREAT, 0666, 1);
122 if (sem == SEM_FAILED)
123 return HSAKMT_STATUS_ERROR;
124
125 shmem_fd = shm_open(shmem_name, O_CREAT | O_RDWR, 0666);
126 if (shmem_fd < 0)
127 goto exit_1;
128
129 if (ftruncate(shmem_fd, sizeof(struct perf_shared_table)) < 0)
130 goto exit_2;
131
132 shared_table = mmap(NULL, sizeof(*shared_table),
133 PROT_READ | PROT_WRITE, MAP_SHARED, shmem_fd, 0);
134 if (shared_table == MAP_FAILED)
135 goto exit_2;
136
137 return HSAKMT_STATUS_SUCCESS;
138
139 exit_2:
140 shm_unlink(shmem_name);
141 shmem_fd = 0;
142 exit_1:
143 sem_close(sem);
144 sem_unlink(sem_name);
145 sem = SEM_FAILED;
146 return HSAKMT_STATUS_ERROR;
147 }
148
destroy_shared_region(void)149 static void destroy_shared_region(void)
150 {
151 if (shared_table && shared_table != MAP_FAILED)
152 munmap(shared_table, sizeof(*shared_table));
153
154 if (shmem_fd > 0) {
155 close(shmem_fd);
156 shm_unlink(shmem_name);
157 }
158
159 if (sem != SEM_FAILED) {
160 sem_close(sem);
161 sem_unlink(sem_name);
162 sem = SEM_FAILED;
163 }
164 }
165
init_perf_shared_table(void)166 static void init_perf_shared_table(void)
167 {
168 sem_wait(sem);
169
170 /* If the magic number exists, the perf shared table has been
171 * initialized by another process and is in use. Don't overwrite it.
172 */
173 if (shared_table->magic4cc == HSA_PERF_MAGIC4CC) {
174 sem_post(sem);
175 return;
176 }
177
178 /* write the perf content */
179 shared_table->magic4cc = HSA_PERF_MAGIC4CC;
180 shared_table->iommu_slots_left =
181 pmc_table_get_max_concurrent(PERFCOUNTER_BLOCKID__IOMMUV2);
182
183 sem_post(sem);
184 }
185
init_counter_props(unsigned int NumNodes)186 HSAKMT_STATUS init_counter_props(unsigned int NumNodes)
187 {
188 counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties *));
189 if (!counter_props) {
190 pr_warn("Profiling is not available.\n");
191 return HSAKMT_STATUS_NO_MEMORY;
192 }
193
194 counter_props_count = NumNodes;
195 alloc_pmc_blocks();
196
197 if (init_shared_region() != HSAKMT_STATUS_SUCCESS) {
198 pr_warn("Profiling of privileged blocks is not available.\n");
199 return HSAKMT_STATUS_ERROR;
200 }
201 init_perf_shared_table();
202
203 return HSAKMT_STATUS_SUCCESS;
204 }
205
destroy_counter_props(void)206 void destroy_counter_props(void)
207 {
208 unsigned int i;
209
210 destroy_shared_region();
211
212 if (!counter_props)
213 return;
214
215 for (i = 0; i < counter_props_count; i++)
216 if (counter_props[i]) {
217 free(counter_props[i]);
218 counter_props[i] = NULL;
219 }
220
221 free(counter_props);
222 free_pmc_blocks();
223 }
224
/* blockid2uuid - map an internal perf block ID to the public profiling
 * block UUID reported to runtime clients.
 * Return: 0 on success, -1 for an unknown ID (caller bug, since valid
 * IDs originate from this library's own tables).
 */
static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
{
	int rc = 0;

	switch (block_id) {
	case PERFCOUNTER_BLOCKID__CB:
		*uuid = HSA_PROFILEBLOCK_AMD_CB;
		break;
	case PERFCOUNTER_BLOCKID__CPF:
		*uuid = HSA_PROFILEBLOCK_AMD_CPF;
		break;
	case PERFCOUNTER_BLOCKID__CPG:
		*uuid = HSA_PROFILEBLOCK_AMD_CPG;
		break;
	case PERFCOUNTER_BLOCKID__DB:
		*uuid = HSA_PROFILEBLOCK_AMD_DB;
		break;
	case PERFCOUNTER_BLOCKID__GDS:
		*uuid = HSA_PROFILEBLOCK_AMD_GDS;
		break;
	case PERFCOUNTER_BLOCKID__GRBM:
		*uuid = HSA_PROFILEBLOCK_AMD_GRBM;
		break;
	case PERFCOUNTER_BLOCKID__GRBMSE:
		*uuid = HSA_PROFILEBLOCK_AMD_GRBMSE;
		break;
	case PERFCOUNTER_BLOCKID__IA:
		*uuid = HSA_PROFILEBLOCK_AMD_IA;
		break;
	case PERFCOUNTER_BLOCKID__MC:
		*uuid = HSA_PROFILEBLOCK_AMD_MC;
		break;
	case PERFCOUNTER_BLOCKID__PASC:
		*uuid = HSA_PROFILEBLOCK_AMD_PASC;
		break;
	case PERFCOUNTER_BLOCKID__PASU:
		*uuid = HSA_PROFILEBLOCK_AMD_PASU;
		break;
	case PERFCOUNTER_BLOCKID__SPI:
		*uuid = HSA_PROFILEBLOCK_AMD_SPI;
		break;
	case PERFCOUNTER_BLOCKID__SRBM:
		*uuid = HSA_PROFILEBLOCK_AMD_SRBM;
		break;
	case PERFCOUNTER_BLOCKID__SQ:
		*uuid = HSA_PROFILEBLOCK_AMD_SQ;
		break;
	case PERFCOUNTER_BLOCKID__SX:
		*uuid = HSA_PROFILEBLOCK_AMD_SX;
		break;
	case PERFCOUNTER_BLOCKID__TA:
		*uuid = HSA_PROFILEBLOCK_AMD_TA;
		break;
	case PERFCOUNTER_BLOCKID__TCA:
		*uuid = HSA_PROFILEBLOCK_AMD_TCA;
		break;
	case PERFCOUNTER_BLOCKID__TCC:
		*uuid = HSA_PROFILEBLOCK_AMD_TCC;
		break;
	case PERFCOUNTER_BLOCKID__TCP:
		*uuid = HSA_PROFILEBLOCK_AMD_TCP;
		break;
	case PERFCOUNTER_BLOCKID__TCS:
		*uuid = HSA_PROFILEBLOCK_AMD_TCS;
		break;
	case PERFCOUNTER_BLOCKID__TD:
		*uuid = HSA_PROFILEBLOCK_AMD_TD;
		break;
	case PERFCOUNTER_BLOCKID__VGT:
		*uuid = HSA_PROFILEBLOCK_AMD_VGT;
		break;
	case PERFCOUNTER_BLOCKID__WD:
		*uuid = HSA_PROFILEBLOCK_AMD_WD;
		break;
	case PERFCOUNTER_BLOCKID__IOMMUV2:
		*uuid = HSA_PROFILEBLOCK_AMD_IOMMUV2;
		break;
	default:
		/* If we reach this point, it's a bug */
		rc = -1;
		break;
	}

	return rc;
}
310
/* get_block_concurrent_limit - how many counters of the given block can
 * run at once on this node, or 0 if no such block was reported.
 */
static HSAuint32 get_block_concurrent_limit(uint32_t node_id,
					    HSAuint32 block_id)
{
	HsaCounterBlockProperties *prop = &counter_props[node_id]->Blocks[0];
	uint32_t n;

	for (n = 0; n < PERFCOUNTER_BLOCKID__MAX; n++) {
		if (prop->Counters[0].BlockIndex == block_id)
			return prop->NumConcurrent;
		/* Block records are packed back to back; the next block
		 * starts immediately after this one's counter array. */
		prop = (HsaCounterBlockProperties *)&prop->Counters[prop->NumCounters];
	}

	return 0;
}
325
update_block_slots(enum perf_trace_action action,uint32_t block_id,uint32_t num_slots)326 static HSAKMT_STATUS update_block_slots(enum perf_trace_action action,
327 uint32_t block_id, uint32_t num_slots)
328 {
329 uint32_t *slots_left;
330 HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
331
332 if (shmem_fd <= 0)
333 return HSAKMT_STATUS_UNAVAILABLE;
334 if (sem == SEM_FAILED)
335 return HSAKMT_STATUS_UNAVAILABLE;
336
337 sem_wait(sem);
338
339 if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2)
340 slots_left = &shared_table->iommu_slots_left;
341 else {
342 ret = HSAKMT_STATUS_UNAVAILABLE;
343 goto out;
344 }
345
346 switch (action) {
347 case PERF_TRACE_ACTION__ACQUIRE:
348 if (*slots_left >= num_slots)
349 *slots_left -= num_slots;
350 else
351 ret = HSAKMT_STATUS_UNAVAILABLE;
352 break;
353 case PERF_TRACE_ACTION__RELEASE:
354 if ((*slots_left + num_slots) <=
355 pmc_table_get_max_concurrent(block_id))
356 *slots_left += num_slots;
357 else
358 ret = HSAKMT_STATUS_ERROR;
359 break;
360 default:
361 ret = HSAKMT_STATUS_INVALID_PARAMETER;
362 break;
363 }
364
365 out:
366 sem_post(sem);
367
368 return ret;
369 }
370
get_perf_event_type(enum perf_block_id block_id)371 static unsigned int get_perf_event_type(enum perf_block_id block_id)
372 {
373 FILE *file = NULL;
374 unsigned int type = 0;
375
376 if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2) {
377 /* Starting from kernel 4.12, amd_iommu_0 is used */
378 file = fopen("/sys/bus/event_source/devices/amd_iommu_0/type",
379 "r");
380 if (!file)
381 file = fopen(/* kernel 4.11 and older */
382 "/sys/bus/event_source/devices/amd_iommu/type",
383 "r");
384 }
385
386 if (!file)
387 return 0;
388
389 if (fscanf(file, "%d", &type) != 1)
390 type = 0;
391 fclose(file);
392
393 return type;
394 }
395
396 /* close_perf_event_fd - Close all FDs opened for this block.
397 * When RT acquires the trace access, RT has no ideas about each
398 * individual FD opened for this block. We should treat the whole
399 * block as one and close all of them.
400 */
close_perf_event_fd(struct perf_trace_block * block)401 static void close_perf_event_fd(struct perf_trace_block *block)
402 {
403 uint32_t i;
404
405 if (!block || !block->perf_event_fd)
406 return;
407
408 for (i = 0; i < block->num_counters; i++)
409 if (block->perf_event_fd[i] > 0) {
410 close(block->perf_event_fd[i]);
411 block->perf_event_fd[i] = 0;
412 }
413 }
414
415 /* open_perf_event_fd - Open FDs required for this block.
416 * If one of them fails, we should close all FDs that have been
417 * opened because RT has no ideas about those FDs successfully
418 * opened and it won't send anything to close them.
419 */
static HSAKMT_STATUS open_perf_event_fd(struct perf_trace_block *block)
{
#ifdef __linux__
	struct perf_event_attr attr;
	uint32_t i;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	if (!block || !block->perf_event_fd)
		return HSAKMT_STATUS_INVALID_HANDLE;

	/* Only uid 0 is accepted here; system-wide perf_event_open
	 * generally needs elevated privileges. */
	if (getuid()) {
		pr_err("Must be root to open perf_event.\n");
		return HSAKMT_STATUS_ERROR;
	}

	memset(&attr, 0, sizeof(struct perf_event_attr));
	/* Dynamic PMU type looked up in sysfs; 0 means "not found". */
	attr.type = get_perf_event_type(block->block_id);
	if (!attr.type)
		return HSAKMT_STATUS_ERROR;

	for (i = 0; i < block->num_counters; i++) {
		attr.size = sizeof(struct perf_event_attr);
		attr.config = block->counter_id[i];
		/* Ask the kernel for value + enabled/running times; this
		 * matches the struct perf_counts_values read layout. */
		attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				   PERF_FORMAT_TOTAL_TIME_RUNNING;
		attr.disabled = 1;	/* armed later via PERF_EVENT_IOC_ENABLE */
		attr.inherit = 1;

		/* We are profiling system wide, not per cpu, so no threads,
		 * no groups -> pid=-1 and group_fd=-1. cpu = 0
		 * flags=PERF_FLAG_FD_NO_GROUP
		 */
		block->perf_event_fd[i] = syscall(__NR_perf_event_open, &attr,
						-1, 0, -1, PERF_FLAG_FD_NO_GROUP);

		if (block->perf_event_fd[i] < 0) {
			ret = HSAKMT_STATUS_ERROR;
			/* Close FDs already opened for this block: the
			 * runtime only knows whole blocks, so it would
			 * never ask to close these individually. */
			close_perf_event_fd(block);
			break;
		}
	}

	return ret;
#else
	return HSAKMT_STATUS_ERROR;
#endif
}
467
perf_trace_ioctl(struct perf_trace_block * block,uint32_t cmd)468 static HSAKMT_STATUS perf_trace_ioctl(struct perf_trace_block *block,
469 uint32_t cmd)
470 {
471 uint32_t i;
472
473 for (i = 0; i < block->num_counters; i++) {
474 if (block->perf_event_fd[i] < 0)
475 return HSAKMT_STATUS_UNAVAILABLE;
476 if (ioctl(block->perf_event_fd[i], cmd, NULL))
477 return HSAKMT_STATUS_ERROR;
478 }
479
480 return HSAKMT_STATUS_SUCCESS;
481 }
482
query_trace(int fd,uint64_t * buf)483 static HSAKMT_STATUS query_trace(int fd, uint64_t *buf)
484 {
485 #ifdef __linux__
486 struct perf_counts_values content;
487
488 if (fd < 0)
489 return HSAKMT_STATUS_ERROR;
490 if (readn(fd, &content, sizeof(content)) != sizeof(content))
491 return HSAKMT_STATUS_ERROR;
492
493 *buf = content.val;
494 return HSAKMT_STATUS_SUCCESS;
495 #else
496 return HSAKMT_STATUS_ERROR;
497 #endif
498 }
499
hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,HsaCounterProperties ** CounterProperties)500 HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
501 HsaCounterProperties **CounterProperties)
502 {
503 #ifdef __linux__
504 HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS;
505 uint32_t gpu_id, i, block_id;
506 uint32_t counter_props_size = 0;
507 uint32_t total_counters = 0;
508 uint32_t total_concurrent = 0;
509 struct perf_counter_block block = {0};
510 uint32_t total_blocks = 0;
511 HsaCounterBlockProperties *block_prop;
512
513 if (!counter_props)
514 return HSAKMT_STATUS_NO_MEMORY;
515
516 if (!CounterProperties)
517 return HSAKMT_STATUS_INVALID_PARAMETER;
518
519 if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
520 return HSAKMT_STATUS_INVALID_NODE_UNIT;
521
522 if (counter_props[NodeId]) {
523 *CounterProperties = counter_props[NodeId];
524 return HSAKMT_STATUS_SUCCESS;
525 }
526
527 for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
528 rc = get_block_properties(NodeId, i, &block);
529 if (rc != HSAKMT_STATUS_SUCCESS)
530 return rc;
531 total_concurrent += block.num_of_slots;
532 total_counters += block.num_of_counters;
533 /* If num_of_slots=0, this block doesn't exist */
534 if (block.num_of_slots)
535 total_blocks++;
536 }
537
538 counter_props_size = sizeof(HsaCounterProperties) +
539 sizeof(HsaCounterBlockProperties) * (total_blocks - 1) +
540 sizeof(HsaCounter) * (total_counters - total_blocks);
541
542 counter_props[NodeId] = malloc(counter_props_size);
543 if (!counter_props[NodeId])
544 return HSAKMT_STATUS_NO_MEMORY;
545
546 counter_props[NodeId]->NumBlocks = total_blocks;
547 counter_props[NodeId]->NumConcurrent = total_concurrent;
548
549 block_prop = &counter_props[NodeId]->Blocks[0];
550 for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) {
551 rc = get_block_properties(NodeId, block_id, &block);
552 if (rc != HSAKMT_STATUS_SUCCESS) {
553 free(counter_props[NodeId]);
554 counter_props[NodeId] = NULL;
555 return rc;
556 }
557
558 if (!block.num_of_slots) /* not a valid block */
559 continue;
560
561 blockid2uuid(block_id, &block_prop->BlockId);
562 block_prop->NumCounters = block.num_of_counters;
563 block_prop->NumConcurrent = block.num_of_slots;
564 for (i = 0; i < block.num_of_counters; i++) {
565 block_prop->Counters[i].BlockIndex = block_id;
566 block_prop->Counters[i].CounterId = block.counter_ids[i];
567 block_prop->Counters[i].CounterSizeInBits = block.counter_size_in_bits;
568 block_prop->Counters[i].CounterMask = block.counter_mask;
569 block_prop->Counters[i].Flags.ui32.Global = 1;
570 if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2)
571 block_prop->Counters[i].Type = HSA_PROFILE_TYPE_PRIVILEGED_IMMEDIATE;
572 else
573 block_prop->Counters[i].Type = HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE;
574 }
575
576 block_prop = (HsaCounterBlockProperties *)&block_prop->Counters[block_prop->NumCounters];
577 }
578
579 *CounterProperties = counter_props[NodeId];
580
581 return HSAKMT_STATUS_SUCCESS;
582 #else
583 return HSAKMT_STATUS_ERROR;
584 #endif
585 }
586
587 /* Registers a set of (HW) counters to be used for tracing/profiling */
/* hsaKmtPmcRegisterTrace - group the requested privileged counters by
 * HW block, allocate a trace handle for them, and report the minimum
 * result-buffer size. The handle is returned as an opaque TraceId.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
					       HSAuint32 NumberOfCounters,
					       HsaCounter *Counters,
					       HsaPmcTraceRoot *TraceRoot)
{
	uint32_t gpu_id, i, j;
	uint64_t min_buf_size = 0;
	struct perf_trace *trace = NULL;
	uint32_t concurrent_limit;
	const uint32_t MAX_COUNTERS = 512;
	/* NOTE(review): MAX_COUNTERS is a const variable, not a constant
	 * expression, so this 2-D array is technically a VLA of
	 * PERFCOUNTER_BLOCKID__MAX * 512 * 8 bytes on the stack —
	 * confirm the stack budget tolerates this. */
	uint64_t counter_id[PERFCOUNTER_BLOCKID__MAX][MAX_COUNTERS];
	uint32_t num_counters[PERFCOUNTER_BLOCKID__MAX] = {0};
	uint32_t block, num_blocks = 0, total_counters = 0;
	uint64_t *counter_id_ptr;
	int *fd_ptr;

	pr_debug("[%s] Number of counters %d\n", __func__, NumberOfCounters);

	if (!counter_props)
		return HSAKMT_STATUS_NO_MEMORY;

	if (!Counters || !TraceRoot || NumberOfCounters == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	if (NumberOfCounters > MAX_COUNTERS) {
		pr_err("MAX_COUNTERS is too small for %d.\n",
		       NumberOfCounters);
		return HSAKMT_STATUS_NO_MEMORY;
	}

	/* Calculating the minimum buffer size */
	for (i = 0; i < NumberOfCounters; i++) {
		if (Counters[i].BlockIndex >= PERFCOUNTER_BLOCKID__MAX)
			return HSAKMT_STATUS_INVALID_PARAMETER;
		/* Only privileged counters need to register */
		if (Counters[i].Type > HSA_PROFILE_TYPE_PRIVILEGED_STREAMING)
			continue;
		min_buf_size += Counters[i].CounterSizeInBits/BITS_PER_BYTE;
		/* j: the first blank entry in the block to record counter_id */
		j = num_counters[Counters[i].BlockIndex];
		counter_id[Counters[i].BlockIndex][j] = Counters[i].CounterId;
		num_counters[Counters[i].BlockIndex]++;
		total_counters++;
	}

	/* Verify that the number of counters per block is not larger than the
	 * number of slots.
	 */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (!num_counters[i])
			continue;
		concurrent_limit = get_block_concurrent_limit(NodeId, i);
		if (!concurrent_limit) {
			pr_err("Invalid block ID: %d\n", i);
			return HSAKMT_STATUS_INVALID_PARAMETER;
		}
		if (num_counters[i] > concurrent_limit) {
			pr_err("Counters exceed the limit.\n");
			return HSAKMT_STATUS_INVALID_PARAMETER;
		}
		num_blocks++;
	}

	/* No privileged counter was requested at all. */
	if (!num_blocks)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Now we have sorted blocks/counters information in
	 * num_counters[block_id] and counter_id[block_id][]. Allocate trace
	 * and record the information.
	 */
	trace = (struct perf_trace *)calloc(sizeof(struct perf_trace)
			+ sizeof(struct perf_trace_block) * num_blocks
			+ sizeof(uint64_t) * total_counters
			+ sizeof(int) * total_counters,
			1);
	if (!trace)
		return HSAKMT_STATUS_NO_MEMORY;

	/* Allocated area is partitioned as:
	 * +---------------------------------+ trace
	 * | perf_trace                      |
	 * |---------------------------------| trace->blocks[0]
	 * | perf_trace_block 0              |
	 * | ....                            |
	 * | perf_trace_block N-1            | trace->blocks[N-1]
	 * |---------------------------------| <-- counter_id_ptr starts here
	 * | block 0's counter IDs(uint64_t) |
	 * | ......                          |
	 * | block N-1's counter IDs         |
	 * |---------------------------------| <-- perf_event_fd starts here
	 * | block 0's perf_event_fds(int)   |
	 * | ......                          |
	 * | block N-1's perf_event_fds      |
	 * +---------------------------------+
	 */
	block = 0;
	counter_id_ptr = (uint64_t *)((char *)
			trace + sizeof(struct perf_trace)
			+ sizeof(struct perf_trace_block) * num_blocks);
	fd_ptr = (int *)(counter_id_ptr + total_counters);
	/* Fill in each block's information to the TraceId */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (!num_counters[i]) /* not a block to trace */
			continue;
		/* Following perf_trace + perf_trace_block x N are those
		 * counter_id arrays. Assign the counter_id array belonging to
		 * this block.
		 */
		trace->blocks[block].counter_id = counter_id_ptr;
		/* Fill in counter IDs to the counter_id array. */
		for (j = 0; j < num_counters[i]; j++)
			trace->blocks[block].counter_id[j] = counter_id[i][j];
		trace->blocks[block].perf_event_fd = fd_ptr;
		/* how many counters to trace */
		trace->blocks[block].num_counters = num_counters[i];
		/* block index in "enum perf_block_id" */
		trace->blocks[block].block_id = i;
		block++; /* move to next */
		counter_id_ptr += num_counters[i];
		fd_ptr += num_counters[i];
	}

	trace->magic4cc = HSA_PERF_MAGIC4CC;
	trace->gpu_id = gpu_id;
	trace->state = PERF_TRACE_STATE__STOPPED;
	trace->num_blocks = num_blocks;

	TraceRoot->NumberOfPasses = 1;
	TraceRoot->TraceBufferMinSizeBytes = PAGE_ALIGN_UP(min_buf_size);
	/* The handle is simply the pointer, cast to an integer ID. */
	TraceRoot->TraceId = PORT_VPTR_TO_UINT64(trace);

	return HSAKMT_STATUS_SUCCESS;
}
724
725 /* Unregisters a set of (HW) counters used for tracing/profiling */
726
hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,HSATraceId TraceId)727 HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
728 HSATraceId TraceId)
729 {
730 uint32_t gpu_id;
731 struct perf_trace *trace;
732
733 pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);
734
735 if (TraceId == 0)
736 return HSAKMT_STATUS_INVALID_PARAMETER;
737
738 if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
739 return HSAKMT_STATUS_INVALID_NODE_UNIT;
740
741 trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
742
743 if (trace->magic4cc != HSA_PERF_MAGIC4CC)
744 return HSAKMT_STATUS_INVALID_HANDLE;
745
746 if (trace->gpu_id != gpu_id)
747 return HSAKMT_STATUS_INVALID_NODE_UNIT;
748
749 /* If the trace is in the running state, stop it */
750 if (trace->state == PERF_TRACE_STATE__STARTED) {
751 HSAKMT_STATUS status = hsaKmtPmcStopTrace(TraceId);
752
753 if (status != HSAKMT_STATUS_SUCCESS)
754 return status;
755 }
756
757 free(trace);
758
759 return HSAKMT_STATUS_SUCCESS;
760 }
761
/* hsaKmtPmcAcquireTraceAccess - reserve shared counter slots and open
 * perf_event FDs for every block of the trace. All-or-nothing: on any
 * failure, everything acquired so far is unwound.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
						    HSATraceId TraceId)
{
	struct perf_trace *trace;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	uint32_t gpu_id, i;
	int j;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	for (i = 0; i < trace->num_blocks; i++) {
		ret = update_block_slots(PERF_TRACE_ACTION__ACQUIRE,
					 trace->blocks[i].block_id,
					 trace->blocks[i].num_counters);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto out;
		ret = open_perf_event_fd(&trace->blocks[i]);
		if (ret != HSAKMT_STATUS_SUCCESS) {
			/* Slots for block i are already reserved; bump i so
			 * the unwind loop below (j = i-1 .. 0) releases
			 * them too. open_perf_event_fd closed its own FDs. */
			i++; /* to release slots just reserved */
			goto out;
		}
	}

out:
	if (ret != HSAKMT_STATUS_SUCCESS) {
		/* Unwind every block acquired so far; closing FDs that were
		 * never opened is a no-op in close_perf_event_fd. */
		for (j = i-1; j >= 0; j--) {
			update_block_slots(PERF_TRACE_ACTION__RELEASE,
					   trace->blocks[j].block_id,
					   trace->blocks[j].num_counters);
			close_perf_event_fd(&trace->blocks[j]);
		}
	}

	return ret;
}
808
hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,HSATraceId TraceId)809 HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
810 HSATraceId TraceId)
811 {
812 struct perf_trace *trace;
813 uint32_t i;
814
815 pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);
816
817 if (TraceId == 0)
818 return HSAKMT_STATUS_INVALID_PARAMETER;
819
820 trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
821
822 if (trace->magic4cc != HSA_PERF_MAGIC4CC)
823 return HSAKMT_STATUS_INVALID_HANDLE;
824
825 for (i = 0; i < trace->num_blocks; i++) {
826 update_block_slots(PERF_TRACE_ACTION__RELEASE,
827 trace->blocks[i].block_id,
828 trace->blocks[i].num_counters);
829 close_perf_event_fd(&trace->blocks[i]);
830 }
831
832 return HSAKMT_STATUS_SUCCESS;
833 }
834
835
836 /* Starts tracing operation on a previously established set of performance counters */
hsaKmtPmcStartTrace(HSATraceId TraceId,void * TraceBuffer,HSAuint64 TraceBufferSizeBytes)837 HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
838 void *TraceBuffer,
839 HSAuint64 TraceBufferSizeBytes)
840 {
841 #ifdef __linux__
842 struct perf_trace *trace =
843 (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
844 uint32_t i;
845 int32_t j;
846 HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
847
848 pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);
849
850 if (TraceId == 0 || !TraceBuffer || TraceBufferSizeBytes == 0)
851 return HSAKMT_STATUS_INVALID_PARAMETER;
852
853 if (trace->magic4cc != HSA_PERF_MAGIC4CC)
854 return HSAKMT_STATUS_INVALID_HANDLE;
855
856 for (i = 0; i < trace->num_blocks; i++) {
857 ret = perf_trace_ioctl(&trace->blocks[i],
858 PERF_EVENT_IOC_ENABLE);
859 if (ret != HSAKMT_STATUS_SUCCESS)
860 break;
861 }
862 if (ret != HSAKMT_STATUS_SUCCESS) {
863 /* Disable enabled blocks before returning the failure. */
864 j = (int32_t)i;
865 while (--j >= 0)
866 perf_trace_ioctl(&trace->blocks[j],
867 PERF_EVENT_IOC_DISABLE);
868 return ret;
869 }
870
871 trace->state = PERF_TRACE_STATE__STARTED;
872 trace->buf = TraceBuffer;
873 trace->buf_size = TraceBufferSizeBytes;
874
875 return HSAKMT_STATUS_SUCCESS;
876 #else
877 return HSAKMT_STATUS_ERROR;
878 #endif
879 }
880
881
882 /*Forces an update of all the counters that a previously started trace operation has registered */
883
hsaKmtPmcQueryTrace(HSATraceId TraceId)884 HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId)
885 {
886 struct perf_trace *trace =
887 (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
888 uint32_t i, j;
889 HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
890 uint64_t *buf;
891 uint64_t buf_filled = 0;
892
893 if (TraceId == 0)
894 return HSAKMT_STATUS_INVALID_PARAMETER;
895
896 if (trace->magic4cc != HSA_PERF_MAGIC4CC)
897 return HSAKMT_STATUS_INVALID_HANDLE;
898
899 buf = (uint64_t *)trace->buf;
900 pr_debug("[%s] Trace buffer(%p): ", __func__, buf);
901 for (i = 0; i < trace->num_blocks; i++)
902 for (j = 0; j < trace->blocks[i].num_counters; j++) {
903 buf_filled += sizeof(uint64_t);
904 if (buf_filled > trace->buf_size)
905 return HSAKMT_STATUS_NO_MEMORY;
906 ret = query_trace(trace->blocks[i].perf_event_fd[j],
907 buf);
908 if (ret != HSAKMT_STATUS_SUCCESS)
909 return ret;
910 pr_debug("%lu_", *buf);
911 buf++;
912 }
913 pr_debug("\n");
914
915 return HSAKMT_STATUS_SUCCESS;
916 }
917
918
919 /* Stops tracing operation on a previously established set of performance counters */
hsaKmtPmcStopTrace(HSATraceId TraceId)920 HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId)
921 {
922 #ifdef __linux__
923 struct perf_trace *trace =
924 (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
925 uint32_t i;
926 HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
927
928 pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);
929
930 if (TraceId == 0)
931 return HSAKMT_STATUS_INVALID_PARAMETER;
932
933 if (trace->magic4cc != HSA_PERF_MAGIC4CC)
934 return HSAKMT_STATUS_INVALID_HANDLE;
935
936 for (i = 0; i < trace->num_blocks; i++) {
937 ret = perf_trace_ioctl(&trace->blocks[i],
938 PERF_EVENT_IOC_DISABLE);
939 if (ret != HSAKMT_STATUS_SUCCESS)
940 return ret;
941 }
942
943 trace->state = PERF_TRACE_STATE__STOPPED;
944
945 return ret;
946 #else
947 return HSAKMT_STATUS_ERROR;
948 #endif
949 }
950