/*
 * Copyright © 2014 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including
 * the next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <stdlib.h>
#include <string.h>
#include <errno.h>
#ifdef __linux__
#include <linux/perf_event.h>
#endif
#include <sys/ioctl.h>
#include "libhsakmt.h"
#include "pmc_table.h"
#include "linux/kfd_ioctl.h"
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <semaphore.h>
#include <limits.h>

#define BITS_PER_BYTE		CHAR_BIT

#define HSA_PERF_MAGIC4CC	0x54415348

enum perf_trace_state {
	PERF_TRACE_STATE__STOPPED = 0,
	PERF_TRACE_STATE__STARTED
};

struct perf_trace_block {
	enum perf_block_id block_id;
	uint32_t num_counters;
	uint64_t *counter_id;
	int *perf_event_fd;
};

struct perf_trace {
	uint32_t magic4cc;
	uint32_t gpu_id;
	enum perf_trace_state state;
	uint32_t num_blocks;
	void *buf;
	uint64_t buf_size;
	struct perf_trace_block blocks[0];
};

enum perf_trace_action {
	PERF_TRACE_ACTION__ACQUIRE = 0,
	PERF_TRACE_ACTION__RELEASE
};

struct perf_shared_table {
	uint32_t magic4cc;
	uint32_t iommu_slots_left;
};

struct perf_counts_values {
	union {
		struct {
			uint64_t val;
			uint64_t ena;
			uint64_t run;
		};
		uint64_t values[3];
	};
};

static HsaCounterProperties **counter_props;
static unsigned int counter_props_count;
static const char shmem_name[] = "/hsakmt_shared_mem";
static int shmem_fd;
static const char sem_name[] = "hsakmt_semaphore";
static sem_t *sem = SEM_FAILED;
struct perf_shared_table *shared_table;

static ssize_t readn(int fd, void *buf, size_t n)
{
	size_t left = n;
	ssize_t bytes;

	while (left) {
		bytes = read(fd, buf, left);
		if (!bytes) /* reached EOF */
			return (n - left);
		if (bytes < 0) {
			if (errno == EINTR) /* read got interrupted */
				continue;
			else
				return -errno;
		}
		left -= bytes;
		buf = VOID_PTR_ADD(buf, bytes);
	}

	return n;
}

static HSAKMT_STATUS init_shared_region(void)
{
	sem = sem_open(sem_name, O_CREAT, 0666, 1);
	if (sem == SEM_FAILED)
		return HSAKMT_STATUS_ERROR;

	shmem_fd = shm_open(shmem_name, O_CREAT | O_RDWR, 0666);
	if (shmem_fd < 0)
		goto exit_1;

	if (ftruncate(shmem_fd, sizeof(struct perf_shared_table)) < 0)
		goto exit_2;

	shared_table = mmap(NULL, sizeof(*shared_table),
			    PROT_READ | PROT_WRITE, MAP_SHARED, shmem_fd, 0);
	if (shared_table == MAP_FAILED)
		goto exit_2;

	return HSAKMT_STATUS_SUCCESS;

exit_2:
	shm_unlink(shmem_name);
	shmem_fd = 0;
exit_1:
	sem_close(sem);
	sem_unlink(sem_name);
	sem = SEM_FAILED;
	return HSAKMT_STATUS_ERROR;
}

static void destroy_shared_region(void)
{
	if (shared_table && shared_table != MAP_FAILED)
		munmap(shared_table, sizeof(*shared_table));
	if (shmem_fd > 0) {
		close(shmem_fd);
		shm_unlink(shmem_name);
	}
	if (sem != SEM_FAILED) {
		sem_close(sem);
		sem_unlink(sem_name);
		sem = SEM_FAILED;
	}
}
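
/*
 * Privileged (IOMMU) counter slots are a machine-wide resource, so the number
 * of free slots is kept in a POSIX shared-memory segment that every process
 * using this library maps, and updates to it are serialized with a named
 * semaphore. The magic number marks the table as already initialized, so a
 * later process does not reset the slot count while another process still
 * holds slots.
 */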
static void init_perf_shared_table(void)
{
	sem_wait(sem);

	/* If the magic number exists, the perf shared table has been
	 * initialized by another process and is in use. Don't overwrite it.
	 */
	if (shared_table->magic4cc == HSA_PERF_MAGIC4CC) {
		sem_post(sem);
		return;
	}

	/* write the perf content */
	shared_table->magic4cc = HSA_PERF_MAGIC4CC;
	shared_table->iommu_slots_left =
		pmc_table_get_max_concurrent(PERFCOUNTER_BLOCKID__IOMMUV2);

	sem_post(sem);
}

HSAKMT_STATUS init_counter_props(unsigned int NumNodes)
{
	counter_props = calloc(NumNodes, sizeof(struct HsaCounterProperties *));
	if (!counter_props) {
		pr_warn("Profiling is not available.\n");
		return HSAKMT_STATUS_NO_MEMORY;
	}

	counter_props_count = NumNodes;
	alloc_pmc_blocks();

	if (init_shared_region() != HSAKMT_STATUS_SUCCESS) {
		pr_warn("Profiling of privileged blocks is not available.\n");
		return HSAKMT_STATUS_ERROR;
	}

	init_perf_shared_table();

	return HSAKMT_STATUS_SUCCESS;
}

void destroy_counter_props(void)
{
	unsigned int i;

	destroy_shared_region();

	if (!counter_props)
		return;

	for (i = 0; i < counter_props_count; i++)
		if (counter_props[i]) {
			free(counter_props[i]);
			counter_props[i] = NULL;
		}

	free(counter_props);
	free_pmc_blocks();
}

static int blockid2uuid(enum perf_block_id block_id, HSA_UUID *uuid)
{
	int rc = 0;

	switch (block_id) {
	case PERFCOUNTER_BLOCKID__CB:
		*uuid = HSA_PROFILEBLOCK_AMD_CB;
		break;
	case PERFCOUNTER_BLOCKID__CPF:
		*uuid = HSA_PROFILEBLOCK_AMD_CPF;
		break;
	case PERFCOUNTER_BLOCKID__CPG:
		*uuid = HSA_PROFILEBLOCK_AMD_CPG;
		break;
	case PERFCOUNTER_BLOCKID__DB:
		*uuid = HSA_PROFILEBLOCK_AMD_DB;
		break;
	case PERFCOUNTER_BLOCKID__GDS:
		*uuid = HSA_PROFILEBLOCK_AMD_GDS;
		break;
	case PERFCOUNTER_BLOCKID__GRBM:
		*uuid = HSA_PROFILEBLOCK_AMD_GRBM;
		break;
	case PERFCOUNTER_BLOCKID__GRBMSE:
		*uuid = HSA_PROFILEBLOCK_AMD_GRBMSE;
		break;
	case PERFCOUNTER_BLOCKID__IA:
		*uuid = HSA_PROFILEBLOCK_AMD_IA;
		break;
	case PERFCOUNTER_BLOCKID__MC:
		*uuid = HSA_PROFILEBLOCK_AMD_MC;
		break;
	case PERFCOUNTER_BLOCKID__PASC:
		*uuid = HSA_PROFILEBLOCK_AMD_PASC;
		break;
	case PERFCOUNTER_BLOCKID__PASU:
		*uuid = HSA_PROFILEBLOCK_AMD_PASU;
		break;
	case PERFCOUNTER_BLOCKID__SPI:
		*uuid = HSA_PROFILEBLOCK_AMD_SPI;
		break;
	case PERFCOUNTER_BLOCKID__SRBM:
		*uuid = HSA_PROFILEBLOCK_AMD_SRBM;
		break;
	case PERFCOUNTER_BLOCKID__SQ:
		*uuid = HSA_PROFILEBLOCK_AMD_SQ;
		break;
	case PERFCOUNTER_BLOCKID__SX:
		*uuid = HSA_PROFILEBLOCK_AMD_SX;
		break;
	case PERFCOUNTER_BLOCKID__TA:
		*uuid = HSA_PROFILEBLOCK_AMD_TA;
		break;
	case PERFCOUNTER_BLOCKID__TCA:
		*uuid = HSA_PROFILEBLOCK_AMD_TCA;
		break;
	case PERFCOUNTER_BLOCKID__TCC:
		*uuid = HSA_PROFILEBLOCK_AMD_TCC;
		break;
	case PERFCOUNTER_BLOCKID__TCP:
		*uuid = HSA_PROFILEBLOCK_AMD_TCP;
		break;
	case PERFCOUNTER_BLOCKID__TCS:
		*uuid = HSA_PROFILEBLOCK_AMD_TCS;
		break;
	case PERFCOUNTER_BLOCKID__TD:
		*uuid = HSA_PROFILEBLOCK_AMD_TD;
		break;
	case PERFCOUNTER_BLOCKID__VGT:
		*uuid = HSA_PROFILEBLOCK_AMD_VGT;
		break;
	case PERFCOUNTER_BLOCKID__WD:
		*uuid = HSA_PROFILEBLOCK_AMD_WD;
		break;
	case PERFCOUNTER_BLOCKID__IOMMUV2:
		*uuid = HSA_PROFILEBLOCK_AMD_IOMMUV2;
		break;
	default:
		/* If we reach this point, it's a bug */
		rc = -1;
		break;
	}

	return rc;
}

static HSAuint32 get_block_concurrent_limit(uint32_t node_id,
					    HSAuint32 block_id)
{
	uint32_t i;
	HsaCounterBlockProperties *block = &counter_props[node_id]->Blocks[0];

	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (block->Counters[0].BlockIndex == block_id)
			return block->NumConcurrent;
		block = (HsaCounterBlockProperties *)&block->Counters[block->NumCounters];
	}

	return 0;
}
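
/*
 * HsaCounterBlockProperties ends in a variable-length Counters[] array, so
 * consecutive blocks are packed back to back in counter_props[] rather than
 * stored at a fixed stride. Stepping to the next block therefore means
 * jumping past the current block's NumCounters entries, which is what the
 * cast in get_block_concurrent_limit() above does; hsaKmtPmcGetCounterProperties()
 * further below builds the table with the same walk.
 */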
static HSAKMT_STATUS update_block_slots(enum perf_trace_action action,
					uint32_t block_id, uint32_t num_slots)
{
	uint32_t *slots_left;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	if (shmem_fd <= 0)
		return HSAKMT_STATUS_UNAVAILABLE;
	if (sem == SEM_FAILED)
		return HSAKMT_STATUS_UNAVAILABLE;

	sem_wait(sem);

	if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2)
		slots_left = &shared_table->iommu_slots_left;
	else {
		ret = HSAKMT_STATUS_UNAVAILABLE;
		goto out;
	}

	switch (action) {
	case PERF_TRACE_ACTION__ACQUIRE:
		if (*slots_left >= num_slots)
			*slots_left -= num_slots;
		else
			ret = HSAKMT_STATUS_UNAVAILABLE;
		break;
	case PERF_TRACE_ACTION__RELEASE:
		if ((*slots_left + num_slots) <=
				pmc_table_get_max_concurrent(block_id))
			*slots_left += num_slots;
		else
			ret = HSAKMT_STATUS_ERROR;
		break;
	default:
		ret = HSAKMT_STATUS_INVALID_PARAMETER;
		break;
	}

out:
	sem_post(sem);
	return ret;
}

static unsigned int get_perf_event_type(enum perf_block_id block_id)
{
	FILE *file = NULL;
	unsigned int type = 0;

	if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2) {
		/* Starting from kernel 4.12, amd_iommu_0 is used */
		file = fopen("/sys/bus/event_source/devices/amd_iommu_0/type", "r");
		if (!file)
			/* kernel 4.11 and older */
			file = fopen("/sys/bus/event_source/devices/amd_iommu/type", "r");
	}

	if (!file)
		return 0;

	if (fscanf(file, "%u", &type) != 1)
		type = 0;
	fclose(file);

	return type;
}

/* close_perf_event_fd - Close all FDs opened for this block.
 * When RT acquires the trace access, RT has no idea about each
 * individual FD opened for this block. We should treat the whole
 * block as one and close all of them.
 */
static void close_perf_event_fd(struct perf_trace_block *block)
{
	uint32_t i;

	if (!block || !block->perf_event_fd)
		return;

	for (i = 0; i < block->num_counters; i++)
		if (block->perf_event_fd[i] > 0) {
			close(block->perf_event_fd[i]);
			block->perf_event_fd[i] = 0;
		}
}
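
/*
 * Each registered counter is backed by one perf_event FD obtained with the
 * perf_event_open() syscall: pid = -1 and cpu = 0 select a system-wide event
 * bound to CPU 0, group_fd = -1 leaves the event ungrouped, and the event is
 * created disabled so that all counters in a block can be enabled together
 * later. With read_format set to PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING, a read() on such an FD returns three
 * 64-bit values - the counter value, the time the event was enabled and the
 * time it was actually running - which is exactly the layout of
 * struct perf_counts_values consumed by query_trace().
 */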
/* open_perf_event_fd - Open FDs required for this block.
 * If one of them fails, we should close all FDs that have been
 * opened, because RT has no idea about the FDs that were successfully
 * opened and it won't send anything to close them.
 */
static HSAKMT_STATUS open_perf_event_fd(struct perf_trace_block *block)
{
#ifdef __linux__
	struct perf_event_attr attr;
	uint32_t i;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	if (!block || !block->perf_event_fd)
		return HSAKMT_STATUS_INVALID_HANDLE;

	if (getuid()) {
		pr_err("Must be root to open perf_event.\n");
		return HSAKMT_STATUS_ERROR;
	}

	memset(&attr, 0, sizeof(struct perf_event_attr));
	attr.type = get_perf_event_type(block->block_id);
	if (!attr.type)
		return HSAKMT_STATUS_ERROR;

	for (i = 0; i < block->num_counters; i++) {
		attr.size = sizeof(struct perf_event_attr);
		attr.config = block->counter_id[i];
		attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				   PERF_FORMAT_TOTAL_TIME_RUNNING;
		attr.disabled = 1;
		attr.inherit = 1;
		/* We are profiling system wide, not per cpu, so no threads,
		 * no groups -> pid=-1 and group_fd=-1.
		 * cpu = 0
		 * flags = PERF_FLAG_FD_NO_GROUP
		 */
		block->perf_event_fd[i] = syscall(__NR_perf_event_open, &attr,
						  -1, 0, -1,
						  PERF_FLAG_FD_NO_GROUP);
		if (block->perf_event_fd[i] < 0) {
			ret = HSAKMT_STATUS_ERROR;
			close_perf_event_fd(block);
			break;
		}
	}
	return ret;
#else
	return HSAKMT_STATUS_ERROR;
#endif
}

static HSAKMT_STATUS perf_trace_ioctl(struct perf_trace_block *block, uint32_t cmd)
{
	uint32_t i;

	for (i = 0; i < block->num_counters; i++) {
		if (block->perf_event_fd[i] < 0)
			return HSAKMT_STATUS_UNAVAILABLE;
		if (ioctl(block->perf_event_fd[i], cmd, NULL))
			return HSAKMT_STATUS_ERROR;
	}

	return HSAKMT_STATUS_SUCCESS;
}

static HSAKMT_STATUS query_trace(int fd, uint64_t *buf)
{
#ifdef __linux__
	struct perf_counts_values content;

	if (fd < 0)
		return HSAKMT_STATUS_ERROR;

	if (readn(fd, &content, sizeof(content)) != sizeof(content))
		return HSAKMT_STATUS_ERROR;

	*buf = content.val;

	return HSAKMT_STATUS_SUCCESS;
#else
	return HSAKMT_STATUS_ERROR;
#endif
}

HSAKMT_STATUS HSAKMTAPI hsaKmtPmcGetCounterProperties(HSAuint32 NodeId,
						      HsaCounterProperties **CounterProperties)
{
#ifdef __linux__
	HSAKMT_STATUS rc = HSAKMT_STATUS_SUCCESS;
	uint32_t gpu_id, i, block_id;
	uint32_t counter_props_size = 0;
	uint32_t total_counters = 0;
	uint32_t total_concurrent = 0;
	struct perf_counter_block block = {0};
	uint32_t total_blocks = 0;
	HsaCounterBlockProperties *block_prop;

	if (!counter_props)
		return HSAKMT_STATUS_NO_MEMORY;

	if (!CounterProperties)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	if (counter_props[NodeId]) {
		*CounterProperties = counter_props[NodeId];
		return HSAKMT_STATUS_SUCCESS;
	}

	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		rc = get_block_properties(NodeId, i, &block);
		if (rc != HSAKMT_STATUS_SUCCESS)
			return rc;
		total_concurrent += block.num_of_slots;
		total_counters += block.num_of_counters;
		/* If num_of_slots=0, this block doesn't exist */
		if (block.num_of_slots)
			total_blocks++;
	}

	counter_props_size = sizeof(HsaCounterProperties) +
			sizeof(HsaCounterBlockProperties) * (total_blocks - 1) +
			sizeof(HsaCounter) * (total_counters - total_blocks);

	counter_props[NodeId] = malloc(counter_props_size);
	if (!counter_props[NodeId])
		return HSAKMT_STATUS_NO_MEMORY;

	counter_props[NodeId]->NumBlocks = total_blocks;
	counter_props[NodeId]->NumConcurrent = total_concurrent;

	block_prop = &counter_props[NodeId]->Blocks[0];
	for (block_id = 0; block_id < PERFCOUNTER_BLOCKID__MAX; block_id++) {
		rc = get_block_properties(NodeId, block_id, &block);
		if (rc != HSAKMT_STATUS_SUCCESS) {
			free(counter_props[NodeId]);
			counter_props[NodeId] = NULL;
			return rc;
		}
		if (!block.num_of_slots) /* not a valid block */
			continue;
		blockid2uuid(block_id, &block_prop->BlockId);
		block_prop->NumCounters = block.num_of_counters;
		block_prop->NumConcurrent = block.num_of_slots;
		for (i = 0; i < block.num_of_counters; i++) {
			block_prop->Counters[i].BlockIndex = block_id;
			block_prop->Counters[i].CounterId = block.counter_ids[i];
			block_prop->Counters[i].CounterSizeInBits =
					block.counter_size_in_bits;
			block_prop->Counters[i].CounterMask = block.counter_mask;
			block_prop->Counters[i].Flags.ui32.Global = 1;
			if (block_id == PERFCOUNTER_BLOCKID__IOMMUV2)
				block_prop->Counters[i].Type =
					HSA_PROFILE_TYPE_PRIVILEGED_IMMEDIATE;
			else
				block_prop->Counters[i].Type =
					HSA_PROFILE_TYPE_NONPRIV_IMMEDIATE;
		}
		block_prop = (HsaCounterBlockProperties *)&block_prop->Counters[block_prop->NumCounters];
	}

	*CounterProperties = counter_props[NodeId];

	return HSAKMT_STATUS_SUCCESS;
#else
	return HSAKMT_STATUS_ERROR;
#endif
}
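
/*
 * A minimal sketch of how a client is expected to drive this API, assuming a
 * valid NodeId and a caller-chosen counter selection (names such as `props`,
 * `root`, `Counters` and `NumberOfCounters` are illustrative only):
 *
 *	HsaCounterProperties *props;
 *	HsaPmcTraceRoot root;
 *	uint64_t *buf;
 *
 *	hsaKmtPmcGetCounterProperties(NodeId, &props);
 *	// pick Counters[] out of props->Blocks[] ...
 *	hsaKmtPmcRegisterTrace(NodeId, NumberOfCounters, Counters, &root);
 *	buf = malloc(root.TraceBufferMinSizeBytes);
 *	hsaKmtPmcAcquireTraceAccess(NodeId, root.TraceId);
 *	hsaKmtPmcStartTrace(root.TraceId, buf, root.TraceBufferMinSizeBytes);
 *	hsaKmtPmcQueryTrace(root.TraceId);	// snapshot counters into buf
 *	hsaKmtPmcStopTrace(root.TraceId);
 *	hsaKmtPmcReleaseTraceAccess(NodeId, root.TraceId);
 *	hsaKmtPmcUnregisterTrace(NodeId, root.TraceId);
 */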
/* Registers a set of (HW) counters to be used for tracing/profiling */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcRegisterTrace(HSAuint32 NodeId,
					       HSAuint32 NumberOfCounters,
					       HsaCounter *Counters,
					       HsaPmcTraceRoot *TraceRoot)
{
	uint32_t gpu_id, i, j;
	uint64_t min_buf_size = 0;
	struct perf_trace *trace = NULL;
	uint32_t concurrent_limit;
	const uint32_t MAX_COUNTERS = 512;
	uint64_t counter_id[PERFCOUNTER_BLOCKID__MAX][MAX_COUNTERS];
	uint32_t num_counters[PERFCOUNTER_BLOCKID__MAX] = {0};
	uint32_t block, num_blocks = 0, total_counters = 0;
	uint64_t *counter_id_ptr;
	int *fd_ptr;

	pr_debug("[%s] Number of counters %d\n", __func__, NumberOfCounters);

	if (!counter_props)
		return HSAKMT_STATUS_NO_MEMORY;

	if (!Counters || !TraceRoot || NumberOfCounters == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	if (NumberOfCounters > MAX_COUNTERS) {
		pr_err("MAX_COUNTERS is too small for %d.\n", NumberOfCounters);
		return HSAKMT_STATUS_NO_MEMORY;
	}

	/* Calculating the minimum buffer size */
	for (i = 0; i < NumberOfCounters; i++) {
		if (Counters[i].BlockIndex >= PERFCOUNTER_BLOCKID__MAX)
			return HSAKMT_STATUS_INVALID_PARAMETER;
		/* Only privileged counters need to register */
		if (Counters[i].Type > HSA_PROFILE_TYPE_PRIVILEGED_STREAMING)
			continue;
		min_buf_size += Counters[i].CounterSizeInBits / BITS_PER_BYTE;
		/* j: the first blank entry in the block to record counter_id */
		j = num_counters[Counters[i].BlockIndex];
		counter_id[Counters[i].BlockIndex][j] = Counters[i].CounterId;
		num_counters[Counters[i].BlockIndex]++;
		total_counters++;
	}

	/* Verify that the number of counters per block is not larger than the
	 * number of slots.
	 */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (!num_counters[i])
			continue;
		concurrent_limit = get_block_concurrent_limit(NodeId, i);
		if (!concurrent_limit) {
			pr_err("Invalid block ID: %d\n", i);
			return HSAKMT_STATUS_INVALID_PARAMETER;
		}
		if (num_counters[i] > concurrent_limit) {
			pr_err("Counters exceed the limit.\n");
			return HSAKMT_STATUS_INVALID_PARAMETER;
		}
		num_blocks++;
	}

	if (!num_blocks)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	/* Now we have sorted blocks/counters information in
	 * num_counters[block_id] and counter_id[block_id][]. Allocate trace
	 * and record the information.
	 */
	trace = (struct perf_trace *)calloc(sizeof(struct perf_trace) +
			sizeof(struct perf_trace_block) * num_blocks +
			sizeof(uint64_t) * total_counters +
			sizeof(int) * total_counters, 1);
	if (!trace)
		return HSAKMT_STATUS_NO_MEMORY;

	/* Allocated area is partitioned as:
	 * +---------------------------------+ trace
	 * | perf_trace                      |
	 * |---------------------------------| trace->blocks[0]
	 * | perf_trace_block 0              |
	 * | ....                            |
	 * | perf_trace_block N-1            | trace->blocks[N-1]
	 * |---------------------------------| <-- counter_id_ptr starts here
	 * | block 0's counter IDs(uint64_t) |
	 * | ......                          |
	 * | block N-1's counter IDs         |
	 * |---------------------------------| <-- perf_event_fd starts here
	 * | block 0's perf_event_fds(int)   |
	 * | ......                          |
	 * | block N-1's perf_event_fds      |
	 * +---------------------------------+
	 */
	block = 0;
	counter_id_ptr = (uint64_t *)((char *)trace + sizeof(struct perf_trace)
			+ sizeof(struct perf_trace_block) * num_blocks);
	fd_ptr = (int *)(counter_id_ptr + total_counters);

	/* Fill in each block's information to the TraceId */
	for (i = 0; i < PERFCOUNTER_BLOCKID__MAX; i++) {
		if (!num_counters[i]) /* not a block to trace */
			continue;
		/* Following perf_trace + perf_trace_block x N are those
		 * counter_id arrays. Assign the counter_id array belonging to
		 * this block.
		 */
		trace->blocks[block].counter_id = counter_id_ptr;
		/* Fill in counter IDs to the counter_id array. */
		for (j = 0; j < num_counters[i]; j++)
			trace->blocks[block].counter_id[j] = counter_id[i][j];
		trace->blocks[block].perf_event_fd = fd_ptr;
		/* how many counters to trace */
		trace->blocks[block].num_counters = num_counters[i];
		/* block index in "enum perf_block_id" */
		trace->blocks[block].block_id = i;
		block++; /* move to next */
		counter_id_ptr += num_counters[i];
		fd_ptr += num_counters[i];
	}

	trace->magic4cc = HSA_PERF_MAGIC4CC;
	trace->gpu_id = gpu_id;
	trace->state = PERF_TRACE_STATE__STOPPED;
	trace->num_blocks = num_blocks;

	TraceRoot->NumberOfPasses = 1;
	TraceRoot->TraceBufferMinSizeBytes = PAGE_ALIGN_UP(min_buf_size);
	TraceRoot->TraceId = PORT_VPTR_TO_UINT64(trace);

	return HSAKMT_STATUS_SUCCESS;
}
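
/*
 * The trace handle returned above is simply the address of that single
 * calloc'd region cast to a 64-bit TraceId, so checking the embedded magic
 * number is the only sanity check available when the ID comes back in, and
 * one free() in hsaKmtPmcUnregisterTrace() releases the header, the block
 * descriptors, the counter IDs and the perf_event FD slots together.
 */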
/* Unregisters a set of (HW) counters used for tracing/profiling */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcUnregisterTrace(HSAuint32 NodeId,
						 HSATraceId TraceId)
{
	uint32_t gpu_id;
	struct perf_trace *trace;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	if (trace->gpu_id != gpu_id)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	/* If the trace is in the running state, stop it */
	if (trace->state == PERF_TRACE_STATE__STARTED) {
		HSAKMT_STATUS status = hsaKmtPmcStopTrace(TraceId);

		if (status != HSAKMT_STATUS_SUCCESS)
			return status;
	}

	free(trace);

	return HSAKMT_STATUS_SUCCESS;
}

HSAKMT_STATUS HSAKMTAPI hsaKmtPmcAcquireTraceAccess(HSAuint32 NodeId,
						    HSATraceId TraceId)
{
	struct perf_trace *trace;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	uint32_t gpu_id, i;
	int j;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	if (validate_nodeid(NodeId, &gpu_id) != HSAKMT_STATUS_SUCCESS)
		return HSAKMT_STATUS_INVALID_NODE_UNIT;

	for (i = 0; i < trace->num_blocks; i++) {
		ret = update_block_slots(PERF_TRACE_ACTION__ACQUIRE,
					 trace->blocks[i].block_id,
					 trace->blocks[i].num_counters);
		if (ret != HSAKMT_STATUS_SUCCESS)
			goto out;
		ret = open_perf_event_fd(&trace->blocks[i]);
		if (ret != HSAKMT_STATUS_SUCCESS) {
			i++; /* to release slots just reserved */
			goto out;
		}
	}

out:
	if (ret != HSAKMT_STATUS_SUCCESS) {
		for (j = i - 1; j >= 0; j--) {
			update_block_slots(PERF_TRACE_ACTION__RELEASE,
					   trace->blocks[j].block_id,
					   trace->blocks[j].num_counters);
			close_perf_event_fd(&trace->blocks[j]);
		}
	}

	return ret;
}
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcReleaseTraceAccess(HSAuint32 NodeId,
						    HSATraceId TraceId)
{
	struct perf_trace *trace;
	uint32_t i;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	trace = (struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	for (i = 0; i < trace->num_blocks; i++) {
		update_block_slots(PERF_TRACE_ACTION__RELEASE,
				   trace->blocks[i].block_id,
				   trace->blocks[i].num_counters);
		close_perf_event_fd(&trace->blocks[i]);
	}

	return HSAKMT_STATUS_SUCCESS;
}

/* Starts a tracing operation on a previously established set of performance
 * counters.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStartTrace(HSATraceId TraceId,
					    void *TraceBuffer,
					    HSAuint64 TraceBufferSizeBytes)
{
#ifdef __linux__
	struct perf_trace *trace =
		(struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	uint32_t i;
	int32_t j;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0 || !TraceBuffer || TraceBufferSizeBytes == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	for (i = 0; i < trace->num_blocks; i++) {
		ret = perf_trace_ioctl(&trace->blocks[i],
				       PERF_EVENT_IOC_ENABLE);
		if (ret != HSAKMT_STATUS_SUCCESS)
			break;
	}

	if (ret != HSAKMT_STATUS_SUCCESS) {
		/* Disable the blocks already enabled before returning the
		 * failure.
		 */
		j = (int32_t)i;
		while (--j >= 0)
			perf_trace_ioctl(&trace->blocks[j],
					 PERF_EVENT_IOC_DISABLE);
		return ret;
	}

	trace->state = PERF_TRACE_STATE__STARTED;
	trace->buf = TraceBuffer;
	trace->buf_size = TraceBufferSizeBytes;

	return HSAKMT_STATUS_SUCCESS;
#else
	return HSAKMT_STATUS_ERROR;
#endif
}

/* Forces an update of all the counters that a previously started trace
 * operation has registered.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcQueryTrace(HSATraceId TraceId)
{
	struct perf_trace *trace =
		(struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	uint32_t i, j;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;
	uint64_t *buf;
	uint64_t buf_filled = 0;

	if (TraceId == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	buf = (uint64_t *)trace->buf;
	pr_debug("[%s] Trace buffer(%p): ", __func__, buf);

	for (i = 0; i < trace->num_blocks; i++)
		for (j = 0; j < trace->blocks[i].num_counters; j++) {
			buf_filled += sizeof(uint64_t);
			if (buf_filled > trace->buf_size)
				return HSAKMT_STATUS_NO_MEMORY;
			ret = query_trace(trace->blocks[i].perf_event_fd[j], buf);
			if (ret != HSAKMT_STATUS_SUCCESS)
				return ret;
			pr_debug("%lu_", *buf);
			buf++;
		}
	pr_debug("\n");

	return HSAKMT_STATUS_SUCCESS;
}

/* Stops a tracing operation on a previously established set of performance
 * counters.
 */
HSAKMT_STATUS HSAKMTAPI hsaKmtPmcStopTrace(HSATraceId TraceId)
{
#ifdef __linux__
	struct perf_trace *trace =
		(struct perf_trace *)PORT_UINT64_TO_VPTR(TraceId);
	uint32_t i;
	HSAKMT_STATUS ret = HSAKMT_STATUS_SUCCESS;

	pr_debug("[%s] Trace ID 0x%lx\n", __func__, TraceId);

	if (TraceId == 0)
		return HSAKMT_STATUS_INVALID_PARAMETER;

	if (trace->magic4cc != HSA_PERF_MAGIC4CC)
		return HSAKMT_STATUS_INVALID_HANDLE;

	for (i = 0; i < trace->num_blocks; i++) {
		ret = perf_trace_ioctl(&trace->blocks[i],
				       PERF_EVENT_IOC_DISABLE);
		if (ret != HSAKMT_STATUS_SUCCESS)
			return ret;
	}

	trace->state = PERF_TRACE_STATE__STOPPED;

	return ret;
#else
	return HSAKMT_STATUS_ERROR;
#endif
}