1 /*
2 * Copyright (C) 2020-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8 #include "level_zero/tools/source/sysman/global_operations/linux/os_global_operations_imp.h"
9
10 #include "shared/source/os_interface/device_factory.h"
11
12 #include "level_zero/core/source/device/device_imp.h"
13 #include "level_zero/tools/source/sysman/global_operations/global_operations_imp.h"
14 #include "level_zero/tools/source/sysman/linux/fs_access.h"
15 #include "level_zero/tools/source/sysman/sysman_const.h"
16 #include <level_zero/zet_api.h>
17
18 #include <chrono>
19 #include <time.h>
20
21 namespace L0 {
22
23 const std::string LinuxGlobalOperationsImp::deviceDir("device");
24 const std::string LinuxGlobalOperationsImp::subsystemVendorFile("device/subsystem_vendor");
25 const std::string LinuxGlobalOperationsImp::driverFile("device/driver");
26 const std::string LinuxGlobalOperationsImp::functionLevelReset("device/reset");
27 const std::string LinuxGlobalOperationsImp::clientsDir("clients");
28 const std::string LinuxGlobalOperationsImp::srcVersionFile("/sys/module/i915/srcversion");
29 const std::string LinuxGlobalOperationsImp::agamaVersionFile("/sys/module/i915/agama_version");
30 const std::string LinuxGlobalOperationsImp::ueventWedgedFile("/var/lib/libze_intel_gpu/wedged_file");
31
// Map the engine entries (numeric values) present in /sys/class/drm/card<n>/clients/<client_n>/busy
// to the engine type flags defined in the Level Zero spec.
// Note that entries 2 and 3 (represented by i915 as CLASS_VIDEO and CLASS_VIDEO_ENHANCE)
// are both mapped to MEDIA, as CLASS_VIDEO represents any media fixed-function hardware.
36 static const std::map<int, zes_engine_type_flags_t> engineMap = {
37 {0, ZES_ENGINE_TYPE_FLAG_3D},
38 {1, ZES_ENGINE_TYPE_FLAG_DMA},
39 {2, ZES_ENGINE_TYPE_FLAG_MEDIA},
40 {3, ZES_ENGINE_TYPE_FLAG_MEDIA},
41 {4, ZES_ENGINE_TYPE_FLAG_COMPUTE}};
42
getSerialNumber(char (& serialNumber)[ZES_STRING_PROPERTY_SIZE])43 void LinuxGlobalOperationsImp::getSerialNumber(char (&serialNumber)[ZES_STRING_PROPERTY_SIZE]) {
44 std::strncpy(serialNumber, unknown.c_str(), ZES_STRING_PROPERTY_SIZE);
45 }
46
getDevice()47 Device *LinuxGlobalOperationsImp::getDevice() {
48 return pDevice;
49 }
50
getBoardNumber(char (& boardNumber)[ZES_STRING_PROPERTY_SIZE])51 void LinuxGlobalOperationsImp::getBoardNumber(char (&boardNumber)[ZES_STRING_PROPERTY_SIZE]) {
52 std::strncpy(boardNumber, unknown.c_str(), ZES_STRING_PROPERTY_SIZE);
53 }
54
getBrandName(char (& brandName)[ZES_STRING_PROPERTY_SIZE])55 void LinuxGlobalOperationsImp::getBrandName(char (&brandName)[ZES_STRING_PROPERTY_SIZE]) {
56 std::string strVal;
57 ze_result_t result = pSysfsAccess->read(subsystemVendorFile, strVal);
58 if (ZE_RESULT_SUCCESS != result) {
59 std::strncpy(brandName, unknown.c_str(), ZES_STRING_PROPERTY_SIZE);
60 return;
61 }
62 if (strVal.compare(intelPciId) == 0) {
63 std::strncpy(brandName, vendorIntel.c_str(), ZES_STRING_PROPERTY_SIZE);
64 } else {
65 std::strncpy(brandName, unknown.c_str(), ZES_STRING_PROPERTY_SIZE);
66 }
67 }
68
getModelName(char (& modelName)[ZES_STRING_PROPERTY_SIZE])69 void LinuxGlobalOperationsImp::getModelName(char (&modelName)[ZES_STRING_PROPERTY_SIZE]) {
70 NEO::Device *neoDevice = pDevice->getNEODevice();
71 std::string deviceModelName = neoDevice->getDeviceName(neoDevice->getHardwareInfo());
72 std::strncpy(modelName, deviceModelName.c_str(), ZES_STRING_PROPERTY_SIZE);
73 }
74
getVendorName(char (& vendorName)[ZES_STRING_PROPERTY_SIZE])75 void LinuxGlobalOperationsImp::getVendorName(char (&vendorName)[ZES_STRING_PROPERTY_SIZE]) {
76 ze_device_properties_t coreDeviceProperties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES};
77 pDevice->getProperties(&coreDeviceProperties);
78 std::stringstream pciId;
79 pciId << std::hex << coreDeviceProperties.vendorId;
80 if (("0x" + pciId.str()).compare(intelPciId) == 0) {
81 std::strncpy(vendorName, vendorIntel.c_str(), ZES_STRING_PROPERTY_SIZE);
82 } else {
83 std::strncpy(vendorName, unknown.c_str(), ZES_STRING_PROPERTY_SIZE);
84 }
85 }
86
getDriverVersion(char (& driverVersion)[ZES_STRING_PROPERTY_SIZE])87 void LinuxGlobalOperationsImp::getDriverVersion(char (&driverVersion)[ZES_STRING_PROPERTY_SIZE]) {
88 std::string strVal;
89 std::strncpy(driverVersion, unknown.c_str(), ZES_STRING_PROPERTY_SIZE);
90 ze_result_t result = pFsAccess->read(agamaVersionFile, strVal);
91 if (ZE_RESULT_SUCCESS != result) {
92 if (ZE_RESULT_ERROR_NOT_AVAILABLE != result) {
93 return;
94 }
95 result = pFsAccess->read(srcVersionFile, strVal);
96 if (ZE_RESULT_SUCCESS != result) {
97 return;
98 }
99 }
100 std::strncpy(driverVersion, strVal.c_str(), ZES_STRING_PROPERTY_SIZE);
101 return;
102 }
103
getPidFdsForOpenDevice(ProcfsAccess * pProcfsAccess,SysfsAccess * pSysfsAccess,const::pid_t pid,std::vector<int> & deviceFds)104 static void getPidFdsForOpenDevice(ProcfsAccess *pProcfsAccess, SysfsAccess *pSysfsAccess, const ::pid_t pid, std::vector<int> &deviceFds) {
105 // Return a list of all the file descriptors of this process that point to this device
106 std::vector<int> fds;
107 deviceFds.clear();
108 if (ZE_RESULT_SUCCESS != pProcfsAccess->getFileDescriptors(pid, fds)) {
109 // Process exited. Not an error. Just ignore.
110 return;
111 }
112 for (auto &&fd : fds) {
113 std::string file;
114 if (pProcfsAccess->getFileName(pid, fd, file) != ZE_RESULT_SUCCESS) {
115 // Process closed this file. Not an error. Just ignore.
116 continue;
117 }
118 if (pSysfsAccess->isMyDeviceFile(file)) {
119 deviceFds.push_back(fd);
120 }
121 }
122 }
123
releaseSysmanDeviceResources()124 void LinuxGlobalOperationsImp::releaseSysmanDeviceResources() {
125 pLinuxSysmanImp->getSysmanDeviceImp()->pEngineHandleContext->releaseEngines();
126 pLinuxSysmanImp->getSysmanDeviceImp()->pRasHandleContext->releaseRasHandles();
127 pLinuxSysmanImp->getSysmanDeviceImp()->pDiagnosticsHandleContext->releaseDiagnosticsHandles();
128 pLinuxSysmanImp->getSysmanDeviceImp()->pFirmwareHandleContext->releaseFwHandles();
129 pLinuxSysmanImp->releasePmtObject();
130 pLinuxSysmanImp->releaseFwUtilInterface();
131 pLinuxSysmanImp->releaseLocalDrmHandle();
132 }
133
releaseDeviceResources()134 void LinuxGlobalOperationsImp::releaseDeviceResources() {
135 releaseSysmanDeviceResources();
136 auto device = static_cast<DeviceImp *>(getDevice());
137 device->releaseResources();
138 executionEnvironment->memoryManager->releaseDeviceSpecificMemResources(rootDeviceIndex);
139 executionEnvironment->releaseRootDeviceEnvironmentResources(executionEnvironment->rootDeviceEnvironments[rootDeviceIndex].get());
140 executionEnvironment->rootDeviceEnvironments[rootDeviceIndex].reset();
141 }
142
reInitSysmanDeviceResources()143 void LinuxGlobalOperationsImp::reInitSysmanDeviceResources() {
144 pLinuxSysmanImp->getSysmanDeviceImp()->updateSubDeviceHandlesLocally();
145 pLinuxSysmanImp->createPmtHandles();
146 pLinuxSysmanImp->getSysmanDeviceImp()->pRasHandleContext->init(pLinuxSysmanImp->getSysmanDeviceImp()->deviceHandles);
147 pLinuxSysmanImp->getSysmanDeviceImp()->pEngineHandleContext->init();
148 pLinuxSysmanImp->getSysmanDeviceImp()->pDiagnosticsHandleContext->init(pLinuxSysmanImp->getSysmanDeviceImp()->deviceHandles);
149 pLinuxSysmanImp->getSysmanDeviceImp()->pFirmwareHandleContext->init();
150 }
151
initDevice()152 ze_result_t LinuxGlobalOperationsImp::initDevice() {
153 ze_result_t result = ZE_RESULT_SUCCESS;
154 auto device = static_cast<DeviceImp *>(getDevice());
155
156 auto neoDevice = NEO::DeviceFactory::createDevice(*executionEnvironment, devicePciBdf, rootDeviceIndex);
157 if (neoDevice == nullptr) {
158 return ZE_RESULT_ERROR_DEVICE_LOST;
159 }
160 static_cast<L0::DriverHandleImp *>(device->getDriverHandle())->updateRootDeviceBitFields(neoDevice);
161 static_cast<L0::DriverHandleImp *>(device->getDriverHandle())->enableRootDeviceDebugger(neoDevice);
162 Device::deviceReinit(device->getDriverHandle(), device, neoDevice, &result);
163 reInitSysmanDeviceResources();
164 return ZE_RESULT_SUCCESS;
165 }
166
reset(ze_bool_t force)167 ze_result_t LinuxGlobalOperationsImp::reset(ze_bool_t force) {
168 std::string resetPath;
169 std::string resetName;
170 ze_result_t result = ZE_RESULT_SUCCESS;
171
172 pSysfsAccess->getRealPath(functionLevelReset, resetPath);
173 // Must run as root. Verify permission to perform reset.
174 result = pFsAccess->canWrite(resetPath);
175 if (ZE_RESULT_SUCCESS != result) {
176 return result;
177 }
178 pSysfsAccess->getRealPath(deviceDir, resetName);
179 resetName = pFsAccess->getBaseName(resetName);
180
181 ::pid_t myPid = pProcfsAccess->myProcessId();
182 std::vector<int> myPidFds;
183 std::vector<::pid_t> processes;
184
185 result = pProcfsAccess->listProcesses(processes);
186 if (ZE_RESULT_SUCCESS != result) {
187 return result;
188 }
189 for (auto &&pid : processes) {
190 std::vector<int> fds;
191 getPidFdsForOpenDevice(pProcfsAccess, pSysfsAccess, pid, fds);
192 if (pid == myPid) {
193 // L0 is expected to have this file open.
194 // Keep list of fds. Close before unbind.
195 myPidFds = fds;
196 } else if (!fds.empty()) {
197 if (force) {
198 pProcfsAccess->kill(pid);
199 } else {
200 // Device is in use by another process.
201 // Don't reset while in use.
202 return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
203 }
204 }
205 }
206
207 ExecutionEnvironmentRefCountRestore restorer(executionEnvironment);
208 releaseDeviceResources();
209 for (auto &&fd : myPidFds) {
210 // Close open filedescriptors to the device
211 // before unbinding device.
212 // From this point forward, there is no
213 // graceful way to fail the reset call.
214 // All future ze calls by this process for this
215 // device will fail.
216 ::close(fd);
217 }
218
219 // Unbind the device from the kernel driver.
220 result = pSysfsAccess->unbindDevice(resetName);
221 if (ZE_RESULT_SUCCESS != result) {
222 return result;
223 }
224
225 // If someone opened the device
226 // after we check, kill them here.
227 result = pProcfsAccess->listProcesses(processes);
228 if (ZE_RESULT_SUCCESS != result) {
229 return result;
230 }
231 std::vector<::pid_t> deviceUsingPids;
232 deviceUsingPids.clear();
233 for (auto &&pid : processes) {
234 std::vector<int> fds;
235 getPidFdsForOpenDevice(pProcfsAccess, pSysfsAccess, pid, fds);
236 if (!fds.empty()) {
237
238 // Kill all processes that have the device open.
239 pProcfsAccess->kill(pid);
240 deviceUsingPids.push_back(pid);
241 }
242 }
243
244 // Wait for all the processes to exit
245 // If they don't all exit within resetTimeout
246 // just fail reset.
247 auto start = std::chrono::steady_clock::now();
248 auto end = start;
249 for (auto &&pid : deviceUsingPids) {
250 while (pProcfsAccess->isAlive(pid)) {
251 if (std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() > resetTimeout) {
252 return ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE;
253 }
254
255 struct ::timespec timeout = {.tv_sec = 0, .tv_nsec = 1000};
256 ::nanosleep(&timeout, NULL);
257 end = std::chrono::steady_clock::now();
258 }
259 }
260
261 // Reset the device.
262 result = pFsAccess->write(resetPath, "1");
263 if (ZE_RESULT_SUCCESS != result) {
264 return result;
265 }
266
267 // Rebind the device to the kernel driver.
268 result = pSysfsAccess->bindDevice(resetName);
269 if (ZE_RESULT_SUCCESS != result) {
270 return result;
271 }
272
273 return initDevice();
274 }
275
// Processes are exposed in sysfs in the form of clients, like this:
// # /sys/class/drm/card0/clients$ ls
// 4 5
// # /sys/class/drm/card0/clients/4$ ls
// busy name pid
// # /sys/class/drm/card0/clients/4/busy$ ls
// 0 1 2 3
//
// Each entry in the clients directory represents one open drm connection to this device
// (if one process opened the drm device multiple times, then multiple entries will be
// present for the same process in the clients directory).
// (In the example above, the client dirs are 4 and 5.)
// Thus the total number of times a drm connection was opened with this device is 2.
// process.pid = pid (from the example above)
// process.engines -> In each client's busy dir, the numbers 0,1,2,3 represent engines and the files contain
// the accumulated nanoseconds the client spent on each engine.
// Thus we traverse each file in the busy dir looking for a non-zero time; if we find one, say file 0,
// then we can say that engine 0 is used by the process.
scanProcessesState(std::vector<zes_process_state_t> & pProcessList)293 ze_result_t LinuxGlobalOperationsImp::scanProcessesState(std::vector<zes_process_state_t> &pProcessList) {
294 std::vector<std::string> clientIds;
295 struct deviceMemStruct {
296 uint64_t deviceMemorySize;
297 uint64_t deviceSharedMemorySize;
298 };
299 struct engineMemoryPairType {
300 int64_t engineTypeField;
301 deviceMemStruct deviceMemStructField;
302 };
303
304 ze_result_t result = pSysfsAccess->scanDirEntries(clientsDir, clientIds);
305 if (ZE_RESULT_SUCCESS != result) {
306 return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
307 }
308
309 // Create a map with unique pid as key and engineType as value
310 std::map<uint64_t, engineMemoryPairType> pidClientMap;
311 for (const auto &clientId : clientIds) {
312 // realClientPidPath will be something like: clients/<clientId>/pid
313 std::string realClientPidPath = clientsDir + "/" + clientId + "/" + "pid";
314 uint64_t pid;
315 result = pSysfsAccess->read(realClientPidPath, pid);
316
317 if (ZE_RESULT_SUCCESS != result) {
318 std::string bPidString;
319 result = pSysfsAccess->read(realClientPidPath, bPidString);
320 if (result == ZE_RESULT_SUCCESS) {
321 size_t start = bPidString.find("<");
322 size_t end = bPidString.find(">");
323 std::string bPid = bPidString.substr(start + 1, end - start - 1);
324 pid = std::stoull(bPid, nullptr, 10);
325 }
326 }
327
328 if (ZE_RESULT_SUCCESS != result) {
329 if (ZE_RESULT_ERROR_NOT_AVAILABLE == result) {
330 // update the result as Success as ZE_RESULT_ERROR_NOT_AVAILABLE is expected if the "realClientPidPath" folder is empty
331 // this condition(when encountered) must not prevent the information accumulated for other clientIds
332 // this situation occurs when there is no call modifying result,
333 result = ZE_RESULT_SUCCESS;
334 continue;
335 } else {
336 return result;
337 }
338 }
339 // Traverse the clients/<clientId>/busy directory to get accelerator engines used by process
340 std::vector<std::string> engineNums = {};
341 int64_t engineType = 0;
342 std::string busyDirForEngines = clientsDir + "/" + clientId + "/" + "busy";
343 result = pSysfsAccess->scanDirEntries(busyDirForEngines, engineNums);
344 if (ZE_RESULT_SUCCESS != result) {
345 if (ZE_RESULT_ERROR_NOT_AVAILABLE == result) {
346 // update the result as Success as ZE_RESULT_ERROR_NOT_AVAILABLE is expected if the "realClientPidPath" folder is empty
347 // this condition(when encountered) must not prevent the information accumulated for other clientIds
348 // this situation occurs when there is no call modifying result,
349 // Here its seen when the last element of clientIds returns ZE_RESULT_ERROR_NOT_AVAILABLE for some reason.
350 engineType = ZES_ENGINE_TYPE_FLAG_OTHER; // When busy node is absent assign engine type with ZES_ENGINE_TYPE_FLAG_OTHER
351 } else {
352 return result;
353 }
354 }
355 // Scan all engine files present in /sys/class/drm/card0/clients/<ClientId>/busy and check
356 // whether that engine is used by process
357 for (const auto &engineNum : engineNums) {
358 uint64_t timeSpent = 0;
359 std::string engine = busyDirForEngines + "/" + engineNum;
360 result = pSysfsAccess->read(engine, timeSpent);
361 if (ZE_RESULT_SUCCESS != result) {
362 if (ZE_RESULT_ERROR_NOT_AVAILABLE == result) {
363 continue;
364 } else {
365 return result;
366 }
367 }
368 if (timeSpent > 0) {
369 int i915EnginNumber = stoi(engineNum);
370 auto i915MapToL0EngineType = engineMap.find(i915EnginNumber);
371 zes_engine_type_flags_t val = ZES_ENGINE_TYPE_FLAG_OTHER;
372 if (i915MapToL0EngineType != engineMap.end()) {
373 // Found a valid map
374 val = i915MapToL0EngineType->second;
375 }
376 // In this for loop we want to retrieve the overall engines used by process
377 engineType = engineType | val;
378 }
379 }
380
381 uint64_t memSize = 0;
382 std::string realClientTotalMemoryPath = clientsDir + "/" + clientId + "/" + "total_device_memory_buffer_objects" + "/" + "created_bytes";
383 result = pSysfsAccess->read(realClientTotalMemoryPath, memSize);
384 if (ZE_RESULT_SUCCESS != result) {
385 if (ZE_RESULT_ERROR_NOT_AVAILABLE != result) {
386 return result;
387 }
388 }
389
390 uint64_t sharedMemSize = 0;
391 std::string realClientTotalSharedMemoryPath = clientsDir + "/" + clientId + "/" + "total_device_memory_buffer_objects" + "/" + "imported_bytes";
392 result = pSysfsAccess->read(realClientTotalSharedMemoryPath, sharedMemSize);
393 if (ZE_RESULT_SUCCESS != result) {
394 if (ZE_RESULT_ERROR_NOT_AVAILABLE != result) {
395 return result;
396 }
397 }
398 deviceMemStruct totalDeviceMem = {memSize, sharedMemSize};
399 engineMemoryPairType engineMemoryPair = {engineType, totalDeviceMem};
400 auto ret = pidClientMap.insert(std::make_pair(pid, engineMemoryPair));
401 if (ret.second == false) {
402 // insertion failed as entry with same pid already exists in map
403 // Now update the engineMemoryPairType field for the existing pid entry
404 engineMemoryPairType updateEngineMemoryPair;
405 auto pidEntryFromMap = pidClientMap.find(pid);
406 auto existingEngineType = pidEntryFromMap->second.engineTypeField;
407 auto existingdeviceMemorySize = pidEntryFromMap->second.deviceMemStructField.deviceMemorySize;
408 auto existingdeviceSharedMemorySize = pidEntryFromMap->second.deviceMemStructField.deviceSharedMemorySize;
409 updateEngineMemoryPair.engineTypeField = existingEngineType | engineMemoryPair.engineTypeField;
410 updateEngineMemoryPair.deviceMemStructField.deviceMemorySize = existingdeviceMemorySize + engineMemoryPair.deviceMemStructField.deviceMemorySize;
411 updateEngineMemoryPair.deviceMemStructField.deviceSharedMemorySize = existingdeviceSharedMemorySize + engineMemoryPair.deviceMemStructField.deviceSharedMemorySize;
412 pidClientMap[pid] = updateEngineMemoryPair;
413 }
414 result = ZE_RESULT_SUCCESS;
415 }
416
417 // iterate through all elements of pidClientMap
418 for (auto itr = pidClientMap.begin(); itr != pidClientMap.end(); ++itr) {
419 zes_process_state_t process;
420 process.processId = static_cast<uint32_t>(itr->first);
421 process.memSize = itr->second.deviceMemStructField.deviceMemorySize;
422 process.sharedSize = itr->second.deviceMemStructField.deviceSharedMemorySize;
423 process.engines = static_cast<uint32_t>(itr->second.engineTypeField);
424 pProcessList.push_back(process);
425 }
426 return result;
427 }
428
getWedgedStatus(zes_device_state_t * pState)429 void LinuxGlobalOperationsImp::getWedgedStatus(zes_device_state_t *pState) {
430 uint32_t valWedged = 0;
431 if (ZE_RESULT_SUCCESS == pFsAccess->read(ueventWedgedFile, valWedged)) {
432 if (valWedged != 0) {
433 pState->reset |= ZES_RESET_REASON_FLAG_WEDGED;
434 }
435 }
436 }
deviceGetState(zes_device_state_t * pState)437 ze_result_t LinuxGlobalOperationsImp::deviceGetState(zes_device_state_t *pState) {
438 memset(pState, 0, sizeof(zes_device_state_t));
439 pState->repaired = ZES_REPAIR_STATUS_UNSUPPORTED;
440 getWedgedStatus(pState);
441 getRepairStatus(pState);
442 return ZE_RESULT_SUCCESS;
443 }
444
LinuxGlobalOperationsImp(OsSysman * pOsSysman)445 LinuxGlobalOperationsImp::LinuxGlobalOperationsImp(OsSysman *pOsSysman) {
446 pLinuxSysmanImp = static_cast<LinuxSysmanImp *>(pOsSysman);
447
448 pSysfsAccess = &pLinuxSysmanImp->getSysfsAccess();
449 pProcfsAccess = &pLinuxSysmanImp->getProcfsAccess();
450 pFsAccess = &pLinuxSysmanImp->getFsAccess();
451 pDevice = pLinuxSysmanImp->getDeviceHandle();
452 auto device = static_cast<DeviceImp *>(pDevice);
453 devicePciBdf = device->getNEODevice()->getRootDeviceEnvironment().osInterface->getDriverModel()->as<NEO::Drm>()->getPciPath();
454 executionEnvironment = device->getNEODevice()->getExecutionEnvironment();
455 rootDeviceIndex = device->getNEODevice()->getRootDeviceIndex();
456 }
457
create(OsSysman * pOsSysman)458 OsGlobalOperations *OsGlobalOperations::create(OsSysman *pOsSysman) {
459 LinuxGlobalOperationsImp *pLinuxGlobalOperationsImp = new LinuxGlobalOperationsImp(pOsSysman);
460 return static_cast<OsGlobalOperations *>(pLinuxGlobalOperationsImp);
461 }
462
463 } // namespace L0
464