/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "opencl/source/command_queue/command_queue.h"

#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/helpers/get_info.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/api_intercept.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/context/context.h"
#include "opencl/source/device_queue/device_queue.h"
#include "opencl/source/event/event_builder.h"
#include "opencl/source/event/user_event.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_hw_helper.h"
#include "opencl/source/helpers/convert_color.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/helpers/mipmap.h"
#include "opencl/source/helpers/queue_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/program/printf_handler.h"

#include "CL/cl_ext.h"

#include <limits>
#include <map>

namespace NEO {

// Global table of create functions
CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE] = {};

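// Creates a command queue via the GFX-core-specific factory entry registered
// in commandQueueFactory for this device's render core family.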
CommandQueue *CommandQueue::create(Context *context,
                                   ClDevice *device,
                                   const cl_queue_properties *properties,
                                   bool internalUsage,
                                   cl_int &retVal) {
    retVal = CL_SUCCESS;

    auto funcCreate = commandQueueFactory[device->getRenderCoreFamily()];
    DEBUG_BREAK_IF(nullptr == funcCreate);

    return funcCreate(context, device, properties, internalUsage);
}

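// Binds the queue to the device's default GPGPU engine. When the blitter is
// fully supported for this product, a copy (BCS) engine is also selected and
// timestamp packet containers are allocated; the containers are likewise
// needed whenever timestamp packet writes are enabled on the GPGPU CSR.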
CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage)
    : context(context), device(device) {
    if (context) {
        context->incRefInternal();
    }

    commandQueueProperties = getCmdQueueProperties<cl_command_queue_properties>(properties);
    flushStamp.reset(new FlushStampTracker(true));

    if (device) {
        auto &hwInfo = device->getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);

        gpgpuEngine = &device->getDefaultEngine();

        UNRECOVERABLE_IF(gpgpuEngine->getEngineType() >= aub_stream::EngineType::NUM_ENGINES);

        bool bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) &&
                          hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS);

        if (bcsAllowed || gpgpuEngine->commandStreamReceiver->peekTimestampPacketWriteEnabled()) {
            timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
            deferredTimestampPackets = std::make_unique<TimestampPacketContainer>();
        }
        if (bcsAllowed) {
            auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice();
            auto &selectorCopyEngine = neoDevice.getSelectorCopyEngine();
            auto bcsEngineType = EngineHelpers::getBcsEngineType(hwInfo, device->getDeviceBitfield(), selectorCopyEngine, internalUsage);
            bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)] = neoDevice.tryGetEngine(bcsEngineType, EngineUsage::Regular);
            bcsEngineTypes.push_back(bcsEngineType);
        }
    }

    storeProperties(properties);
    processProperties(properties);
}

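// Returns the queue's command buffer to the CSR's reusable allocation pool,
// releases the main BCS engine back to the copy engine selector, and drops
// the internal context reference taken at construction (the special queue is
// owned by the context, so it skips that decrement).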
CommandQueue::~CommandQueue() {
    if (virtualEvent) {
        UNRECOVERABLE_IF(this->virtualEvent->getCommandQueue() != this && this->virtualEvent->getCommandQueue() != nullptr);
        virtualEvent->decRefInternal();
    }

    if (device) {
        auto storageForAllocation = gpgpuEngine->commandStreamReceiver->getInternalAllocationStorage();

        if (commandStream) {
            storageForAllocation->storeAllocation(std::unique_ptr<GraphicsAllocation>(commandStream->getGraphicsAllocation()), REUSABLE_ALLOCATION);
        }
        delete commandStream;

        if (this->perfCountersEnabled) {
            device->getPerformanceCounters()->shutdown();
        }

        if (auto mainBcs = bcsEngines[0]; mainBcs != nullptr) {
            auto &selectorCopyEngine = device->getNearestGenericSubDevice(0)->getSelectorCopyEngine();
            EngineHelpers::releaseBcsEngineType(mainBcs->getEngineType(), selectorCopyEngine);
        }
    }

    timestampPacketContainer.reset();
    // For a normal queue, decrement the ref count on the context.
    // The special queue is owned by the context, so its ref count doesn't have to be decremented.
    if (context && !isSpecialCommandQueue) {
        context->decRefInternal();
    }
    gtpinRemoveCommandQueue(this);
}

CommandStreamReceiver &CommandQueue::getGpgpuCommandStreamReceiver() const {
    return *gpgpuEngine->commandStreamReceiver;
}

CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const {
    const EngineControl *engine = this->bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)];
    if (engine == nullptr) {
        return nullptr;
    } else {
        return engine->commandStreamReceiver;
    }
}

CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() const {
    for (const EngineControl *engine : this->bcsEngines) {
        if (engine != nullptr) {
            return engine->commandStreamReceiver;
        }
    }
    return nullptr;
}

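// Picks the CSR for a built-in (transfer) operation: copy-only queues always
// use their BCS, transfers not eligible for blitting fall back to the GPGPU
// CSR, and eligible transfers choose a copy engine based on the transfer
// direction, per-platform preferences, and debug overrides.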
CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const {
    if (isCopyOnly) {
        return *getBcsCommandStreamReceiver(bcsEngineTypes[0]);
    }

    if (!blitEnqueueAllowed(args)) {
        return getGpgpuCommandStreamReceiver();
    }

    bool preferBcs = true;
    aub_stream::EngineType preferredBcsEngineType = aub_stream::EngineType::NUM_ENGINES;
    switch (args.direction) {
    case TransferDirection::LocalToLocal: {
        const auto &clHwHelper = ClHwHelper::get(device->getHardwareInfo().platform.eRenderCoreFamily);
        preferBcs = clHwHelper.preferBlitterForLocalToLocalTransfers();
        if (auto flag = DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.get(); flag != -1) {
            preferBcs = static_cast<bool>(flag);
        }
        if (preferBcs) {
            preferredBcsEngineType = aub_stream::EngineType::ENGINE_BCS;
        }
        break;
    }
    case TransferDirection::HostToHost:
    case TransferDirection::HostToLocal:
    case TransferDirection::LocalToHost: {
        preferBcs = true;
        preferredBcsEngineType = EngineHelpers::getBcsEngineType(device->getHardwareInfo(), device->getDeviceBitfield(),
                                                                 device->getSelectorCopyEngine(), false);
        break;
    }
    default:
        UNRECOVERABLE_IF(true);
    }

    CommandStreamReceiver *selectedCsr = nullptr;
    if (preferBcs) {
        selectedCsr = getBcsCommandStreamReceiver(preferredBcsEngineType);
        if (selectedCsr == nullptr && !bcsEngineTypes.empty()) {
            selectedCsr = getBcsCommandStreamReceiver(bcsEngineTypes[0]);
        }
    }
    if (selectedCsr == nullptr) {
        selectedCsr = &getGpgpuCommandStreamReceiver();
    }

    UNRECOVERABLE_IF(selectedCsr == nullptr);
    return *selectedCsr;
}

Device &CommandQueue::getDevice() const noexcept {
    return device->getDevice();
}

uint32_t CommandQueue::getHwTag() const {
    uint32_t tag = *getHwTagAddress();
    return tag;
}

volatile uint32_t *CommandQueue::getHwTagAddress() const {
    return getGpgpuCommandStreamReceiver().getTagAddress();
}

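// The queue is complete when the GPGPU engine's hardware tag has reached the
// given task count and, if a copy engine state is valid, its tag has reached
// the expected BCS task count as well.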
bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const {
    DEBUG_BREAK_IF(getHwTag() == CompletionStamp::notReady);

    if (getGpgpuCommandStreamReceiver().testTaskCountReady(getHwTagAddress(), gpgpuTaskCount)) {
        if (bcsState.isValid()) {
            return *getBcsCommandStreamReceiver(bcsState.engineType)->getTagAddress() >= peekBcsTaskCount(bcsState.engineType);
        }

        return true;
    }

    return false;
}

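// Blocks until the GPGPU engine and all copy engines listed in
// copyEnginesToWait reach the requested task counts (optionally via the
// quick KMD-sleep path), then waits again while cleaning the temporary
// allocation list if requested.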
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
    WAIT_ENTER()

    DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
    DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());

    if (!skipWait) {
        bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;

        getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
                                                                              flushStampToWait,
                                                                              useQuickKmdSleep,
                                                                              forcePowerSavingMode);
        DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);

        if (gtpinIsGTPinInitialized()) {
            gtpinNotifyTaskCompletion(gpgpuTaskCountToWait);
        }

        for (const CopyEngineState &copyEngine : copyEnginesToWait) {
            auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
            bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
            bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
        }
    }

    if (cleanTemporaryAllocationList) {
        getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
    } else {
        getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
    }

    WAIT_LEAVE()
}

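// Reports whether the queue is blocked on its virtual event (a dependency
// chain rooted in a user event). Once the chain resolves, the queue's
// taskCount, taskLevel, and flush stamp are refreshed from the virtual event
// and the driver's reference to it is released.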
bool CommandQueue::isQueueBlocked() {
    TakeOwnershipWrapper<CommandQueue> takeOwnershipWrapper(*this);
    // Check if we have a user event and, if so, whether it is in the blocked state.
    if (this->virtualEvent) {
        auto executionStatus = this->virtualEvent->peekExecutionStatus();
        if (executionStatus <= CL_SUBMITTED) {
            UNRECOVERABLE_IF(this->virtualEvent == nullptr);

            if (this->virtualEvent->isStatusCompletedByTermination(executionStatus) == false) {
                taskCount = this->virtualEvent->peekTaskCount();
                flushStamp->setStamp(this->virtualEvent->flushStamp->peekStamp());
                taskLevel = this->virtualEvent->taskLevel;
                // If this isn't an OOQ, update the taskLevel for the queue.
                if (!isOOQEnabled()) {
                    taskLevel++;
                }
            } else {
                // At this point we may reset the queue taskCount, since all commands previous to this one were aborted.
                taskCount = 0;
                flushStamp->setStamp(0);
                taskLevel = getGpgpuCommandStreamReceiver().peekTaskLevel();
            }

            FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "isQueueBlocked taskLevel change from", taskLevel, "to new from virtualEvent", this->virtualEvent, "new tasklevel", this->virtualEvent->taskLevel.load());

            // Close the access to the virtual event; the driver added only 1 ref count.
            this->virtualEvent->decRefInternal();
            this->virtualEvent = nullptr;
            return false;
        }
        return true;
    }
    return false;
}

cl_int CommandQueue::getCommandQueueInfo(cl_command_queue_info paramName,
                                         size_t paramValueSize,
                                         void *paramValue,
                                         size_t *paramValueSizeRet) {
    return getQueueInfo<CommandQueue>(this, paramName, paramValueSize, paramValue, paramValueSizeRet);
}

uint32_t CommandQueue::getTaskLevelFromWaitList(uint32_t taskLevel,
                                                cl_uint numEventsInWaitList,
                                                const cl_event *eventWaitList) {
    for (auto iEvent = 0u; iEvent < numEventsInWaitList; ++iEvent) {
        auto pEvent = (Event *)(eventWaitList[iEvent]);
        uint32_t eventTaskLevel = pEvent->taskLevel;
        taskLevel = std::max(taskLevel, eventTaskLevel);
    }
    return taskLevel;
}

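// Returns the queue's command stream, creating it lazily and letting the CSR
// grow the underlying allocation so that at least minRequiredSize bytes plus
// a fixed reserve (and overfetch padding) are available.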
LinearStream &CommandQueue::getCS(size_t minRequiredSize) {
    DEBUG_BREAK_IF(nullptr == device);

    if (!commandStream) {
        commandStream = new LinearStream(nullptr);
    }

    minRequiredSize += CSRequirements::minCommandQueueCommandStreamSize;
    constexpr static auto additionalAllocationSize = CSRequirements::minCommandQueueCommandStreamSize + CSRequirements::csOverfetchSize;
    getGpgpuCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, minRequiredSize, additionalAllocationSize);
    return *commandStream;
}

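// Acquire/release for shared memory objects (e.g. API interop): every object
// must carry a sharing handler, the enqueue itself is realized as a marker
// with the caller's wait list, and the output event is retyped to the
// originating command type.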
cl_int CommandQueue::enqueueAcquireSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
    if ((memObjects == nullptr && numObjects != 0) || (memObjects != nullptr && numObjects == 0)) {
        return CL_INVALID_VALUE;
    }

    for (unsigned int object = 0; object < numObjects; object++) {
        auto memObject = castToObject<MemObj>(memObjects[object]);
        if (memObject == nullptr || memObject->peekSharingHandler() == nullptr) {
            return CL_INVALID_MEM_OBJECT;
        }

        int result = memObject->peekSharingHandler()->acquire(memObject, getDevice().getRootDeviceIndex());
        if (result != CL_SUCCESS) {
            return result;
        }
        memObject->acquireCount++;
    }
    auto status = enqueueMarkerWithWaitList(
        numEventsInWaitList,
        eventWaitList,
        oclEvent);

    if (oclEvent) {
        castToObjectOrAbort<Event>(*oclEvent)->setCmdType(cmdType);
    }

    return status;
}

cl_int CommandQueue::enqueueReleaseSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
    if ((memObjects == nullptr && numObjects != 0) || (memObjects != nullptr && numObjects == 0)) {
        return CL_INVALID_VALUE;
    }

    for (unsigned int object = 0; object < numObjects; object++) {
        auto memObject = castToObject<MemObj>(memObjects[object]);
        if (memObject == nullptr || memObject->peekSharingHandler() == nullptr) {
            return CL_INVALID_MEM_OBJECT;
        }

        memObject->peekSharingHandler()->release(memObject, getDevice().getRootDeviceIndex());
        DEBUG_BREAK_IF(memObject->acquireCount <= 0);
        memObject->acquireCount--;
    }
    auto status = enqueueMarkerWithWaitList(
        numEventsInWaitList,
        eventWaitList,
        oclEvent);

    if (oclEvent) {
        castToObjectOrAbort<Event>(*oclEvent)->setCmdType(cmdType);
    }
    return status;
}

void CommandQueue::updateFromCompletionStamp(const CompletionStamp &completionStamp, Event *outEvent) {
    DEBUG_BREAK_IF(this->taskLevel > completionStamp.taskLevel);
    DEBUG_BREAK_IF(this->taskCount > completionStamp.taskCount);
    if (completionStamp.taskCount != CompletionStamp::notReady) {
        taskCount = completionStamp.taskCount;
    }
    flushStamp->setStamp(completionStamp.flushStamp);
    this->taskLevel = completionStamp.taskLevel;

    if (outEvent) {
        outEvent->updateCompletionStamp(completionStamp.taskCount, outEvent->peekBcsTaskCountFromCommandQueue(), completionStamp.taskLevel, completionStamp.flushStamp);
        FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", outEvent, "taskLevel", outEvent->taskLevel.load());
    }
}

bool CommandQueue::setPerfCountersEnabled() {
    DEBUG_BREAK_IF(device == nullptr);

    auto perfCounters = device->getPerformanceCounters();
    bool isCcsEngine = EngineHelpers::isCcs(getGpgpuEngine().osContext->getEngineType());

    perfCountersEnabled = perfCounters->enable(isCcsEngine);

    if (!perfCountersEnabled) {
        perfCounters->shutdown();
    }

    return perfCountersEnabled;
}

PerformanceCounters *CommandQueue::getPerfCounters() {
    return device->getPerformanceCounters();
}

cl_int CommandQueue::enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest) {
    cl_int retVal = CL_SUCCESS;

    MapInfo unmapInfo;
    if (!memObj->findMappedPtr(mappedPtr, unmapInfo)) {
        return CL_INVALID_VALUE;
    }

    if (!unmapInfo.readOnly) {
        memObj->getMapAllocation(getDevice().getRootDeviceIndex())->setAubWritable(true, GraphicsAllocation::defaultBank);
        memObj->getMapAllocation(getDevice().getRootDeviceIndex())->setTbxWritable(true, GraphicsAllocation::defaultBank);

        if (memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
            auto buffer = castToObject<Buffer>(memObj);

            retVal = enqueueWriteBuffer(buffer, CL_FALSE, unmapInfo.offset[0], unmapInfo.size[0], mappedPtr, memObj->getMapAllocation(getDevice().getRootDeviceIndex()),
                                        eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
        } else {
            auto image = castToObjectOrAbort<Image>(memObj);
            size_t writeOrigin[4] = {unmapInfo.offset[0], unmapInfo.offset[1], unmapInfo.offset[2], 0};
            auto mipIdx = getMipLevelOriginIdx(image->peekClMemObjType());
            UNRECOVERABLE_IF(mipIdx >= 4);
            writeOrigin[mipIdx] = unmapInfo.mipLevel;
            retVal = enqueueWriteImage(image, CL_FALSE, writeOrigin, &unmapInfo.size[0],
                                       image->getHostPtrRowPitch(), image->getHostPtrSlicePitch(), mappedPtr, memObj->getMapAllocation(getDevice().getRootDeviceIndex()),
                                       eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
        }
    } else {
        retVal = enqueueMarkerWithWaitList(eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
    }

    if (retVal == CL_SUCCESS) {
        memObj->removeMappedPtr(mappedPtr);
        if (eventsRequest.outEvent) {
            auto event = castToObject<Event>(*eventsRequest.outEvent);
            event->setCmdType(CL_COMMAND_UNMAP_MEM_OBJECT);
        }
    }
    return retVal;
}

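// Maps a memory object by reading its contents into the map allocation: the
// returned host pointer is the object's base map pointer plus the mapping
// offset; the mapped range is registered first and, unless the map is
// write-invalidate, a read of the buffer or image region is enqueued.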
void *CommandQueue::enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet) {
    void *basePtr = transferProperties.memObj->getBasePtrForMap(getDevice().getRootDeviceIndex());
    size_t mapPtrOffset = transferProperties.memObj->calculateOffsetForMapping(transferProperties.offset) + transferProperties.mipPtrOffset;
    if (transferProperties.memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
        mapPtrOffset += transferProperties.memObj->getOffset();
    }
    void *returnPtr = ptrOffset(basePtr, mapPtrOffset);

    if (!transferProperties.memObj->addMappedPtr(returnPtr, transferProperties.memObj->calculateMappedPtrLength(transferProperties.size),
                                                 transferProperties.mapFlags, transferProperties.size, transferProperties.offset, transferProperties.mipLevel,
                                                 transferProperties.memObj->getMapAllocation(getDevice().getRootDeviceIndex()))) {
        errcodeRet = CL_INVALID_OPERATION;
        return nullptr;
    }

    if (transferProperties.mapFlags == CL_MAP_WRITE_INVALIDATE_REGION) {
        errcodeRet = enqueueMarkerWithWaitList(eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
    } else {
        if (transferProperties.memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
            auto buffer = castToObject<Buffer>(transferProperties.memObj);
            errcodeRet = enqueueReadBuffer(buffer, transferProperties.blocking, transferProperties.offset[0], transferProperties.size[0],
                                           returnPtr, transferProperties.memObj->getMapAllocation(getDevice().getRootDeviceIndex()), eventsRequest.numEventsInWaitList,
                                           eventsRequest.eventWaitList, eventsRequest.outEvent);
        } else {
            auto image = castToObjectOrAbort<Image>(transferProperties.memObj);
            size_t readOrigin[4] = {transferProperties.offset[0], transferProperties.offset[1], transferProperties.offset[2], 0};
            auto mipIdx = getMipLevelOriginIdx(image->peekClMemObjType());
            UNRECOVERABLE_IF(mipIdx >= 4);
            readOrigin[mipIdx] = transferProperties.mipLevel;
            errcodeRet = enqueueReadImage(image, transferProperties.blocking, readOrigin, &transferProperties.size[0],
                                          image->getHostPtrRowPitch(), image->getHostPtrSlicePitch(),
                                          returnPtr, transferProperties.memObj->getMapAllocation(getDevice().getRootDeviceIndex()), eventsRequest.numEventsInWaitList,
                                          eventsRequest.eventWaitList, eventsRequest.outEvent);
        }
    }

    if (errcodeRet != CL_SUCCESS) {
        transferProperties.memObj->removeMappedPtr(returnPtr);
        return nullptr;
    }
    if (eventsRequest.outEvent) {
        auto event = castToObject<Event>(*eventsRequest.outEvent);
        event->setCmdType(transferProperties.cmdType);
    }
    return returnPtr;
}

void *CommandQueue::enqueueMapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet) {
    if (transferProperties.memObj->mappingOnCpuAllowed()) {
        return cpuDataTransferHandler(transferProperties, eventsRequest, errcodeRet);
    } else {
        return enqueueReadMemObjForMap(transferProperties, eventsRequest, errcodeRet);
    }
}

cl_int CommandQueue::enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest) {
    cl_int retVal = CL_SUCCESS;
    if (transferProperties.memObj->mappingOnCpuAllowed()) {
        cpuDataTransferHandler(transferProperties, eventsRequest, retVal);
    } else {
        retVal = enqueueWriteMemObjForUnmap(transferProperties.memObj, transferProperties.ptr, eventsRequest);
    }
    return retVal;
}

void *CommandQueue::enqueueMapBuffer(Buffer *buffer, cl_bool blockingMap,
                                     cl_map_flags mapFlags, size_t offset,
                                     size_t size, cl_uint numEventsInWaitList,
                                     const cl_event *eventWaitList, cl_event *event,
                                     cl_int &errcodeRet) {
    TransferProperties transferProperties(buffer, CL_COMMAND_MAP_BUFFER, mapFlags, blockingMap != CL_FALSE, &offset, &size, nullptr, false, getDevice().getRootDeviceIndex());
    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);

    return enqueueMapMemObject(transferProperties, eventsRequest, errcodeRet);
}

void *CommandQueue::enqueueMapImage(Image *image, cl_bool blockingMap,
                                    cl_map_flags mapFlags, const size_t *origin,
                                    const size_t *region, size_t *imageRowPitch,
                                    size_t *imageSlicePitch,
                                    cl_uint numEventsInWaitList,
                                    const cl_event *eventWaitList, cl_event *event,
                                    cl_int &errcodeRet) {
    TransferProperties transferProperties(image, CL_COMMAND_MAP_IMAGE, mapFlags, blockingMap != CL_FALSE,
                                          const_cast<size_t *>(origin), const_cast<size_t *>(region), nullptr, false, getDevice().getRootDeviceIndex());
    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);

    if (image->isMemObjZeroCopy() && image->mappingOnCpuAllowed()) {
        GetInfoHelper::set(imageSlicePitch, image->getImageDesc().image_slice_pitch);
        if (image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
            // There are differences in qPitch programming between Gen8 and Gen9+ devices.
            // For Gen8, qPitch is a distance in rows, while for Gen9+ it is in pixels.
            // The minimum value of qPitch is 4, which causes slicePitch = 4 * rowPitch on Gen8.
            // To allow zero-copy we have to report the correct rowPitch, which should be equal to slicePitch.
            GetInfoHelper::set(imageRowPitch, image->getImageDesc().image_slice_pitch);
        } else {
            GetInfoHelper::set(imageRowPitch, image->getImageDesc().image_row_pitch);
        }
    } else {
        GetInfoHelper::set(imageSlicePitch, image->getHostPtrSlicePitch());
        GetInfoHelper::set(imageRowPitch, image->getHostPtrRowPitch());
    }
    if (Image::hasSlices(image->peekClMemObjType()) == false) {
        GetInfoHelper::set(imageSlicePitch, static_cast<size_t>(0));
    }
    return enqueueMapMemObject(transferProperties, eventsRequest, errcodeRet);
}

cl_int CommandQueue::enqueueUnmapMemObject(MemObj *memObj, void *mappedPtr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) {
    TransferProperties transferProperties(memObj, CL_COMMAND_UNMAP_MEM_OBJECT, 0, false, nullptr, nullptr, mappedPtr, false, getDevice().getRootDeviceIndex());
    EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);

    return enqueueUnmapMemObject(transferProperties, eventsRequest);
}

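// On a blocked queue, a map/unmap is recorded as a CommandMapUnmap attached
// to an event instead of being submitted immediately. The event is chained
// to the wait list and to the current virtual event, and then becomes the
// queue's new virtual event.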
void CommandQueue::enqueueBlockedMapUnmapOperation(const cl_event *eventWaitList,
                                                   size_t numEventsInWaitlist,
                                                   MapOperationType opType,
                                                   MemObj *memObj,
                                                   MemObjSizeArray &copySize,
                                                   MemObjOffsetArray &copyOffset,
                                                   bool readOnly,
                                                   EventBuilder &externalEventBuilder) {
    EventBuilder internalEventBuilder;
    EventBuilder *eventBuilder;
    // Check if the event will be exposed externally.
    if (externalEventBuilder.getEvent()) {
        externalEventBuilder.getEvent()->incRefInternal();
        eventBuilder = &externalEventBuilder;
    } else {
        // It will be an internal event.
        internalEventBuilder.create<VirtualEvent>(this, context);
        eventBuilder = &internalEventBuilder;
    }

    // Store the task data in the event.
    auto cmd = std::unique_ptr<Command>(new CommandMapUnmap(opType, *memObj, copySize, copyOffset, readOnly, *this));
    eventBuilder->getEvent()->setCommand(std::move(cmd));

    // Bind the output event to the input events.
    eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitlist));
    eventBuilder->addParentEvent(this->virtualEvent);
    eventBuilder->finalize();

    if (this->virtualEvent) {
        this->virtualEvent->decRefInternal();
    }
    this->virtualEvent = eventBuilder->getEvent();
}

bool CommandQueue::setupDebugSurface(Kernel *kernel) {
    auto debugSurface = getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();

    DEBUG_BREAK_IF(!kernel->usesBindfulAddressingForBuffers());
    auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
                                  kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful);
    void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
    size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
    Buffer::setSurfaceState(&device->getDevice(), surfaceState, false, false, sizeToPatch,
                            addressToPatch, 0, debugSurface, 0, 0,
                            kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
                            kernel->areMultipleSubDevicesInContext());
    return true;
}

bool CommandQueue::validateCapability(cl_command_queue_capabilities_intel capability) const {
    return this->queueCapabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL || isValueSet(this->queueCapabilities, capability);
}

bool CommandQueue::validateCapabilitiesForEventWaitList(cl_uint numEventsInWaitList, const cl_event *waitList) const {
    for (cl_uint eventIndex = 0u; eventIndex < numEventsInWaitList; eventIndex++) {
        const Event *event = castToObject<Event>(waitList[eventIndex]);
        if (event->isUserEvent()) {
            continue;
        }

        const CommandQueue *eventCommandQueue = event->getCommandQueue();
        const bool crossQueue = this != eventCommandQueue;
        const cl_command_queue_capabilities_intel createCap = crossQueue ? CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL
                                                                         : CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL;
        const cl_command_queue_capabilities_intel waitCap = crossQueue ? CL_QUEUE_CAPABILITY_CROSS_QUEUE_EVENT_WAIT_LIST_INTEL
                                                                       : CL_QUEUE_CAPABILITY_SINGLE_QUEUE_EVENT_WAIT_LIST_INTEL;
        if (!validateCapability(waitCap) || !eventCommandQueue->validateCapability(createCap)) {
            return false;
        }
    }

    return true;
}

bool CommandQueue::validateCapabilityForOperation(cl_command_queue_capabilities_intel capability,
                                                  cl_uint numEventsInWaitList,
                                                  const cl_event *waitList,
                                                  const cl_event *outEvent) const {
    const bool operationValid = validateCapability(capability);
    const bool waitListValid = validateCapabilitiesForEventWaitList(numEventsInWaitList, waitList);
    const bool outEventValid = outEvent == nullptr ||
                               validateCapability(CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL) ||
                               validateCapability(CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL);
    return operationValid && waitListValid && outEventValid;
}

cl_uint CommandQueue::getQueueFamilyIndex() const {
    if (isQueueFamilySelected()) {
        return queueFamilyIndex;
    } else {
        const auto &hwInfo = device->getHardwareInfo();
        const auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        const auto engineGroupType = hwHelper.getEngineGroupType(gpgpuEngine->getEngineType(), gpgpuEngine->getEngineUsage(), hwInfo);
        const auto familyIndex = device->getDevice().getEngineGroupIndexFromEngineGroupType(engineGroupType);
        return static_cast<cl_uint>(familyIndex);
    }
}

void CommandQueue::updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint32_t newBcsTaskCount) {
    CopyEngineState &state = bcsStates[EngineHelpers::getBcsIndex(bcsEngineType)];
    state.engineType = bcsEngineType;
    state.taskCount = newBcsTaskCount;
}

uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const {
    const CopyEngineState &state = bcsStates[EngineHelpers::getBcsIndex(bcsEngineType)];
    DEBUG_BREAK_IF(!state.isValid());
    return state.taskCount;
}

bool CommandQueue::isTextureCacheFlushNeeded(uint32_t commandType) const {
    return commandType == CL_COMMAND_COPY_IMAGE && getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled();
}

IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize) {
    return getGpgpuCommandStreamReceiver().getIndirectHeap(heapType, minRequiredSize);
}

void CommandQueue::allocateHeapMemory(IndirectHeap::Type heapType, size_t minRequiredSize, IndirectHeap *&indirectHeap) {
    getGpgpuCommandStreamReceiver().allocateHeapMemory(heapType, minRequiredSize, indirectHeap);
}

void CommandQueue::releaseIndirectHeap(IndirectHeap::Type heapType) {
    getGpgpuCommandStreamReceiver().releaseIndirectHeap(heapType);
}

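// Rotates timestamp packet nodes for a new submission: current nodes move to
// previousNodes (and on to the deferred container when dependencies may be
// cleared), then fresh nodes are taken from the CSR's timestamp packet
// allocator. Clearing is suppressed if the previous nodes came from a
// different allocator.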
void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, CommandStreamReceiver &csr) {
    TagAllocatorBase *allocator = csr.getTimestampPacketAllocator();

    previousNodes.swapNodes(*timestampPacketContainer);

    if ((previousNodes.peekNodes().size() > 0) && (previousNodes.peekNodes()[0]->getAllocator() != allocator)) {
        clearAllDependencies = false;
    }

    if (clearAllDependencies) {
        previousNodes.moveNodesToNewContainer(*deferredTimestampPackets);
    }

    DEBUG_BREAK_IF(timestampPacketContainer->peekNodes().size() > 0);

    for (size_t i = 0; i < numberOfNodes; i++) {
        timestampPacketContainer->add(allocator->getTag());
    }
}

size_t CommandQueue::estimateTimestampPacketNodesCount(const MultiDispatchInfo &dispatchInfo) const {
    size_t nodesCount = dispatchInfo.size();
    auto mainKernel = dispatchInfo.peekMainKernel();
    if (obtainTimestampPacketForCacheFlush(mainKernel->requiresCacheFlushCommand(*this))) {
        nodesCount++;
    }
    return nodesCount;
}

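// Heuristic for servicing a read/write buffer call with a CPU copy instead
// of a GPU transfer: debug flags can force either answer; otherwise the call
// must be free of user-event dependencies, CPU-accessible, blocking,
// beneficial by size, and have an empty wait list.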
bool CommandQueue::bufferCpuCopyAllowed(Buffer *buffer, cl_command_type commandType, cl_bool blocking, size_t size, void *ptr,
                                        cl_uint numEventsInWaitList, const cl_event *eventWaitList) {

    auto debugVariableSet = false;
    // Requested by debug variable or allowed by Buffer
    if (CL_COMMAND_READ_BUFFER == commandType && DebugManager.flags.DoCpuCopyOnReadBuffer.get() != -1) {
        if (DebugManager.flags.DoCpuCopyOnReadBuffer.get() == 0) {
            return false;
        }
        debugVariableSet = true;
    }
    if (CL_COMMAND_WRITE_BUFFER == commandType && DebugManager.flags.DoCpuCopyOnWriteBuffer.get() != -1) {
        if (DebugManager.flags.DoCpuCopyOnWriteBuffer.get() == 0) {
            return false;
        }
        debugVariableSet = true;
    }

    // If we are blocked by user events, we can't service the call on the CPU.
    if (Event::checkUserEventDependencies(numEventsInWaitList, eventWaitList)) {
        return false;
    }

    // Check if the buffer is compatible.
    if (!buffer->isReadWriteOnCpuAllowed(device->getDevice())) {
        return false;
    }

    if (buffer->getMemoryManager() && buffer->getMemoryManager()->isCpuCopyRequired(ptr)) {
        return true;
    }

    if (debugVariableSet) {
        return true;
    }

    // Non-blocking transfers are not expected to be serviced by the CPU;
    // we do not want to artificially stall the pipeline to allow CPU access.
    if (blocking == CL_FALSE) {
        return false;
    }

    // Check if it is beneficial to do the transfer on the CPU.
    if (!buffer->isReadWriteOnCpuPreferred(ptr, size, getDevice())) {
        return false;
    }

    // Make sure that the event wait list is empty.
    if (numEventsInWaitList == 0) {
        return true;
    }

    return false;
}

bool CommandQueue::queueDependenciesClearRequired() const {
    return isOOQEnabled() || DebugManager.flags.OmitTimestampPacketDependencies.get();
}

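// Gates blitter dispatch for an enqueue: timestamp packet writes (or a
// copy-only queue) are a prerequisite, buffer-style transfers are always
// eligible, and image transfers are additionally validated against
// per-platform blitter constraints.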
bool CommandQueue::blitEnqueueAllowed(const CsrSelectionArgs &args) const {
    bool blitEnqueueAllowed = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() || this->isCopyOnly;
    if (DebugManager.flags.EnableBlitterForEnqueueOperations.get() != -1) {
        blitEnqueueAllowed = DebugManager.flags.EnableBlitterForEnqueueOperations.get();
    }
    if (!blitEnqueueAllowed) {
        return false;
    }

    switch (args.cmdType) {
    case CL_COMMAND_READ_BUFFER:
    case CL_COMMAND_WRITE_BUFFER:
    case CL_COMMAND_COPY_BUFFER:
    case CL_COMMAND_READ_BUFFER_RECT:
    case CL_COMMAND_WRITE_BUFFER_RECT:
    case CL_COMMAND_COPY_BUFFER_RECT:
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_SVM_MAP:
    case CL_COMMAND_SVM_UNMAP:
        return true;
    case CL_COMMAND_READ_IMAGE:
        return blitEnqueueImageAllowed(args.srcResource.imageOrigin, args.size, *args.srcResource.image);
    case CL_COMMAND_WRITE_IMAGE:
        return blitEnqueueImageAllowed(args.dstResource.imageOrigin, args.size, *args.dstResource.image);

    case CL_COMMAND_COPY_IMAGE:
        return blitEnqueueImageAllowed(args.srcResource.imageOrigin, args.size, *args.srcResource.image) &&
               blitEnqueueImageAllowed(args.dstResource.imageOrigin, args.size, *args.dstResource.image);

    default:
        return false;
    }
}

bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *region, const Image &image) const {
    const auto &hwInfo = device->getHardwareInfo();
    const auto &hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
    auto blitEnqueueImageAllowed = hwInfoConfig->isBlitterForImagesSupported();

    if (DebugManager.flags.EnableBlitterForEnqueueImageOperations.get() != -1) {
        blitEnqueueImageAllowed = DebugManager.flags.EnableBlitterForEnqueueImageOperations.get();
    }

    blitEnqueueImageAllowed &= (origin[0] + region[0] <= BlitterConstants::maxBlitWidth) && (origin[1] + region[1] <= BlitterConstants::maxBlitHeight);
    blitEnqueueImageAllowed &= !isMipMapped(image.getImageDesc());

    const auto &defaultGmm = image.getGraphicsAllocation(device->getRootDeviceIndex())->getDefaultGmm();
    if (defaultGmm != nullptr) {
        auto isTile64 = defaultGmm->gmmResourceInfo->getResourceFlags()->Info.Tile64;
        auto imageType = image.getImageDesc().image_type;
        if (isTile64 && (imageType == CL_MEM_OBJECT_IMAGE3D)) {
            blitEnqueueImageAllowed &= hwInfoConfig->isTile64With3DSurfaceOnBCSSupported(hwInfo);
        }
    }

    return blitEnqueueImageAllowed;
}

bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const {
    if (!blockedQueue) {
        return false;
    }

    if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) {
        return true;
    }

    if (CL_COMMAND_BARRIER == commandType || CL_COMMAND_MARKER == commandType) {
        auto timestampPacketWriteEnabled = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled();
        if (timestampPacketWriteEnabled || context->getRootDeviceIndices().size() > 1) {

            for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
                auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
                if (timestampPacketWriteEnabled && waitlistEvent->getTimestampPacketNodes()) {
                    return true;
                }
                if (waitlistEvent->getCommandQueue() && waitlistEvent->getCommandQueue()->getDevice().getRootDeviceIndex() != this->getDevice().getRootDeviceIndex()) {
                    return true;
                }
            }
        }
    }

    return false;
}

void CommandQueue::storeProperties(const cl_queue_properties *properties) {
    if (properties) {
        for (size_t i = 0; properties[i] != 0; i += 2) {
            propertiesVector.push_back(properties[i]);
            propertiesVector.push_back(properties[i + 1]);
        }
        propertiesVector.push_back(0);
    }
}

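// Parses CL_QUEUE_FAMILY_INTEL / CL_QUEUE_INDEX_INTEL property pairs. When a
// specific engine is requested (and the device has no root CSR), the queue
// overrides its default engine with the selected one and records the
// family's capabilities and indices.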
void CommandQueue::processProperties(const cl_queue_properties *properties) {
    if (properties != nullptr) {
        bool specificEngineSelected = false;
        cl_uint selectedQueueFamilyIndex = std::numeric_limits<uint32_t>::max();
        cl_uint selectedQueueIndex = std::numeric_limits<uint32_t>::max();

        for (auto currentProperties = properties; *currentProperties != 0; currentProperties += 2) {
            switch (*currentProperties) {
            case CL_QUEUE_FAMILY_INTEL:
                selectedQueueFamilyIndex = static_cast<cl_uint>(*(currentProperties + 1));
                specificEngineSelected = true;
                break;
            case CL_QUEUE_INDEX_INTEL:
                selectedQueueIndex = static_cast<cl_uint>(*(currentProperties + 1));
                specificEngineSelected = true;
                break;
            }
        }

        if (specificEngineSelected) {
            this->queueFamilySelected = true;
            if (!getDevice().hasRootCsr()) {
                const auto &engine = getDevice().getRegularEngineGroups()[selectedQueueFamilyIndex].engines[selectedQueueIndex];
                auto engineType = engine.getEngineType();
                auto engineUsage = engine.getEngineUsage();
                if ((DebugManager.flags.EngineUsageHint.get() != -1) &&
                    (getDevice().tryGetEngine(engineType, static_cast<EngineUsage>(DebugManager.flags.EngineUsageHint.get())) != nullptr)) {
                    engineUsage = static_cast<EngineUsage>(DebugManager.flags.EngineUsageHint.get());
                }
                this->overrideEngine(engineType, engineUsage);
                this->queueCapabilities = getClDevice().getDeviceInfo().queueFamilyProperties[selectedQueueFamilyIndex].capabilities;
                this->queueFamilyIndex = selectedQueueFamilyIndex;
                this->queueIndexWithinFamily = selectedQueueIndex;
            }
        }
    }
    requiresCacheFlushAfterWalker = device && (device->getDeviceInfo().parentDevice != nullptr);
}

void CommandQueue::overrideEngine(aub_stream::EngineType engineType, EngineUsage engineUsage) {
    const HardwareInfo &hwInfo = getDevice().getHardwareInfo();
    const HwHelper &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    const EngineGroupType engineGroupType = hwHelper.getEngineGroupType(engineType, engineUsage, hwInfo);
    const bool isEngineCopyOnly = hwHelper.isCopyOnlyEngineType(engineGroupType);

    if (isEngineCopyOnly) {
        std::fill(bcsEngines.begin(), bcsEngines.end(), nullptr);
        bcsEngines[EngineHelpers::getBcsIndex(engineType)] = &device->getEngine(engineType, EngineUsage::Regular);
        bcsEngineTypes = {engineType};
        timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
        deferredTimestampPackets = std::make_unique<TimestampPacketContainer>();
        isCopyOnly = true;
    } else {
        gpgpuEngine = &device->getEngine(engineType, engineUsage);
    }
}

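// Hook for AUB capture: in subcapture mode, each enqueue is made blocking
// while capture is inactive (to split batch buffers) and dependencies are
// cleared on activation; non-HW CSR types additionally get a per-kernel AUB
// comment.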
void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo) {
    if (DebugManager.flags.AUBDumpSubCaptureMode.get()) {
        auto status = getGpgpuCommandStreamReceiver().checkAndActivateAubSubCapture(multiDispatchInfo.empty() ? "" : multiDispatchInfo.peekMainKernel()->getDescriptor().kernelMetadata.kernelName);
        if (!status.isActive) {
            // Make each enqueue blocking when subcapture is not active, to split the batch buffer.
            blocking = true;
        } else if (!status.wasActiveInPreviousEnqueue) {
            // Omit timestamp packet dependencies upon subcapture activation.
            clearAllDependencies = true;
        }
    }

    if (getGpgpuCommandStreamReceiver().getType() > CommandStreamReceiverType::CSR_HW) {
        for (auto &dispatchInfo : multiDispatchInfo) {
            auto &kernelName = dispatchInfo.getKernel()->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName;
            getGpgpuCommandStreamReceiver().addAubComment(kernelName.c_str());
        }
    }
}

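// Waiting on timestamps requires both the queue-level switch and platform
// support; the EnableTimestampWait debug flag can override the result with
// several CSR-dependent modes.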
bool CommandQueue::isWaitForTimestampsEnabled() {
    auto &hwHelper = HwHelper::get(getDevice().getHardwareInfo().platform.eRenderCoreFamily);
    auto enabled = CommandQueue::isTimestampWaitEnabled();
    enabled &= hwHelper.isTimestampWaitSupported();

    switch (DebugManager.flags.EnableTimestampWait.get()) {
    case 0:
        enabled = false;
        break;
    case 1:
        enabled = getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled();
        break;
    case 2:
        enabled = getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled();
        break;
    case 3:
        enabled = getGpgpuCommandStreamReceiver().isAnyDirectSubmissionEnabled();
        break;
    case 4:
        enabled = true;
        break;
    }

    return enabled;
}

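// Drains the queue: spins until a blocked queue unblocks, tries a
// timestamp-based wait first, releases deferred timestamp packet nodes, and
// then waits for the GPGPU engine plus every copy engine with a valid state
// before printing any pending printf output.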
void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) {
    if (blockedQueue) {
        while (isQueueBlocked()) {
        }
    }

    auto waitedOnTimestamps = waitForTimestamps(taskCount);

    TimestampPacketContainer nodesToRelease;
    if (deferredTimestampPackets) {
        deferredTimestampPackets->swapNodes(nodesToRelease);
    }

    StackVec<CopyEngineState, bcsInfoMaskSize> activeBcsStates{};
    for (CopyEngineState &state : this->bcsStates) {
        if (state.isValid()) {
            activeBcsStates.push_back(state);
        }
    }
    waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);

    if (printfHandler) {
        printfHandler->printEnqueueOutput();
    }
}

} // namespace NEO