1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #include "opencl/source/command_queue/command_queue.h"
9 
10 #include "shared/source/command_stream/command_stream_receiver.h"
11 #include "shared/source/helpers/aligned_memory.h"
12 #include "shared/source/helpers/array_count.h"
13 #include "shared/source/helpers/engine_node_helper.h"
14 #include "shared/source/helpers/get_info.h"
15 #include "shared/source/helpers/ptr_math.h"
16 #include "shared/source/helpers/string.h"
17 #include "shared/source/helpers/timestamp_packet.h"
18 #include "shared/source/memory_manager/internal_allocation_storage.h"
19 #include "shared/source/os_interface/hw_info_config.h"
20 #include "shared/source/os_interface/os_context.h"
21 #include "shared/source/utilities/api_intercept.h"
22 #include "shared/source/utilities/tag_allocator.h"
23 
24 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
25 #include "opencl/source/cl_device/cl_device.h"
26 #include "opencl/source/context/context.h"
27 #include "opencl/source/device_queue/device_queue.h"
28 #include "opencl/source/event/event_builder.h"
29 #include "opencl/source/event/user_event.h"
30 #include "opencl/source/gtpin/gtpin_notify.h"
31 #include "opencl/source/helpers/cl_hw_helper.h"
32 #include "opencl/source/helpers/convert_color.h"
33 #include "opencl/source/helpers/hardware_commands_helper.h"
34 #include "opencl/source/helpers/mipmap.h"
35 #include "opencl/source/helpers/queue_helpers.h"
36 #include "opencl/source/mem_obj/buffer.h"
37 #include "opencl/source/mem_obj/image.h"
38 #include "opencl/source/program/printf_handler.h"
39 
40 #include "CL/cl_ext.h"
41 
42 #include <limits>
43 #include <map>
44 
45 namespace NEO {
46 
47 // Global table of create functions
48 CommandQueueCreateFunc commandQueueFactory[IGFX_MAX_CORE] = {};
49 
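// Construction is dispatched through commandQueueFactory, indexed by the device's render
// core family; each supported GFX core is expected to register its entry in this table.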
50 CommandQueue *CommandQueue::create(Context *context,
51                                    ClDevice *device,
52                                    const cl_queue_properties *properties,
53                                    bool internalUsage,
54                                    cl_int &retVal) {
55     retVal = CL_SUCCESS;
56 
57     auto funcCreate = commandQueueFactory[device->getRenderCoreFamily()];
58     DEBUG_BREAK_IF(nullptr == funcCreate);
59 
60     return funcCreate(context, device, properties, internalUsage);
61 }
62 
63 CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage)
64     : context(context), device(device) {
65     if (context) {
66         context->incRefInternal();
67     }
68 
69     commandQueueProperties = getCmdQueueProperties<cl_command_queue_properties>(properties);
70     flushStamp.reset(new FlushStampTracker(true));
71 
72     if (device) {
73         auto &hwInfo = device->getHardwareInfo();
74         auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
75         auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
76 
77         gpgpuEngine = &device->getDefaultEngine();
78 
79         UNRECOVERABLE_IF(gpgpuEngine->getEngineType() >= aub_stream::EngineType::NUM_ENGINES);
80 
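        // Timestamp packet containers are needed whenever this queue may synchronize work
        // across engines: when a copy (BCS) engine can be used, or when the GPGPU CSR
        // writes timestamp packets.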
81         bool bcsAllowed = hwInfoConfig->isBlitterFullySupported(hwInfo) &&
82                           hwHelper.isSubDeviceEngineSupported(hwInfo, device->getDeviceBitfield(), aub_stream::EngineType::ENGINE_BCS);
83 
84         if (bcsAllowed || gpgpuEngine->commandStreamReceiver->peekTimestampPacketWriteEnabled()) {
85             timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
86             deferredTimestampPackets = std::make_unique<TimestampPacketContainer>();
87         }
88         if (bcsAllowed) {
89             auto &neoDevice = device->getNearestGenericSubDevice(0)->getDevice();
90             auto &selectorCopyEngine = neoDevice.getSelectorCopyEngine();
91             auto bcsEngineType = EngineHelpers::getBcsEngineType(hwInfo, device->getDeviceBitfield(), selectorCopyEngine, internalUsage);
92             bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)] = neoDevice.tryGetEngine(bcsEngineType, EngineUsage::Regular);
93             bcsEngineTypes.push_back(bcsEngineType);
94         }
95     }
96 
97     storeProperties(properties);
98     processProperties(properties);
99 }
100 
101 CommandQueue::~CommandQueue() {
102     if (virtualEvent) {
103         UNRECOVERABLE_IF(this->virtualEvent->getCommandQueue() != this && this->virtualEvent->getCommandQueue() != nullptr);
104         virtualEvent->decRefInternal();
105     }
106 
107     if (device) {
108         auto storageForAllocation = gpgpuEngine->commandStreamReceiver->getInternalAllocationStorage();
109 
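        // Hand the queue's command buffer back to the CSR's internal allocation storage
        // so it can be reused, instead of freeing it outright.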
110         if (commandStream) {
111             storageForAllocation->storeAllocation(std::unique_ptr<GraphicsAllocation>(commandStream->getGraphicsAllocation()), REUSABLE_ALLOCATION);
112         }
113         delete commandStream;
114 
115         if (this->perfCountersEnabled) {
116             device->getPerformanceCounters()->shutdown();
117         }
118 
119         if (auto mainBcs = bcsEngines[0]; mainBcs != nullptr) {
120             auto &selectorCopyEngine = device->getNearestGenericSubDevice(0)->getSelectorCopyEngine();
121             EngineHelpers::releaseBcsEngineType(mainBcs->getEngineType(), selectorCopyEngine);
122         }
123     }
124 
125     timestampPacketContainer.reset();
126     // For a normal queue, decrement the ref count on the context.
127     // The special queue is owned by the context, so its ref count doesn't have to be decremented.
128     if (context && !isSpecialCommandQueue) {
129         context->decRefInternal();
130     }
131     gtpinRemoveCommandQueue(this);
132 }
133 
134 CommandStreamReceiver &CommandQueue::getGpgpuCommandStreamReceiver() const {
135     return *gpgpuEngine->commandStreamReceiver;
136 }
137 
138 CommandStreamReceiver *CommandQueue::getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType) const {
139     const EngineControl *engine = this->bcsEngines[EngineHelpers::getBcsIndex(bcsEngineType)];
140     if (engine == nullptr) {
141         return nullptr;
142     } else {
143         return engine->commandStreamReceiver;
144     }
145 }
146 
147 CommandStreamReceiver *CommandQueue::getBcsForAuxTranslation() const {
148     for (const EngineControl *engine : this->bcsEngines) {
149         if (engine != nullptr) {
150             return engine->commandStreamReceiver;
151         }
152     }
153     return nullptr;
154 }
155 
156 CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelectionArgs &args) const {
157     if (isCopyOnly) {
158         return *getBcsCommandStreamReceiver(bcsEngineTypes[0]);
159     }
160 
161     if (!blitEnqueueAllowed(args)) {
162         return getGpgpuCommandStreamReceiver();
163     }
164 
165     bool preferBcs = true;
166     aub_stream::EngineType preferredBcsEngineType = aub_stream::EngineType::NUM_ENGINES;
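    // Local-to-local copies use the blitter only when the platform prefers it (or a debug
    // flag forces the choice); transfers touching host memory always prefer a copy engine
    // picked by the device's copy-engine selector.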
167     switch (args.direction) {
168     case TransferDirection::LocalToLocal: {
169         const auto &clHwHelper = ClHwHelper::get(device->getHardwareInfo().platform.eRenderCoreFamily);
170         preferBcs = clHwHelper.preferBlitterForLocalToLocalTransfers();
171         if (auto flag = DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.get(); flag != -1) {
172             preferBcs = static_cast<bool>(flag);
173         }
174         if (preferBcs) {
175             preferredBcsEngineType = aub_stream::EngineType::ENGINE_BCS;
176         }
177         break;
178     }
179     case TransferDirection::HostToHost:
180     case TransferDirection::HostToLocal:
181     case TransferDirection::LocalToHost: {
182         preferBcs = true;
183         preferredBcsEngineType = EngineHelpers::getBcsEngineType(device->getHardwareInfo(), device->getDeviceBitfield(),
184                                                                  device->getSelectorCopyEngine(), false);
185         break;
186     }
187     default:
188         UNRECOVERABLE_IF(true);
189     }
190 
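    // Fall back from the preferred BCS engine to any BCS engine owned by this queue,
    // and finally to the GPGPU CSR when no copy engine is available.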
191     CommandStreamReceiver *selectedCsr = nullptr;
192     if (preferBcs) {
193         selectedCsr = getBcsCommandStreamReceiver(preferredBcsEngineType);
194         if (selectedCsr == nullptr && !bcsEngineTypes.empty()) {
195             selectedCsr = getBcsCommandStreamReceiver(bcsEngineTypes[0]);
196         }
197     }
198     if (selectedCsr == nullptr) {
199         selectedCsr = &getGpgpuCommandStreamReceiver();
200     }
201 
202     UNRECOVERABLE_IF(selectedCsr == nullptr);
203     return *selectedCsr;
204 }
205 
206 Device &CommandQueue::getDevice() const noexcept {
207     return device->getDevice();
208 }
209 
210 uint32_t CommandQueue::getHwTag() const {
211     uint32_t tag = *getHwTagAddress();
212     return tag;
213 }
214 
215 volatile uint32_t *CommandQueue::getHwTagAddress() const {
216     return getGpgpuCommandStreamReceiver().getTagAddress();
217 }
218 
219 bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState) const {
220     DEBUG_BREAK_IF(getHwTag() == CompletionStamp::notReady);
221 
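    // The enqueue is complete only when the GPGPU tag has reached gpgpuTaskCount and,
    // if a copy engine was involved, its tag has reached the corresponding BCS task count.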
222     if (getGpgpuCommandStreamReceiver().testTaskCountReady(getHwTagAddress(), gpgpuTaskCount)) {
223         if (bcsState.isValid()) {
224             return *getBcsCommandStreamReceiver(bcsState.engineType)->getTagAddress() >= peekBcsTaskCount(bcsState.engineType);
225         }
226 
227         return true;
228     }
229 
230     return false;
231 }
232 
233 void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool cleanTemporaryAllocationList, bool skipWait) {
234     WAIT_ENTER()
235 
236     DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
237     DBG_LOG(LogTaskCounts, __FUNCTION__, "Line: ", __LINE__, "Current taskCount:", getHwTag());
238 
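    // Wait for the GPGPU engine first (optionally via the quick KMD-sleep path), then for
    // every copy engine used by this enqueue.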
239     if (!skipWait) {
240         bool forcePowerSavingMode = this->throttle == QueueThrottle::LOW;
241 
242         getGpgpuCommandStreamReceiver().waitForTaskCountWithKmdNotifyFallback(gpgpuTaskCountToWait,
243                                                                               flushStampToWait,
244                                                                               useQuickKmdSleep,
245                                                                               forcePowerSavingMode);
246         DEBUG_BREAK_IF(getHwTag() < gpgpuTaskCountToWait);
247 
248         if (gtpinIsGTPinInitialized()) {
249             gtpinNotifyTaskCompletion(gpgpuTaskCountToWait);
250         }
251 
252         for (const CopyEngineState &copyEngine : copyEnginesToWait) {
253             auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
254             bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
255             bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
256         }
257     }
258 
259     if (cleanTemporaryAllocationList) {
260         getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
261     } else {
262         getGpgpuCommandStreamReceiver().waitForTaskCount(gpgpuTaskCountToWait);
263     }
264 
265     WAIT_LEAVE()
266 }
267 
268 bool CommandQueue::isQueueBlocked() {
269     TakeOwnershipWrapper<CommandQueue> takeOwnershipWrapper(*this);
270     // Check if we have a user event and, if so, whether it is in a blocked state.
271     if (this->virtualEvent) {
272         auto executionStatus = this->virtualEvent->peekExecutionStatus();
273         if (executionStatus <= CL_SUBMITTED) {
274             UNRECOVERABLE_IF(this->virtualEvent == nullptr);
275 
276             if (this->virtualEvent->isStatusCompletedByTermination(executionStatus) == false) {
277                 taskCount = this->virtualEvent->peekTaskCount();
278                 flushStamp->setStamp(this->virtualEvent->flushStamp->peekStamp());
279                 taskLevel = this->virtualEvent->taskLevel;
280                 // If this isn't an OOQ, update the taskLevel for the queue
281                 if (!isOOQEnabled()) {
282                     taskLevel++;
283                 }
284             } else {
285                 // At this point we may reset the queue taskCount, since all commands previous to this one were aborted.
286                 taskCount = 0;
287                 flushStamp->setStamp(0);
288                 taskLevel = getGpgpuCommandStreamReceiver().peekTaskLevel();
289             }
290 
291             FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "isQueueBlocked taskLevel change from", taskLevel, "to new from virtualEvent", this->virtualEvent, "new tasklevel", this->virtualEvent->taskLevel.load());
292 
293             // Close access to the virtual event; the driver added only one ref count.
294             this->virtualEvent->decRefInternal();
295             this->virtualEvent = nullptr;
296             return false;
297         }
298         return true;
299     }
300     return false;
301 }
302 
303 cl_int CommandQueue::getCommandQueueInfo(cl_command_queue_info paramName,
304                                          size_t paramValueSize,
305                                          void *paramValue,
306                                          size_t *paramValueSizeRet) {
307     return getQueueInfo<CommandQueue>(this, paramName, paramValueSize, paramValue, paramValueSizeRet);
308 }
309 
310 uint32_t CommandQueue::getTaskLevelFromWaitList(uint32_t taskLevel,
311                                                 cl_uint numEventsInWaitList,
312                                                 const cl_event *eventWaitList) {
313     for (auto iEvent = 0u; iEvent < numEventsInWaitList; ++iEvent) {
314         auto pEvent = (Event *)(eventWaitList[iEvent]);
315         uint32_t eventTaskLevel = pEvent->taskLevel;
316         taskLevel = std::max(taskLevel, eventTaskLevel);
317     }
318     return taskLevel;
319 }
320 
321 LinearStream &CommandQueue::getCS(size_t minRequiredSize) {
322     DEBUG_BREAK_IF(nullptr == device);
323 
324     if (!commandStream) {
325         commandStream = new LinearStream(nullptr);
326     }
327 
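    // Pad the request so the CSR always has room for the commands it appends at flush time;
    // the allocation itself gets additional padding (csOverfetchSize) to guard against
    // command streamer overfetch.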
328     minRequiredSize += CSRequirements::minCommandQueueCommandStreamSize;
329     constexpr static auto additionalAllocationSize = CSRequirements::minCommandQueueCommandStreamSize + CSRequirements::csOverfetchSize;
330     getGpgpuCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, minRequiredSize, additionalAllocationSize);
331     return *commandStream;
332 }
333 
334 cl_int CommandQueue::enqueueAcquireSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
335     if ((memObjects == nullptr && numObjects != 0) || (memObjects != nullptr && numObjects == 0)) {
336         return CL_INVALID_VALUE;
337     }
338 
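    // Acquire every shared object through its sharing handler, then submit a marker that
    // carries the wait list and, if requested, the output event retyped to cmdType.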
339     for (unsigned int object = 0; object < numObjects; object++) {
340         auto memObject = castToObject<MemObj>(memObjects[object]);
341         if (memObject == nullptr || memObject->peekSharingHandler() == nullptr) {
342             return CL_INVALID_MEM_OBJECT;
343         }
344 
345         int result = memObject->peekSharingHandler()->acquire(memObject, getDevice().getRootDeviceIndex());
346         if (result != CL_SUCCESS) {
347             return result;
348         }
349         memObject->acquireCount++;
350     }
351     auto status = enqueueMarkerWithWaitList(
352         numEventsInWaitList,
353         eventWaitList,
354         oclEvent);
355 
356     if (oclEvent) {
357         castToObjectOrAbort<Event>(*oclEvent)->setCmdType(cmdType);
358     }
359 
360     return status;
361 }
362 
363 cl_int CommandQueue::enqueueReleaseSharedObjects(cl_uint numObjects, const cl_mem *memObjects, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *oclEvent, cl_uint cmdType) {
364     if ((memObjects == nullptr && numObjects != 0) || (memObjects != nullptr && numObjects == 0)) {
365         return CL_INVALID_VALUE;
366     }
367 
368     for (unsigned int object = 0; object < numObjects; object++) {
369         auto memObject = castToObject<MemObj>(memObjects[object]);
370         if (memObject == nullptr || memObject->peekSharingHandler() == nullptr) {
371             return CL_INVALID_MEM_OBJECT;
372         }
373 
374         memObject->peekSharingHandler()->release(memObject, getDevice().getRootDeviceIndex());
375         DEBUG_BREAK_IF(memObject->acquireCount <= 0);
376         memObject->acquireCount--;
377     }
378     auto status = enqueueMarkerWithWaitList(
379         numEventsInWaitList,
380         eventWaitList,
381         oclEvent);
382 
383     if (oclEvent) {
384         castToObjectOrAbort<Event>(*oclEvent)->setCmdType(cmdType);
385     }
386     return status;
387 }
388 
389 void CommandQueue::updateFromCompletionStamp(const CompletionStamp &completionStamp, Event *outEvent) {
390     DEBUG_BREAK_IF(this->taskLevel > completionStamp.taskLevel);
391     DEBUG_BREAK_IF(this->taskCount > completionStamp.taskCount);
392     if (completionStamp.taskCount != CompletionStamp::notReady) {
393         taskCount = completionStamp.taskCount;
394     }
395     flushStamp->setStamp(completionStamp.flushStamp);
396     this->taskLevel = completionStamp.taskLevel;
397 
398     if (outEvent) {
399         outEvent->updateCompletionStamp(completionStamp.taskCount, outEvent->peekBcsTaskCountFromCommandQueue(), completionStamp.taskLevel, completionStamp.flushStamp);
400         FileLoggerInstance().log(DebugManager.flags.EventsDebugEnable.get(), "updateCompletionStamp Event", outEvent, "taskLevel", outEvent->taskLevel.load());
401     }
402 }
403 
404 bool CommandQueue::setPerfCountersEnabled() {
405     DEBUG_BREAK_IF(device == nullptr);
406 
407     auto perfCounters = device->getPerformanceCounters();
408     bool isCcsEngine = EngineHelpers::isCcs(getGpgpuEngine().osContext->getEngineType());
409 
410     perfCountersEnabled = perfCounters->enable(isCcsEngine);
411 
412     if (!perfCountersEnabled) {
413         perfCounters->shutdown();
414     }
415 
416     return perfCountersEnabled;
417 }
418 
419 PerformanceCounters *CommandQueue::getPerfCounters() {
420     return device->getPerformanceCounters();
421 }
422 
423 cl_int CommandQueue::enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest) {
424     cl_int retVal = CL_SUCCESS;
425 
426     MapInfo unmapInfo;
427     if (!memObj->findMappedPtr(mappedPtr, unmapInfo)) {
428         return CL_INVALID_VALUE;
429     }
430 
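    // Only writable mappings need their contents transferred back to the memory object;
    // a read-only unmap collapses into a marker with the same wait list.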
431     if (!unmapInfo.readOnly) {
432         memObj->getMapAllocation(getDevice().getRootDeviceIndex())->setAubWritable(true, GraphicsAllocation::defaultBank);
433         memObj->getMapAllocation(getDevice().getRootDeviceIndex())->setTbxWritable(true, GraphicsAllocation::defaultBank);
434 
435         if (memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
436             auto buffer = castToObject<Buffer>(memObj);
437 
438             retVal = enqueueWriteBuffer(buffer, CL_FALSE, unmapInfo.offset[0], unmapInfo.size[0], mappedPtr, memObj->getMapAllocation(getDevice().getRootDeviceIndex()),
439                                         eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
440         } else {
441             auto image = castToObjectOrAbort<Image>(memObj);
442             size_t writeOrigin[4] = {unmapInfo.offset[0], unmapInfo.offset[1], unmapInfo.offset[2], 0};
443             auto mipIdx = getMipLevelOriginIdx(image->peekClMemObjType());
444             UNRECOVERABLE_IF(mipIdx >= 4);
445             writeOrigin[mipIdx] = unmapInfo.mipLevel;
446             retVal = enqueueWriteImage(image, CL_FALSE, writeOrigin, &unmapInfo.size[0],
447                                        image->getHostPtrRowPitch(), image->getHostPtrSlicePitch(), mappedPtr, memObj->getMapAllocation(getDevice().getRootDeviceIndex()),
448                                        eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
449         }
450     } else {
451         retVal = enqueueMarkerWithWaitList(eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
452     }
453 
454     if (retVal == CL_SUCCESS) {
455         memObj->removeMappedPtr(mappedPtr);
456         if (eventsRequest.outEvent) {
457             auto event = castToObject<Event>(*eventsRequest.outEvent);
458             event->setCmdType(CL_COMMAND_UNMAP_MEM_OBJECT);
459         }
460     }
461     return retVal;
462 }
463 
464 void *CommandQueue::enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet) {
465     void *basePtr = transferProperties.memObj->getBasePtrForMap(getDevice().getRootDeviceIndex());
466     size_t mapPtrOffset = transferProperties.memObj->calculateOffsetForMapping(transferProperties.offset) + transferProperties.mipPtrOffset;
467     if (transferProperties.memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
468         mapPtrOffset += transferProperties.memObj->getOffset();
469     }
470     void *returnPtr = ptrOffset(basePtr, mapPtrOffset);
471 
472     if (!transferProperties.memObj->addMappedPtr(returnPtr, transferProperties.memObj->calculateMappedPtrLength(transferProperties.size),
473                                                  transferProperties.mapFlags, transferProperties.size, transferProperties.offset, transferProperties.mipLevel,
474                                                  transferProperties.memObj->getMapAllocation(getDevice().getRootDeviceIndex()))) {
475         errcodeRet = CL_INVALID_OPERATION;
476         return nullptr;
477     }
478 
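    // CL_MAP_WRITE_INVALIDATE_REGION does not need the current contents, so a marker is
    // enough; otherwise the mapped region is read back into the map allocation.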
479     if (transferProperties.mapFlags == CL_MAP_WRITE_INVALIDATE_REGION) {
480         errcodeRet = enqueueMarkerWithWaitList(eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, eventsRequest.outEvent);
481     } else {
482         if (transferProperties.memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
483             auto buffer = castToObject<Buffer>(transferProperties.memObj);
484             errcodeRet = enqueueReadBuffer(buffer, transferProperties.blocking, transferProperties.offset[0], transferProperties.size[0],
485                                            returnPtr, transferProperties.memObj->getMapAllocation(getDevice().getRootDeviceIndex()), eventsRequest.numEventsInWaitList,
486                                            eventsRequest.eventWaitList, eventsRequest.outEvent);
487         } else {
488             auto image = castToObjectOrAbort<Image>(transferProperties.memObj);
489             size_t readOrigin[4] = {transferProperties.offset[0], transferProperties.offset[1], transferProperties.offset[2], 0};
490             auto mipIdx = getMipLevelOriginIdx(image->peekClMemObjType());
491             UNRECOVERABLE_IF(mipIdx >= 4);
492             readOrigin[mipIdx] = transferProperties.mipLevel;
493             errcodeRet = enqueueReadImage(image, transferProperties.blocking, readOrigin, &transferProperties.size[0],
494                                           image->getHostPtrRowPitch(), image->getHostPtrSlicePitch(),
495                                           returnPtr, transferProperties.memObj->getMapAllocation(getDevice().getRootDeviceIndex()), eventsRequest.numEventsInWaitList,
496                                           eventsRequest.eventWaitList, eventsRequest.outEvent);
497         }
498     }
499 
500     if (errcodeRet != CL_SUCCESS) {
501         transferProperties.memObj->removeMappedPtr(returnPtr);
502         return nullptr;
503     }
504     if (eventsRequest.outEvent) {
505         auto event = castToObject<Event>(*eventsRequest.outEvent);
506         event->setCmdType(transferProperties.cmdType);
507     }
508     return returnPtr;
509 }
510 
511 void *CommandQueue::enqueueMapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet) {
512     if (transferProperties.memObj->mappingOnCpuAllowed()) {
513         return cpuDataTransferHandler(transferProperties, eventsRequest, errcodeRet);
514     } else {
515         return enqueueReadMemObjForMap(transferProperties, eventsRequest, errcodeRet);
516     }
517 }
518 
519 cl_int CommandQueue::enqueueUnmapMemObject(TransferProperties &transferProperties, EventsRequest &eventsRequest) {
520     cl_int retVal = CL_SUCCESS;
521     if (transferProperties.memObj->mappingOnCpuAllowed()) {
522         cpuDataTransferHandler(transferProperties, eventsRequest, retVal);
523     } else {
524         retVal = enqueueWriteMemObjForUnmap(transferProperties.memObj, transferProperties.ptr, eventsRequest);
525     }
526     return retVal;
527 }
528 
529 void *CommandQueue::enqueueMapBuffer(Buffer *buffer, cl_bool blockingMap,
530                                      cl_map_flags mapFlags, size_t offset,
531                                      size_t size, cl_uint numEventsInWaitList,
532                                      const cl_event *eventWaitList, cl_event *event,
533                                      cl_int &errcodeRet) {
534     TransferProperties transferProperties(buffer, CL_COMMAND_MAP_BUFFER, mapFlags, blockingMap != CL_FALSE, &offset, &size, nullptr, false, getDevice().getRootDeviceIndex());
535     EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
536 
537     return enqueueMapMemObject(transferProperties, eventsRequest, errcodeRet);
538 }
539 
540 void *CommandQueue::enqueueMapImage(Image *image, cl_bool blockingMap,
541                                     cl_map_flags mapFlags, const size_t *origin,
542                                     const size_t *region, size_t *imageRowPitch,
543                                     size_t *imageSlicePitch,
544                                     cl_uint numEventsInWaitList,
545                                     const cl_event *eventWaitList, cl_event *event,
546                                     cl_int &errcodeRet) {
547     TransferProperties transferProperties(image, CL_COMMAND_MAP_IMAGE, mapFlags, blockingMap != CL_FALSE,
548                                           const_cast<size_t *>(origin), const_cast<size_t *>(region), nullptr, false, getDevice().getRootDeviceIndex());
549     EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
550 
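    // For zero-copy images mappable on the CPU, report pitches straight from the image
    // descriptor; otherwise report the host-ptr pitches used by the read-back path.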
551     if (image->isMemObjZeroCopy() && image->mappingOnCpuAllowed()) {
552         GetInfoHelper::set(imageSlicePitch, image->getImageDesc().image_slice_pitch);
553         if (image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
554             // There are differences in qPitch programming between Gen8 and Gen9+ devices.
555             // On Gen8 qPitch is a distance in rows, while on Gen9+ it is in pixels.
556             // The minimum value of qPitch is 4, which causes slicePitch = 4*rowPitch on Gen8.
557             // To allow zero-copy we have to report the correct rowPitch value, which should be equal to slicePitch.
558             GetInfoHelper::set(imageRowPitch, image->getImageDesc().image_slice_pitch);
559         } else {
560             GetInfoHelper::set(imageRowPitch, image->getImageDesc().image_row_pitch);
561         }
562     } else {
563         GetInfoHelper::set(imageSlicePitch, image->getHostPtrSlicePitch());
564         GetInfoHelper::set(imageRowPitch, image->getHostPtrRowPitch());
565     }
566     if (Image::hasSlices(image->peekClMemObjType()) == false) {
567         GetInfoHelper::set(imageSlicePitch, static_cast<size_t>(0));
568     }
569     return enqueueMapMemObject(transferProperties, eventsRequest, errcodeRet);
570 }
571 
572 cl_int CommandQueue::enqueueUnmapMemObject(MemObj *memObj, void *mappedPtr, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) {
573     TransferProperties transferProperties(memObj, CL_COMMAND_UNMAP_MEM_OBJECT, 0, false, nullptr, nullptr, mappedPtr, false, getDevice().getRootDeviceIndex());
574     EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
575 
576     return enqueueUnmapMemObject(transferProperties, eventsRequest);
577 }
578 
579 void CommandQueue::enqueueBlockedMapUnmapOperation(const cl_event *eventWaitList,
580                                                    size_t numEventsInWaitlist,
581                                                    MapOperationType opType,
582                                                    MemObj *memObj,
583                                                    MemObjSizeArray &copySize,
584                                                    MemObjOffsetArray &copyOffset,
585                                                    bool readOnly,
586                                                    EventBuilder &externalEventBuilder) {
587     EventBuilder internalEventBuilder;
588     EventBuilder *eventBuilder;
589     // check if event will be exposed externally
590     if (externalEventBuilder.getEvent()) {
591         externalEventBuilder.getEvent()->incRefInternal();
592         eventBuilder = &externalEventBuilder;
593     } else {
594         // it will be an internal event
595         internalEventBuilder.create<VirtualEvent>(this, context);
596         eventBuilder = &internalEventBuilder;
597     }
598 
599     //store task data in event
600     auto cmd = std::unique_ptr<Command>(new CommandMapUnmap(opType, *memObj, copySize, copyOffset, readOnly, *this));
601     eventBuilder->getEvent()->setCommand(std::move(cmd));
602 
603     //bind output event with input events
604     eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventWaitList, numEventsInWaitlist));
605     eventBuilder->addParentEvent(this->virtualEvent);
606     eventBuilder->finalize();
607 
608     if (this->virtualEvent) {
609         this->virtualEvent->decRefInternal();
610     }
611     this->virtualEvent = eventBuilder->getEvent();
612 }
613 
614 bool CommandQueue::setupDebugSurface(Kernel *kernel) {
615     auto debugSurface = getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
616 
617     DEBUG_BREAK_IF(!kernel->usesBindfulAddressingForBuffers());
618     auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap()),
619                                   kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful);
620     void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
621     size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
622     Buffer::setSurfaceState(&device->getDevice(), surfaceState, false, false, sizeToPatch,
623                             addressToPatch, 0, debugSurface, 0, 0,
624                             kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
625                             kernel->areMultipleSubDevicesInContext());
626     return true;
627 }
628 
629 bool CommandQueue::validateCapability(cl_command_queue_capabilities_intel capability) const {
630     return this->queueCapabilities == CL_QUEUE_DEFAULT_CAPABILITIES_INTEL || isValueSet(this->queueCapabilities, capability);
631 }
632 
633 bool CommandQueue::validateCapabilitiesForEventWaitList(cl_uint numEventsInWaitList, const cl_event *waitList) const {
634     for (cl_uint eventIndex = 0u; eventIndex < numEventsInWaitList; eventIndex++) {
635         const Event *event = castToObject<Event>(waitList[eventIndex]);
636         if (event->isUserEvent()) {
637             continue;
638         }
639 
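        // Cross-queue events require this queue to be able to wait on cross-queue events
        // and the producing queue to be able to create them; same-queue events need the
        // single-queue equivalents.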
640         const CommandQueue *eventCommandQueue = event->getCommandQueue();
641         const bool crossQueue = this != eventCommandQueue;
642         const cl_command_queue_capabilities_intel createCap = crossQueue ? CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL
643                                                                          : CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL;
644         const cl_command_queue_capabilities_intel waitCap = crossQueue ? CL_QUEUE_CAPABILITY_CROSS_QUEUE_EVENT_WAIT_LIST_INTEL
645                                                                        : CL_QUEUE_CAPABILITY_SINGLE_QUEUE_EVENT_WAIT_LIST_INTEL;
646         if (!validateCapability(waitCap) || !eventCommandQueue->validateCapability(createCap)) {
647             return false;
648         }
649     }
650 
651     return true;
652 }
653 
654 bool CommandQueue::validateCapabilityForOperation(cl_command_queue_capabilities_intel capability,
655                                                   cl_uint numEventsInWaitList,
656                                                   const cl_event *waitList,
657                                                   const cl_event *outEvent) const {
658     const bool operationValid = validateCapability(capability);
659     const bool waitListValid = validateCapabilitiesForEventWaitList(numEventsInWaitList, waitList);
660     const bool outEventValid = outEvent == nullptr ||
661                                validateCapability(CL_QUEUE_CAPABILITY_CREATE_SINGLE_QUEUE_EVENTS_INTEL) ||
662                                validateCapability(CL_QUEUE_CAPABILITY_CREATE_CROSS_QUEUE_EVENTS_INTEL);
663     return operationValid && waitListValid && outEventValid;
664 }
665 
666 cl_uint CommandQueue::getQueueFamilyIndex() const {
667     if (isQueueFamilySelected()) {
668         return queueFamilyIndex;
669     } else {
670         const auto &hwInfo = device->getHardwareInfo();
671         const auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
672         const auto engineGroupType = hwHelper.getEngineGroupType(gpgpuEngine->getEngineType(), gpgpuEngine->getEngineUsage(), hwInfo);
673         const auto familyIndex = device->getDevice().getEngineGroupIndexFromEngineGroupType(engineGroupType);
674         return static_cast<cl_uint>(familyIndex);
675     }
676 }
677 
678 void CommandQueue::updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint32_t newBcsTaskCount) {
679     CopyEngineState &state = bcsStates[EngineHelpers::getBcsIndex(bcsEngineType)];
680     state.engineType = bcsEngineType;
681     state.taskCount = newBcsTaskCount;
682 }
683 
684 uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const {
685     const CopyEngineState &state = bcsStates[EngineHelpers::getBcsIndex(bcsEngineType)];
686     DEBUG_BREAK_IF(!state.isValid());
687     return state.taskCount;
688 }
689 
690 bool CommandQueue::isTextureCacheFlushNeeded(uint32_t commandType) const {
691     return commandType == CL_COMMAND_COPY_IMAGE && getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled();
692 }
693 
694 IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize) {
695     return getGpgpuCommandStreamReceiver().getIndirectHeap(heapType, minRequiredSize);
696 }
697 
698 void CommandQueue::allocateHeapMemory(IndirectHeap::Type heapType, size_t minRequiredSize, IndirectHeap *&indirectHeap) {
699     getGpgpuCommandStreamReceiver().allocateHeapMemory(heapType, minRequiredSize, indirectHeap);
700 }
701 
702 void CommandQueue::releaseIndirectHeap(IndirectHeap::Type heapType) {
703     getGpgpuCommandStreamReceiver().releaseIndirectHeap(heapType);
704 }
705 
706 void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, CommandStreamReceiver &csr) {
707     TagAllocatorBase *allocator = csr.getTimestampPacketAllocator();
708 
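    // Nodes from the previous enqueue either remain as dependencies of the new nodes or,
    // when all dependencies may be cleared, are deferred for release once the queue drains.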
709     previousNodes.swapNodes(*timestampPacketContainer);
710 
711     if ((previousNodes.peekNodes().size() > 0) && (previousNodes.peekNodes()[0]->getAllocator() != allocator)) {
712         clearAllDependencies = false;
713     }
714 
715     if (clearAllDependencies) {
716         previousNodes.moveNodesToNewContainer(*deferredTimestampPackets);
717     }
718 
719     DEBUG_BREAK_IF(timestampPacketContainer->peekNodes().size() > 0);
720 
721     for (size_t i = 0; i < numberOfNodes; i++) {
722         timestampPacketContainer->add(allocator->getTag());
723     }
724 }
725 
726 size_t CommandQueue::estimateTimestampPacketNodesCount(const MultiDispatchInfo &dispatchInfo) const {
727     size_t nodesCount = dispatchInfo.size();
728     auto mainKernel = dispatchInfo.peekMainKernel();
729     if (obtainTimestampPacketForCacheFlush(mainKernel->requiresCacheFlushCommand(*this))) {
730         nodesCount++;
731     }
732     return nodesCount;
733 }
734 
735 bool CommandQueue::bufferCpuCopyAllowed(Buffer *buffer, cl_command_type commandType, cl_bool blocking, size_t size, void *ptr,
736                                         cl_uint numEventsInWaitList, const cl_event *eventWaitList) {
737 
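    // Decide whether this transfer can be serviced with a plain CPU copy instead of a GPU
    // read/write; debug variables can override the default heuristics.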
738     auto debugVariableSet = false;
739     // Requested by debug variable or allowed by Buffer
740     if (CL_COMMAND_READ_BUFFER == commandType && DebugManager.flags.DoCpuCopyOnReadBuffer.get() != -1) {
741         if (DebugManager.flags.DoCpuCopyOnReadBuffer.get() == 0) {
742             return false;
743         }
744         debugVariableSet = true;
745     }
746     if (CL_COMMAND_WRITE_BUFFER == commandType && DebugManager.flags.DoCpuCopyOnWriteBuffer.get() != -1) {
747         if (DebugManager.flags.DoCpuCopyOnWriteBuffer.get() == 0) {
748             return false;
749         }
750         debugVariableSet = true;
751     }
752 
753     //if we are blocked by user events, we can't service the call on CPU
754     if (Event::checkUserEventDependencies(numEventsInWaitList, eventWaitList)) {
755         return false;
756     }
757 
758     //check if buffer is compatible
759     if (!buffer->isReadWriteOnCpuAllowed(device->getDevice())) {
760         return false;
761     }
762 
763     if (buffer->getMemoryManager() && buffer->getMemoryManager()->isCpuCopyRequired(ptr)) {
764         return true;
765     }
766 
767     if (debugVariableSet) {
768         return true;
769     }
770 
771     // Non-blocking transfers are not expected to be serviced by the CPU;
772     // we do not want to artificially stall the pipeline to allow CPU access.
773     if (blocking == CL_FALSE) {
774         return false;
775     }
776 
777     //check if it is beneficial to do transfer on CPU
778     if (!buffer->isReadWriteOnCpuPreferred(ptr, size, getDevice())) {
779         return false;
780     }
781 
782     //make sure that event wait list is empty
783     if (numEventsInWaitList == 0) {
784         return true;
785     }
786 
787     return false;
788 }
789 
790 bool CommandQueue::queueDependenciesClearRequired() const {
791     return isOOQEnabled() || DebugManager.flags.OmitTimestampPacketDependencies.get();
792 }
793 
794 bool CommandQueue::blitEnqueueAllowed(const CsrSelectionArgs &args) const {
795     bool blitEnqueueAllowed = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() || this->isCopyOnly;
796     if (DebugManager.flags.EnableBlitterForEnqueueOperations.get() != -1) {
797         blitEnqueueAllowed = DebugManager.flags.EnableBlitterForEnqueueOperations.get();
798     }
799     if (!blitEnqueueAllowed) {
800         return false;
801     }
802 
803     switch (args.cmdType) {
804     case CL_COMMAND_READ_BUFFER:
805     case CL_COMMAND_WRITE_BUFFER:
806     case CL_COMMAND_COPY_BUFFER:
807     case CL_COMMAND_READ_BUFFER_RECT:
808     case CL_COMMAND_WRITE_BUFFER_RECT:
809     case CL_COMMAND_COPY_BUFFER_RECT:
810     case CL_COMMAND_SVM_MEMCPY:
811     case CL_COMMAND_SVM_MAP:
812     case CL_COMMAND_SVM_UNMAP:
813         return true;
814     case CL_COMMAND_READ_IMAGE:
815         return blitEnqueueImageAllowed(args.srcResource.imageOrigin, args.size, *args.srcResource.image);
816     case CL_COMMAND_WRITE_IMAGE:
817         return blitEnqueueImageAllowed(args.dstResource.imageOrigin, args.size, *args.dstResource.image);
818 
819     case CL_COMMAND_COPY_IMAGE:
820         return blitEnqueueImageAllowed(args.srcResource.imageOrigin, args.size, *args.srcResource.image) &&
821                blitEnqueueImageAllowed(args.dstResource.imageOrigin, args.size, *args.dstResource.image);
822 
823     default:
824         return false;
825     }
826 }
827 
828 bool CommandQueue::blitEnqueueImageAllowed(const size_t *origin, const size_t *region, const Image &image) const {
829     const auto &hwInfo = device->getHardwareInfo();
830     const auto &hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily);
831     auto blitEnqueueImageAllowed = hwInfoConfig->isBlitterForImagesSupported();
832 
833     if (DebugManager.flags.EnableBlitterForEnqueueImageOperations.get() != -1) {
834         blitEnqueueImageAllowed = DebugManager.flags.EnableBlitterForEnqueueImageOperations.get();
835     }
836 
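    // The blitter cannot handle regions exceeding its maximum width/height or mip-mapped
    // images; Tile64 3D surfaces are allowed only where the platform supports them on BCS.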
837     blitEnqueueImageAllowed &= (origin[0] + region[0] <= BlitterConstants::maxBlitWidth) && (origin[1] + region[1] <= BlitterConstants::maxBlitHeight);
838     blitEnqueueImageAllowed &= !isMipMapped(image.getImageDesc());
839 
840     const auto &defaultGmm = image.getGraphicsAllocation(device->getRootDeviceIndex())->getDefaultGmm();
841     if (defaultGmm != nullptr) {
842         auto isTile64 = defaultGmm->gmmResourceInfo->getResourceFlags()->Info.Tile64;
843         auto imageType = image.getImageDesc().image_type;
844         if (isTile64 && (imageType == CL_MEM_OBJECT_IMAGE3D)) {
845             blitEnqueueImageAllowed &= hwInfoConfig->isTile64With3DSurfaceOnBCSSupported(hwInfo);
846         }
847     }
848 
849     return blitEnqueueImageAllowed;
850 }
851 
852 bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const EventsRequest &eventsRequest, bool blockedQueue, bool isMarkerWithProfiling) const {
853     if (!blockedQueue) {
854         return false;
855     }
856 
857     if (isCacheFlushCommand(commandType) || !isCommandWithoutKernel(commandType) || isMarkerWithProfiling) {
858         return true;
859     }
860 
861     if (CL_COMMAND_BARRIER == commandType || CL_COMMAND_MARKER == commandType) {
862         auto timestampPacketWriteEnabled = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled();
863         if (timestampPacketWriteEnabled || context->getRootDeviceIndices().size() > 1) {
864 
865             for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
866                 auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
867                 if (timestampPacketWriteEnabled && waitlistEvent->getTimestampPacketNodes()) {
868                     return true;
869                 }
870                 if (waitlistEvent->getCommandQueue() && waitlistEvent->getCommandQueue()->getDevice().getRootDeviceIndex() != this->getDevice().getRootDeviceIndex()) {
871                     return true;
872                 }
873             }
874         }
875     }
876 
877     return false;
878 }
879 
880 void CommandQueue::storeProperties(const cl_queue_properties *properties) {
881     if (properties) {
882         for (size_t i = 0; properties[i] != 0; i += 2) {
883             propertiesVector.push_back(properties[i]);
884             propertiesVector.push_back(properties[i + 1]);
885         }
886         propertiesVector.push_back(0);
887     }
888 }
889 
890 void CommandQueue::processProperties(const cl_queue_properties *properties) {
891     if (properties != nullptr) {
892         bool specificEngineSelected = false;
893         cl_uint selectedQueueFamilyIndex = std::numeric_limits<uint32_t>::max();
894         cl_uint selectedQueueIndex = std::numeric_limits<uint32_t>::max();
895 
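        // Scan the property list for an explicit engine selection
        // (CL_QUEUE_FAMILY_INTEL / CL_QUEUE_INDEX_INTEL).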
896         for (auto currentProperties = properties; *currentProperties != 0; currentProperties += 2) {
897             switch (*currentProperties) {
898             case CL_QUEUE_FAMILY_INTEL:
899                 selectedQueueFamilyIndex = static_cast<cl_uint>(*(currentProperties + 1));
900                 specificEngineSelected = true;
901                 break;
902             case CL_QUEUE_INDEX_INTEL:
903                 selectedQueueIndex = static_cast<cl_uint>(*(currentProperties + 1));
904                 specificEngineSelected = true;
905                 break;
906             }
907         }
908 
909         if (specificEngineSelected) {
910             this->queueFamilySelected = true;
911             if (!getDevice().hasRootCsr()) {
912                 const auto &engine = getDevice().getRegularEngineGroups()[selectedQueueFamilyIndex].engines[selectedQueueIndex];
913                 auto engineType = engine.getEngineType();
914                 auto engineUsage = engine.getEngineUsage();
915                 if ((DebugManager.flags.EngineUsageHint.get() != -1) &&
916                     (getDevice().tryGetEngine(engineType, static_cast<EngineUsage>(DebugManager.flags.EngineUsageHint.get())) != nullptr)) {
917                     engineUsage = static_cast<EngineUsage>(DebugManager.flags.EngineUsageHint.get());
918                 }
919                 this->overrideEngine(engineType, engineUsage);
920                 this->queueCapabilities = getClDevice().getDeviceInfo().queueFamilyProperties[selectedQueueFamilyIndex].capabilities;
921                 this->queueFamilyIndex = selectedQueueFamilyIndex;
922                 this->queueIndexWithinFamily = selectedQueueIndex;
923             }
924         }
925     }
926     requiresCacheFlushAfterWalker = device && (device->getDeviceInfo().parentDevice != nullptr);
927 }
928 
929 void CommandQueue::overrideEngine(aub_stream::EngineType engineType, EngineUsage engineUsage) {
930     const HardwareInfo &hwInfo = getDevice().getHardwareInfo();
931     const HwHelper &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
932     const EngineGroupType engineGroupType = hwHelper.getEngineGroupType(engineType, engineUsage, hwInfo);
933     const bool isEngineCopyOnly = hwHelper.isCopyOnlyEngineType(engineGroupType);
934 
935     if (isEngineCopyOnly) {
936         std::fill(bcsEngines.begin(), bcsEngines.end(), nullptr);
937         bcsEngines[EngineHelpers::getBcsIndex(engineType)] = &device->getEngine(engineType, EngineUsage::Regular);
938         bcsEngineTypes = {engineType};
939         timestampPacketContainer = std::make_unique<TimestampPacketContainer>();
940         deferredTimestampPackets = std::make_unique<TimestampPacketContainer>();
941         isCopyOnly = true;
942     } else {
943         gpgpuEngine = &device->getEngine(engineType, engineUsage);
944     }
945 }
946 
947 void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo) {
948     if (DebugManager.flags.AUBDumpSubCaptureMode.get()) {
949         auto status = getGpgpuCommandStreamReceiver().checkAndActivateAubSubCapture(multiDispatchInfo.empty() ? "" : multiDispatchInfo.peekMainKernel()->getDescriptor().kernelMetadata.kernelName);
950         if (!status.isActive) {
951             // make each enqueue blocking when subcapture is not active to split batch buffer
952             blocking = true;
953         } else if (!status.wasActiveInPreviousEnqueue) {
954             // omit timestamp packet dependencies upon subcapture activation
955             clearAllDependencies = true;
956         }
957     }
958 
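    // For non-hardware CSRs (AUB/TBX capture), annotate the dump with the name of each
    // dispatched kernel.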
959     if (getGpgpuCommandStreamReceiver().getType() > CommandStreamReceiverType::CSR_HW) {
960         for (auto &dispatchInfo : multiDispatchInfo) {
961             auto &kernelName = dispatchInfo.getKernel()->getKernelInfo().kernelDescriptor.kernelMetadata.kernelName;
962             getGpgpuCommandStreamReceiver().addAubComment(kernelName.c_str());
963         }
964     }
965 }
966 
967 bool CommandQueue::isWaitForTimestampsEnabled() {
968     auto &hwHelper = HwHelper::get(getDevice().getHardwareInfo().platform.eRenderCoreFamily);
969     auto enabled = CommandQueue::isTimestampWaitEnabled();
970     enabled &= hwHelper.isTimestampWaitSupported();
971 
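    // EnableTimestampWait override: 0 disables the wait, 1-3 gate it on tag-update-from-wait
    // or (any) direct submission support, and 4 forces it on.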
972     switch (DebugManager.flags.EnableTimestampWait.get()) {
973     case 0:
974         enabled = false;
975         break;
976     case 1:
977         enabled = getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled();
978         break;
979     case 2:
980         enabled = getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled();
981         break;
982     case 3:
983         enabled = getGpgpuCommandStreamReceiver().isAnyDirectSubmissionEnabled();
984         break;
985     case 4:
986         enabled = true;
987         break;
988     }
989 
990     return enabled;
991 }
992 
993 void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler, bool cleanTemporaryAllocationsList) {
994     if (blockedQueue) {
995         while (isQueueBlocked()) {
996         }
997     }
998 
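    // If waiting on timestamp packets already covered this taskCount, the task-count wait
    // inside waitUntilComplete is skipped (passed as skipWait).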
999     auto waitedOnTimestamps = waitForTimestamps(taskCount);
1000 
1001     TimestampPacketContainer nodesToRelease;
1002     if (deferredTimestampPackets) {
1003         deferredTimestampPackets->swapNodes(nodesToRelease);
1004     }
1005 
1006     StackVec<CopyEngineState, bcsInfoMaskSize> activeBcsStates{};
1007     for (CopyEngineState &state : this->bcsStates) {
1008         if (state.isValid()) {
1009             activeBcsStates.push_back(state);
1010         }
1011     }
1012     waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
1013 
1014     if (printfHandler) {
1015         printfHandler->printEnqueueOutput();
1016     }
1017 }
1018 
1019 } // namespace NEO
1020