1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #pragma once
9 #include "shared/source/built_ins/built_ins.h"
10 #include "shared/source/command_stream/command_stream_receiver.h"
11 #include "shared/source/helpers/array_count.h"
12 #include "shared/source/helpers/engine_node_helper.h"
13 #include "shared/source/helpers/local_work_size.h"
14 #include "shared/source/helpers/pipe_control_args.h"
15 #include "shared/source/memory_manager/internal_allocation_storage.h"
16 #include "shared/source/memory_manager/memory_manager.h"
17 #include "shared/source/memory_manager/surface.h"
18 #include "shared/source/os_interface/os_context.h"
19 #include "shared/source/program/sync_buffer_handler.h"
20 #include "shared/source/program/sync_buffer_handler.inl"
21 #include "shared/source/utilities/range.h"
22 #include "shared/source/utilities/tag_allocator.h"
23 
24 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
25 #include "opencl/source/builtin_kernels_simulation/scheduler_simulation.h"
26 #include "opencl/source/command_queue/command_queue_hw.h"
27 #include "opencl/source/command_queue/gpgpu_walker.h"
28 #include "opencl/source/command_queue/hardware_interface.h"
29 #include "opencl/source/event/event_builder.h"
30 #include "opencl/source/event/user_event.h"
31 #include "opencl/source/gtpin/gtpin_notify.h"
32 #include "opencl/source/helpers/cl_blit_properties.h"
33 #include "opencl/source/helpers/cl_hw_helper.h"
34 #include "opencl/source/helpers/cl_preemption_helper.h"
35 #include "opencl/source/helpers/dispatch_info_builder.h"
36 #include "opencl/source/helpers/enqueue_properties.h"
37 #include "opencl/source/helpers/hardware_commands_helper.h"
38 #include "opencl/source/helpers/task_information.h"
39 #include "opencl/source/mem_obj/buffer.h"
40 #include "opencl/source/mem_obj/image.h"
41 #include "opencl/source/memory_manager/migration_controller.h"
42 #include "opencl/source/program/block_kernel_manager.h"
43 #include "opencl/source/program/printf_handler.h"
44 #include "opencl/source/utilities/cl_logger.h"
45 
46 #include <algorithm>
47 #include <new>
48 
49 namespace NEO {
50 
51 template <typename GfxFamily>
52 template <uint32_t commandType, size_t surfaceCount>
enqueueHandler(Surface * (& surfaces)[surfaceCount],bool blocking,Kernel * kernel,cl_uint workDim,const size_t globalOffsets[3],const size_t workItems[3],const size_t * localWorkSizesIn,const size_t * enqueuedWorkSizes,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)53 void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
54                                                bool blocking,
55                                                Kernel *kernel,
56                                                cl_uint workDim,
57                                                const size_t globalOffsets[3],
58                                                const size_t workItems[3],
59                                                const size_t *localWorkSizesIn,
60                                                const size_t *enqueuedWorkSizes,
61                                                cl_uint numEventsInWaitList,
62                                                const cl_event *eventWaitList,
63                                                cl_event *event) {
64     BuiltInOwnershipWrapper builtInLock;
65     KernelObjsForAuxTranslation kernelObjsForAuxTranslation;
66     MultiDispatchInfo multiDispatchInfo(kernel);
67 
68     auto auxTranslationMode = AuxTranslationMode::None;
69 
70     if (DebugManager.flags.ForceDispatchScheduler.get()) {
71         forceDispatchScheduler(multiDispatchInfo);
72     } else {
73 
74         kernel->updateAuxTranslationRequired();
75         if (kernel->isAuxTranslationRequired()) {
76             kernel->fillWithKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
77             multiDispatchInfo.setKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
78 
79             if (!kernelObjsForAuxTranslation.empty()) {
80                 auxTranslationMode = HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo());
81             }
82         }
83 
84         if (AuxTranslationMode::Builtin == auxTranslationMode) {
85             auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getClDevice());
86             builtInLock.takeOwnership(builder, this->context);
87 
88             dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
89         }
90 
91         if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
92             DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> builder(getClDevice());
93             builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
94             builder.setKernel(kernel);
95             builder.bake(multiDispatchInfo);
96         } else {
97             auto builder = kernel->getKernelInfo().builtinDispatchBuilder;
98             builder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, enqueuedWorkSizes, globalOffsets);
99 
100             if (multiDispatchInfo.size() == 0) {
101                 return;
102             }
103         }
104 
105         if (AuxTranslationMode::Builtin == auxTranslationMode) {
106             UNRECOVERABLE_IF(kernel->isParentKernel);
107             dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::NonAuxToAux);
108         }
109     }
110 
111     if (AuxTranslationMode::Blit == auxTranslationMode) {
112         setupBlitAuxTranslation(multiDispatchInfo);
113     }
114 
115     enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
116 }
117 
118 template <typename GfxFamily>
forceDispatchScheduler(NEO::MultiDispatchInfo & multiDispatchInfo)119 void CommandQueueHw<GfxFamily>::forceDispatchScheduler(NEO::MultiDispatchInfo &multiDispatchInfo) {
120     SchedulerKernel &scheduler = getContext().getSchedulerKernel();
121 
122     auto devQueue = this->getContext().getDefaultDeviceQueue();
123     DeviceQueueHw<GfxFamily> *devQueueHw = castToObjectOrAbort<DeviceQueueHw<GfxFamily>>(devQueue);
124 
125     DispatchInfo dispatchInfo(devQueue->getClDevice(), &scheduler, 1, Vec3<size_t>(scheduler.getGws(), 1, 1), Vec3<size_t>(scheduler.getLws(), 1, 1), Vec3<size_t>(0, 0, 0));
126     Vec3<size_t> workGroupCount = generateWorkgroupsNumber(dispatchInfo.getGWS(), dispatchInfo.getEnqueuedWorkgroupSize());
127     dispatchInfo.setTotalNumberOfWorkgroups(workGroupCount);
128     dispatchInfo.setNumberOfWorkgroups(workGroupCount);
129 
130     scheduler.createReflectionSurface();
131     GraphicsAllocation *reflectionSurface = scheduler.getKernelReflectionSurface();
132 
133     devQueueHw->resetDeviceQueue();
134 
135     scheduler.setArgs(devQueueHw->getQueueBuffer(),
136                       devQueueHw->getStackBuffer(),
137                       devQueueHw->getEventPoolBuffer(),
138                       devQueueHw->getSlbBuffer(),
139                       devQueueHw->getDshBuffer(),
140                       reflectionSurface,
141                       devQueueHw->getQueueStorageBuffer(),
142                       this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation());
143 
144     multiDispatchInfo.push(dispatchInfo);
145 }
146 
147 template <typename GfxFamily>
148 template <uint32_t commandType>
enqueueHandler(Surface ** surfacesForResidency,size_t numSurfaceForResidency,bool blocking,const MultiDispatchInfo & multiDispatchInfo,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)149 void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
150                                                size_t numSurfaceForResidency,
151                                                bool blocking,
152                                                const MultiDispatchInfo &multiDispatchInfo,
153                                                cl_uint numEventsInWaitList,
154                                                const cl_event *eventWaitList,
155                                                cl_event *event) {
156     if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
157         enqueueHandler<CL_COMMAND_MARKER>(nullptr, 0, blocking, multiDispatchInfo,
158                                           numEventsInWaitList, eventWaitList, event);
159         if (event) {
160             castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
161         }
162         return;
163     }
164 
165     Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
166     auto devQueue = this->getContext().getDefaultDeviceQueue();
167     DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
168 
169     TagNodeBase *hwTimeStamps = nullptr;
170     CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
171     auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
172 
173     EventBuilder eventBuilder;
174     setupEvent(eventBuilder, event, commandType);
175 
176     bool isMarkerWithProfiling = (CL_COMMAND_MARKER == commandType) && (eventBuilder.getEvent() && eventBuilder.getEvent()->isProfilingEnabled());
177 
178     std::unique_ptr<KernelOperation> blockedCommandsData;
179     std::unique_ptr<PrintfHandler> printfHandler;
180     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
181 
182     auto blockQueue = false;
183     auto taskLevel = 0u;
184     obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);
185 
186     if (parentKernel && !blockQueue) {
187         while (!devQueueHw->isEMCriticalSectionFree())
188             ;
189     }
190 
191     enqueueHandlerHook(commandType, multiDispatchInfo);
192 
193     bool clearDependenciesForSubCapture = false;
194     aubCaptureHook(blocking, clearDependenciesForSubCapture, multiDispatchInfo);
195 
196     bool clearAllDependencies = (queueDependenciesClearRequired() || clearDependenciesForSubCapture);
197 
198     if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
199         blocking = true;
200     }
201 
202     TimestampPacketDependencies timestampPacketDependencies;
203     EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
204     CsrDependencies csrDeps;
205     BlitPropertiesContainer blitPropertiesContainer;
206 
207     if (this->context->getRootDeviceIndices().size() > 1) {
208         eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
209     }
210 
211     bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
212 
213     if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
214         if (!clearDependenciesForSubCapture) {
215             eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, computeCommandStreamReceiver, CsrDependencies::DependenciesType::OnCsr);
216         }
217 
218         auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();
219 
220         size_t nodesCount = 0u;
221         if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) {
222             nodesCount = 1;
223         } else if (!multiDispatchInfo.empty()) {
224             nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
225         }
226 
227         if (isCacheFlushForBcsRequired() && enqueueWithBlitAuxTranslation) {
228             // Cache flush for aux translation is always required (if supported)
229             timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
230         }
231 
232         if (nodesCount > 0) {
233             obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, computeCommandStreamReceiver);
234             csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
235         }
236     }
237 
238     auto &commandStream = *obtainCommandStream<commandType>(csrDeps, false, blockQueue, multiDispatchInfo, eventsRequest,
239                                                             blockedCommandsData, surfacesForResidency, numSurfaceForResidency, isMarkerWithProfiling);
240     auto commandStreamStart = commandStream.getUsed();
241 
242     if (this->context->getRootDeviceIndices().size() > 1) {
243         TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
244     }
245 
246     if (enqueueWithBlitAuxTranslation) {
247         processDispatchForBlitAuxTranslation(*getBcsForAuxTranslation(), multiDispatchInfo, blitPropertiesContainer,
248                                              timestampPacketDependencies, eventsRequest, blockQueue);
249     }
250 
251     if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
252         eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
253         eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
254         eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.auxToNonAuxNodes);
255     }
256 
257     bool flushDependenciesForNonKernelCommand = false;
258 
259     if (multiDispatchInfo.empty() == false) {
260         processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
261                                                hwTimeStamps, blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(),
262                                                timestampPacketDependencies);
263     } else if (isCacheFlushCommand(commandType)) {
264         processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
265     } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
266         if (CL_COMMAND_BARRIER == commandType) {
267             computeCommandStreamReceiver.requestStallingCommandsOnNextFlush();
268         }
269 
270         for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
271             auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
272             if (waitlistEvent->getTimestampPacketNodes()) {
273                 flushDependenciesForNonKernelCommand = true;
274                 if (eventBuilder.getEvent()) {
275                     eventBuilder.getEvent()->addTimestampPacketNodes(*waitlistEvent->getTimestampPacketNodes());
276                 }
277             }
278         }
279 
280         if (flushDependenciesForNonKernelCommand) {
281             TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps);
282         }
283 
284         if (isMarkerWithProfiling) {
285             if (numEventsInWaitList == 0) {
286                 computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
287             }
288             processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps);
289         }
290     } else if (isMarkerWithProfiling) {
291         processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps);
292     }
293 
294     CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
295     const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
296                                               flushDependenciesForNonKernelCommand, isMarkerWithProfiling, &blitPropertiesContainer);
297 
298     bool migratedMemory = false;
299 
300     if (!blockQueue && multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->requiresMemoryMigration()) {
301         for (auto &arg : multiDispatchInfo.peekMainKernel()->getMemObjectsToMigrate()) {
302             MigrationController::handleMigration(*this->context, computeCommandStreamReceiver, arg.second);
303             migratedMemory = true;
304         }
305     }
306     if (!blockQueue) {
307         if (parentKernel) {
308             processDeviceEnqueue(devQueueHw, multiDispatchInfo, hwTimeStamps, blocking);
309         }
310 
311         if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
312             csrDeps.makeResident(computeCommandStreamReceiver);
313 
314             completionStamp = enqueueNonBlocked<commandType>(
315                 surfacesForResidency,
316                 numSurfaceForResidency,
317                 commandStream,
318                 commandStreamStart,
319                 blocking,
320                 clearDependenciesForSubCapture,
321                 multiDispatchInfo,
322                 enqueueProperties,
323                 timestampPacketDependencies,
324                 eventsRequest,
325                 eventBuilder,
326                 taskLevel,
327                 printfHandler.get(),
328                 getBcsForAuxTranslation());
329 
330             if (parentKernel) {
331                 computeCommandStreamReceiver.setMediaVFEStateDirty(true);
332 
333                 if (devQueueHw->getSchedulerReturnInstance() > 0) {
334                     waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
335                     this->runSchedulerSimulation(*devQueueHw, *parentKernel);
336                 }
337             }
338         } else if (enqueueProperties.isFlushWithoutKernelRequired()) {
339             completionStamp = enqueueCommandWithoutKernel(
340                 surfacesForResidency,
341                 numSurfaceForResidency,
342                 &commandStream,
343                 commandStreamStart,
344                 blocking,
345                 enqueueProperties,
346                 timestampPacketDependencies,
347                 eventsRequest,
348                 eventBuilder,
349                 taskLevel,
350                 csrDeps,
351                 nullptr);
352         } else {
353             UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);
354 
355             auto maxTaskCountCurrentRootDevice = this->taskCount;
356 
357             for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
358                 auto event = castToObject<Event>(eventWaitList[eventId]);
359 
360                 if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) {
361                     maxTaskCountCurrentRootDevice = std::max(maxTaskCountCurrentRootDevice, event->peekTaskCount());
362                 }
363             }
364 
365             //inherit data from event_wait_list and previous packets
366             completionStamp.flushStamp = this->flushStamp->peekStamp();
367             completionStamp.taskCount = maxTaskCountCurrentRootDevice;
368             completionStamp.taskLevel = taskLevel;
369 
370             if (eventBuilder.getEvent() && isProfilingEnabled()) {
371                 eventBuilder.getEvent()->setSubmitTimeStamp();
372                 eventBuilder.getEvent()->setStartTimeStamp();
373             }
374         }
375         if (eventBuilder.getEvent()) {
376             eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
377         }
378 
379         this->latestSentEnqueueType = enqueueProperties.operation;
380     }
381     updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
382 
383     if (blockQueue) {
384         if (parentKernel) {
385             size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
386             blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
387         }
388 
389         enqueueBlocked(commandType,
390                        surfacesForResidency,
391                        numSurfaceForResidency,
392                        multiDispatchInfo,
393                        timestampPacketDependencies,
394                        blockedCommandsData,
395                        enqueueProperties,
396                        eventsRequest,
397                        eventBuilder,
398                        std::move(printfHandler),
399                        nullptr);
400     }
401 
402     if (deferredTimestampPackets.get()) {
403         timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
404     }
405 
406     queueOwnership.unlock();
407     commandStreamReceiverOwnership.unlock();
408 
409     if (blocking) {
410         auto &builtinOpParams = multiDispatchInfo.peekBuiltinOpParams();
411         if (builtinOpParams.userPtrForPostOperationCpuCopy) {
412             waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), false);
413             auto hostPtrAlloc = builtinOpParams.transferAllocation;
414             UNRECOVERABLE_IF(nullptr == hostPtrAlloc);
415             auto size = hostPtrAlloc->getUnderlyingBufferSize();
416             [[maybe_unused]] int cpuCopyStatus = memcpy_s(builtinOpParams.userPtrForPostOperationCpuCopy, size, hostPtrAlloc->getUnderlyingBuffer(), size);
417             DEBUG_BREAK_IF(cpuCopyStatus != 0);
418             waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
419         } else {
420             waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
421         }
422     }
423     if (migratedMemory) {
424         computeCommandStreamReceiver.flushBatchedSubmissions();
425     }
426 }
427 
428 template <typename GfxFamily>
429 template <uint32_t commandType>
processDispatchForKernels(const MultiDispatchInfo & multiDispatchInfo,std::unique_ptr<PrintfHandler> & printfHandler,Event * event,TagNodeBase * & hwTimeStamps,bool blockQueue,DeviceQueueHw<GfxFamily> * devQueueHw,CsrDependencies & csrDeps,KernelOperation * blockedCommandsData,TimestampPacketDependencies & timestampPacketDependencies)430 void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
431                                                           std::unique_ptr<PrintfHandler> &printfHandler,
432                                                           Event *event,
433                                                           TagNodeBase *&hwTimeStamps,
434                                                           bool blockQueue,
435                                                           DeviceQueueHw<GfxFamily> *devQueueHw,
436                                                           CsrDependencies &csrDeps,
437                                                           KernelOperation *blockedCommandsData,
438                                                           TimestampPacketDependencies &timestampPacketDependencies) {
439     TagNodeBase *hwPerfCounter = nullptr;
440     getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
441 
442     printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
443     if (printfHandler) {
444         printfHandler->prepareDispatch(multiDispatchInfo);
445     }
446 
447     if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
448         auto &gws = multiDispatchInfo.begin()->getGWS();
449         auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
450         size_t workGroupsCount = (gws.x * gws.y * gws.z) /
451                                  (lws.x * lws.y * lws.z);
452         device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
453     }
454 
455     if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
456         if (multiDispatchInfo.peekMainKernel()->isKernelDebugEnabled()) {
457             setupDebugSurface(multiDispatchInfo.peekMainKernel());
458         }
459     }
460 
461     if (event && this->isProfilingEnabled()) {
462         // Get allocation for timestamps
463         hwTimeStamps = event->getHwTimeStampNode();
464     }
465 
466     if (auto parentKernel = multiDispatchInfo.peekParentKernel()) {
467         parentKernel->createReflectionSurface();
468         parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
469         parentKernel->patchEventPool(context->getDefaultDeviceQueue());
470         parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
471         if (!blockQueue) {
472             devQueueHw->resetDeviceQueue();
473             devQueueHw->acquireEMCriticalSection();
474         }
475     }
476 
477     if (event && this->isPerfCountersEnabled()) {
478         hwPerfCounter = event->getHwPerfCounterNode();
479     }
480 
481     HardwareInterface<GfxFamily>::dispatchWalker(
482         *this,
483         multiDispatchInfo,
484         csrDeps,
485         blockedCommandsData,
486         hwTimeStamps,
487         hwPerfCounter,
488         &timestampPacketDependencies,
489         timestampPacketContainer.get(),
490         commandType);
491 
492     if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
493         for (auto &dispatchInfo : multiDispatchInfo) {
494             for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) {
495                 getGpgpuCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData);
496             }
497         }
498     }
499 
500     getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(), multiDispatchInfo.getRequiredPrivateScratchSize());
501 }
502 
503 template <typename GfxFamily>
processDispatchForBlitEnqueue(CommandStreamReceiver & blitCommandStreamReceiver,const MultiDispatchInfo & multiDispatchInfo,TimestampPacketDependencies & timestampPacketDependencies,const EventsRequest & eventsRequest,LinearStream * commandStream,uint32_t commandType,bool queueBlocked)504 BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandStreamReceiver &blitCommandStreamReceiver,
505                                                                         const MultiDispatchInfo &multiDispatchInfo,
506                                                                         TimestampPacketDependencies &timestampPacketDependencies,
507                                                                         const EventsRequest &eventsRequest, LinearStream *commandStream,
508                                                                         uint32_t commandType, bool queueBlocked) {
509     auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);
510 
511     auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver,
512                                                                 multiDispatchInfo.peekBuiltinOpParams());
513     if (!queueBlocked) {
514         eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, blitCommandStreamReceiver,
515                                                                      CsrDependencies::DependenciesType::All);
516 
517         blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes);
518         blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
519         blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
520     }
521 
522     auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
523     blitProperties.outputTimestampPacket = currentTimestampPacketNode;
524 
525     if (commandStream) {
526         if (timestampPacketDependencies.cacheFlushNodes.peekNodes().size() > 0) {
527             auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
528             const auto &hwInfo = device->getHardwareInfo();
529             PipeControlArgs args;
530             args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed(true, hwInfo);
531             MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
532                 *commandStream,
533                 GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
534                 cacheFlushTimestampPacketGpuAddress,
535                 0,
536                 hwInfo,
537                 args);
538         }
539 
540         TimestampPacketHelper::programSemaphore<GfxFamily>(*commandStream, *currentTimestampPacketNode);
541     }
542     return blitProperties;
543 }
544 
545 template <typename GfxFamily>
processDispatchForBlitAuxTranslation(CommandStreamReceiver & bcsCsr,const MultiDispatchInfo & multiDispatchInfo,BlitPropertiesContainer & blitPropertiesContainer,TimestampPacketDependencies & timestampPacketDependencies,const EventsRequest & eventsRequest,bool queueBlocked)546 void CommandQueueHw<GfxFamily>::processDispatchForBlitAuxTranslation(CommandStreamReceiver &bcsCsr,
547                                                                      const MultiDispatchInfo &multiDispatchInfo,
548                                                                      BlitPropertiesContainer &blitPropertiesContainer,
549                                                                      TimestampPacketDependencies &timestampPacketDependencies,
550                                                                      const EventsRequest &eventsRequest, bool queueBlocked) {
551     auto rootDeviceIndex = getDevice().getRootDeviceIndex();
552     auto nodesAllocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
553     auto numKernelObjs = multiDispatchInfo.getKernelObjsForAuxTranslation()->size();
554     blitPropertiesContainer.resize(numKernelObjs * 2);
555 
556     auto bufferIndex = 0;
557     for (auto &kernelObj : *multiDispatchInfo.getKernelObjsForAuxTranslation()) {
558         GraphicsAllocation *allocation = nullptr;
559         if (kernelObj.type == KernelObjForAuxTranslation::Type::MEM_OBJ) {
560             auto buffer = static_cast<Buffer *>(kernelObj.object);
561             allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
562         } else {
563             DEBUG_BREAK_IF(kernelObj.type != KernelObjForAuxTranslation::Type::GFX_ALLOC);
564             allocation = static_cast<GraphicsAllocation *>(kernelObj.object);
565         }
566         {
567             // Aux to NonAux
568             blitPropertiesContainer[bufferIndex] = BlitProperties::constructPropertiesForAuxTranslation(
569                 AuxTranslationDirection::AuxToNonAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
570             auto auxToNonAuxNode = nodesAllocator->getTag();
571             timestampPacketDependencies.auxToNonAuxNodes.add(auxToNonAuxNode);
572         }
573 
574         {
575             // NonAux to Aux
576             blitPropertiesContainer[bufferIndex + numKernelObjs] = BlitProperties::constructPropertiesForAuxTranslation(
577                 AuxTranslationDirection::NonAuxToAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
578             auto nonAuxToAuxNode = nodesAllocator->getTag();
579             timestampPacketDependencies.nonAuxToAuxNodes.add(nonAuxToAuxNode);
580         }
581         bufferIndex++;
582     }
583 
584     if (!queueBlocked) {
585         CsrDependencies csrDeps;
586         eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
587         BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies,
588                                                            *this->timestampPacketContainer, csrDeps,
589                                                            getGpgpuCommandStreamReceiver(), bcsCsr);
590     }
591 
592     eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
593 }
594 
595 template <typename GfxFamily>
processDispatchForCacheFlush(Surface ** surfaces,size_t numSurfaces,LinearStream * commandStream,CsrDependencies & csrDeps)596 void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
597                                                              size_t numSurfaces,
598                                                              LinearStream *commandStream,
599                                                              CsrDependencies &csrDeps) {
600 
601     TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDeps);
602 
603     uint64_t postSyncAddress = 0;
604     if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
605         auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0);
606         timestampPacketNodeForPostSync->setProfilingCapable(false);
607         postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
608     }
609 
610     submitCacheFlush(surfaces, numSurfaces, commandStream, postSyncAddress);
611 }
612 
613 template <typename GfxFamily>
processDispatchForMarker(CommandQueue & commandQueue,LinearStream * commandStream,EventsRequest & eventsRequest,CsrDependencies & csrDeps)614 void CommandQueueHw<GfxFamily>::processDispatchForMarker(CommandQueue &commandQueue,
615                                                          LinearStream *commandStream,
616                                                          EventsRequest &eventsRequest,
617                                                          CsrDependencies &csrDeps) {
618     auto event = castToObjectOrAbort<Event>(*eventsRequest.outEvent);
619 
620     TagNodeBase *hwTimeStamps = nullptr;
621     TagNodeBase *hwPerfCounter = nullptr;
622 
623     hwTimeStamps = event->getHwTimeStampNode();
624 
625     HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
626     HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
627     getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation());
628 }
629 
630 template <typename GfxFamily>
processDispatchForMarkerWithTimestampPacket(CommandQueue & commandQueue,LinearStream * commandStream,EventsRequest & eventsRequest,CsrDependencies & csrDeps)631 void CommandQueueHw<GfxFamily>::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue,
632                                                                             LinearStream *commandStream,
633                                                                             EventsRequest &eventsRequest,
634                                                                             CsrDependencies &csrDeps) {
635     auto currentTimestampPacketNode = commandQueue.getTimestampPacketContainer()->peekNodes().at(0);
636 
637     auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(*currentTimestampPacketNode);
638     auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(*currentTimestampPacketNode);
639 
640     EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress);
641     EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress);
642 
643     auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*currentTimestampPacketNode);
644     auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(*currentTimestampPacketNode);
645 
646     EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress);
647     EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress);
648 }
649 
650 template <typename GfxFamily>
processDeviceEnqueue(DeviceQueueHw<GfxFamily> * devQueueHw,const MultiDispatchInfo & multiDispatchInfo,TagNodeBase * hwTimeStamps,bool & blocking)651 void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *devQueueHw,
652                                                      const MultiDispatchInfo &multiDispatchInfo,
653                                                      TagNodeBase *hwTimeStamps,
654                                                      bool &blocking) {
655     auto parentKernel = multiDispatchInfo.peekParentKernel();
656     size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
657     bool isCcsUsed = EngineHelpers::isCcs(gpgpuEngine->osContext->getEngineType());
658 
659     uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
660     devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
661                                             *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
662                                             parentKernel,
663                                             (uint32_t)multiDispatchInfo.size(),
664                                             getGpgpuCommandStreamReceiver().getTagAllocation()->getGpuAddress(),
665                                             taskCount,
666                                             hwTimeStamps,
667                                             isCcsUsed);
668 
669     SchedulerKernel &scheduler = getContext().getSchedulerKernel();
670 
671     scheduler.setArgs(devQueueHw->getQueueBuffer(),
672                       devQueueHw->getStackBuffer(),
673                       devQueueHw->getEventPoolBuffer(),
674                       devQueueHw->getSlbBuffer(),
675                       devQueueHw->getDshBuffer(),
676                       parentKernel->getKernelReflectionSurface(),
677                       devQueueHw->getQueueStorageBuffer(),
678                       this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(),
679                       devQueueHw->getDebugQueue());
680 
681     auto preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
682     GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
683         *this->commandStream,
684         *devQueueHw,
685         preemptionMode,
686         scheduler,
687         &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
688         devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
689         isCcsUsed);
690 
691     scheduler.makeResident(getGpgpuCommandStreamReceiver());
692 
693     parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getGpgpuCommandStreamReceiver());
694 
695     if (parentKernel->isAuxTranslationRequired()) {
696         blocking = true;
697     }
698 }
699 
700 template <typename GfxFamily>
obtainTaskLevelAndBlockedStatus(unsigned int & taskLevel,cl_uint & numEventsInWaitList,const cl_event * & eventWaitList,bool & blockQueueStatus,unsigned int commandType)701 void CommandQueueHw<GfxFamily>::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) {
702     auto isQueueBlockedStatus = isQueueBlocked();
703     taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
704     blockQueueStatus = (taskLevel == CompletionStamp::notReady) || isQueueBlockedStatus;
705 
706     auto taskLevelUpdateRequired = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);
707     if (taskLevelUpdateRequired) {
708         taskLevel++;
709         this->taskLevel = taskLevel;
710     }
711 
712     DBG_LOG(EventsDebugEnable, "blockQueue", blockQueueStatus, "virtualEvent", virtualEvent, "taskLevel", taskLevel);
713 }
714 
715 template <typename GfxFamily>
isTaskLevelUpdateRequired(const uint32_t & taskLevel,const cl_event * eventWaitList,const cl_uint & numEventsInWaitList,unsigned int commandType)716 bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
717     bool updateTaskLevel = true;
718     //if we are blocked by user event then no update
719     if (taskLevel == CompletionStamp::notReady) {
720         updateTaskLevel = false;
721     }
722     //if we are executing command without kernel then it will inherit state from
723     //previous commands, barrier is exception
724     if (isCommandWithoutKernel(commandType) && commandType != CL_COMMAND_BARRIER) {
725         updateTaskLevel = false;
726     }
727     //ooq special cases starts here
728     if (this->isOOQEnabled()) {
729         //if no wait list and barrier , do not update task level
730         if (eventWaitList == nullptr && commandType != CL_COMMAND_BARRIER) {
731             updateTaskLevel = false;
732         }
733         //if we have waitlist then deduce task level from waitlist and check if it is higher then current task level of queue
734         if (eventWaitList != nullptr) {
735             auto taskLevelFromEvents = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
736             taskLevelFromEvents++;
737             if (taskLevelFromEvents <= this->taskLevel) {
738                 updateTaskLevel = false;
739             }
740         }
741     }
742     return updateTaskLevel;
743 }
744 
745 template <typename GfxFamily>
746 template <uint32_t commandType>
enqueueNonBlocked(Surface ** surfaces,size_t surfaceCount,LinearStream & commandStream,size_t commandStreamStart,bool & blocking,bool clearDependenciesForSubCapture,const MultiDispatchInfo & multiDispatchInfo,const EnqueueProperties & enqueueProperties,TimestampPacketDependencies & timestampPacketDependencies,EventsRequest & eventsRequest,EventBuilder & eventBuilder,uint32_t taskLevel,PrintfHandler * printfHandler,CommandStreamReceiver * bcsCsr)747 CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
748     Surface **surfaces,
749     size_t surfaceCount,
750     LinearStream &commandStream,
751     size_t commandStreamStart,
752     bool &blocking,
753     bool clearDependenciesForSubCapture,
754     const MultiDispatchInfo &multiDispatchInfo,
755     const EnqueueProperties &enqueueProperties,
756     TimestampPacketDependencies &timestampPacketDependencies,
757     EventsRequest &eventsRequest,
758     EventBuilder &eventBuilder,
759     uint32_t taskLevel,
760     PrintfHandler *printfHandler,
761     CommandStreamReceiver *bcsCsr) {
762 
763     UNRECOVERABLE_IF(multiDispatchInfo.empty());
764 
765     auto implicitFlush = false;
766 
767     if (printfHandler) {
768         blocking = true;
769         printfHandler->makeResident(getGpgpuCommandStreamReceiver());
770     }
771 
772     if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
773         device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
774     }
775 
776     if (timestampPacketContainer) {
777         timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
778         timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
779         timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
780     }
781 
782     bool anyUncacheableArgs = false;
783     auto requiresCoherency = false;
784     for (auto surface : CreateRange(surfaces, surfaceCount)) {
785         surface->makeResident(getGpgpuCommandStreamReceiver());
786         requiresCoherency |= surface->IsCoherent;
787         if (!surface->allowsL3Caching()) {
788             anyUncacheableArgs = true;
789         }
790     }
791 
792     auto mediaSamplerRequired = false;
793     uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
794     auto specialPipelineSelectMode = false;
795     Kernel *kernel = nullptr;
796     bool auxTranslationRequired = false;
797     bool useGlobalAtomics = false;
798 
799     for (auto &dispatchInfo : multiDispatchInfo) {
800         if (kernel != dispatchInfo.getKernel()) {
801             kernel = dispatchInfo.getKernel();
802         } else {
803             continue;
804         }
805         kernel->makeResident(getGpgpuCommandStreamReceiver());
806         requiresCoherency |= kernel->requiresCoherency();
807         mediaSamplerRequired |= kernel->isVmeKernel();
808         auto numGrfRequiredByKernel = static_cast<uint32_t>(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.numGrfRequired);
809         numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
810         specialPipelineSelectMode |= kernel->requiresSpecialPipelineSelectMode();
811         auxTranslationRequired |= kernel->isAuxTranslationRequired();
812         if (kernel->hasUncacheableStatelessArgs()) {
813             anyUncacheableArgs = true;
814         }
815 
816         if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics) {
817             useGlobalAtomics = true;
818         }
819     }
820 
821     if (mediaSamplerRequired) {
822         DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
823     }
824 
825     if (isProfilingEnabled() && eventBuilder.getEvent()) {
826         eventBuilder.getEvent()->setSubmitTimeStamp();
827 
828         auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
829         if (hwTimestampNode) {
830             getGpgpuCommandStreamReceiver().makeResident(*hwTimestampNode->getBaseGraphicsAllocation());
831         }
832 
833         if (isPerfCountersEnabled()) {
834             getGpgpuCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterNode()->getBaseGraphicsAllocation());
835         }
836     }
837 
838     IndirectHeap *dsh = nullptr;
839     IndirectHeap *ioh = nullptr;
840 
841     if (multiDispatchInfo.peekParentKernel()) {
842         DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());
843         DEBUG_BREAK_IF(pDevQueue == nullptr);
844         dsh = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
845         // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks.
846         ioh = dsh;
847         implicitFlush = true;
848     } else {
849         dsh = &getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u);
850         ioh = &getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u);
851     }
852 
853     auto allocNeedsFlushDC = false;
854     if (!device->isFullRangeSvm()) {
855         if (std::any_of(getGpgpuCommandStreamReceiver().getResidencyAllocations().begin(), getGpgpuCommandStreamReceiver().getResidencyAllocations().end(), [](const auto allocation) { return allocation->isFlushL3Required(); })) {
856             allocNeedsFlushDC = true;
857         }
858     }
859 
860     auto memoryCompressionState = getGpgpuCommandStreamReceiver().getMemoryCompressionState(auxTranslationRequired, device->getHardwareInfo());
861 
862     DispatchFlags dispatchFlags(
863         {},                                                                                         //csrDependencies
864         &timestampPacketDependencies.barrierNodes,                                                  //barrierTimestampPacketNodes
865         {},                                                                                         //pipelineSelectArgs
866         this->flushStamp->getStampReference(),                                                      //flushStampReference
867         getThrottle(),                                                                              //throttle
868         ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo),                     //preemptionMode
869         numGrfRequired,                                                                             //numGrfRequired
870         L3CachingSettings::l3CacheOn,                                                               //l3CacheSettings
871         kernel->getThreadArbitrationPolicy(),                                                       //threadArbitrationPolicy
872         kernel->getAdditionalKernelExecInfo(),                                                      //additionalKernelExecInfo
873         kernel->getExecutionType(),                                                                 //kernelExecutionType
874         memoryCompressionState,                                                                     //memoryCompressionState
875         getSliceCount(),                                                                            //sliceCount
876         blocking,                                                                                   //blocking
877         shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC,                             //dcFlush
878         multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel(),                        //useSLM
879         true,                                                                                       //guardCommandBufferWithPipeControl
880         commandType == CL_COMMAND_NDRANGE_KERNEL,                                                   //GSBA32BitRequired
881         requiresCoherency,                                                                          //requiresCoherency
882         (QueuePriority::LOW == priority),                                                           //lowPriority
883         implicitFlush,                                                                              //implicitFlush
884         !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
885         false,                                                                                      //epilogueRequired
886         false,                                                                                      //usePerDssBackedBuffer
887         kernel->isSingleSubdevicePreferred(),                                                       //useSingleSubdevice
888         useGlobalAtomics,                                                                           //useGlobalAtomics
889         kernel->areMultipleSubDevicesInContext(),                                                   //areMultipleSubDevicesInContext
890         kernel->requiresMemoryMigration(),                                                          //memoryMigrationRequired
891         isTextureCacheFlushNeeded(commandType));                                                    //textureCacheFlush
892 
893     dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
894     dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
895 
896     if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) {
897         eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
898         dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
899     }
900 
901     DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);
902 
903     if (anyUncacheableArgs) {
904         dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff;
905     } else if (!kernel->areStatelessWritesUsed()) {
906         dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On;
907     }
908 
909     if (this->dispatchHints != 0) {
910         dispatchFlags.engineHints = this->dispatchHints;
911         dispatchFlags.epilogueRequired = true;
912     }
913 
914     if (gtpinIsGTPinInitialized()) {
915         gtpinNotifyPreFlushTask(this);
916     }
917 
918     if (enqueueProperties.blitPropertiesContainer->size() > 0) {
919         const auto newTaskCount = bcsCsr->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
920         this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
921         dispatchFlags.implicitFlush = true;
922     }
923 
924     PRINT_DEBUG_STRING(DebugManager.flags.PrintDebugMessages.get(), stdout, "preemption = %d.\n", static_cast<int>(dispatchFlags.preemptionMode));
925     CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
926         commandStream,
927         commandStreamStart,
928         *dsh,
929         *ioh,
930         getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
931         taskLevel,
932         dispatchFlags,
933         getDevice());
934 
935     if (gtpinIsGTPinInitialized()) {
936         gtpinNotifyFlushTask(completionStamp.taskCount);
937     }
938 
939     return completionStamp;
940 }
941 
942 template <typename GfxFamily>
enqueueBlocked(uint32_t commandType,Surface ** surfaces,size_t surfaceCount,const MultiDispatchInfo & multiDispatchInfo,TimestampPacketDependencies & timestampPacketDependencies,std::unique_ptr<KernelOperation> & blockedCommandsData,const EnqueueProperties & enqueueProperties,EventsRequest & eventsRequest,EventBuilder & externalEventBuilder,std::unique_ptr<PrintfHandler> && printfHandler,CommandStreamReceiver * bcsCsr)943 void CommandQueueHw<GfxFamily>::enqueueBlocked(
944     uint32_t commandType,
945     Surface **surfaces,
946     size_t surfaceCount,
947     const MultiDispatchInfo &multiDispatchInfo,
948     TimestampPacketDependencies &timestampPacketDependencies,
949     std::unique_ptr<KernelOperation> &blockedCommandsData,
950     const EnqueueProperties &enqueueProperties,
951     EventsRequest &eventsRequest,
952     EventBuilder &externalEventBuilder,
953     std::unique_ptr<PrintfHandler> &&printfHandler,
954     CommandStreamReceiver *bcsCsr) {
955 
956     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
957 
958     //store previous virtual event as it will add dependecies to new virtual event
959     if (this->virtualEvent) {
960         DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
961     }
962 
963     EventBuilder internalEventBuilder;
964     EventBuilder *eventBuilder;
965     // check if event will be exposed externally
966     if (externalEventBuilder.getEvent()) {
967         externalEventBuilder.getEvent()->incRefInternal();
968         eventBuilder = &externalEventBuilder;
969         DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
970     } else {
971         // it will be an internal event
972         internalEventBuilder.create<VirtualEvent>(this, context);
973         eventBuilder = &internalEventBuilder;
974         DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
975     }
976     auto outEvent = eventBuilder->getEvent();
977 
978     //update queue taskCount
979     taskCount = outEvent->getCompletionStamp();
980 
981     std::unique_ptr<Command> command;
982     bool storeTimestampPackets = false;
983 
984     if (blockedCommandsData) {
985         if (enqueueProperties.blitPropertiesContainer) {
986             blockedCommandsData->blitPropertiesContainer = *enqueueProperties.blitPropertiesContainer;
987             blockedCommandsData->bcsCsr = bcsCsr;
988             blockedCommandsData->blitEnqueue = true;
989         }
990 
991         storeTimestampPackets = (timestampPacketContainer != nullptr);
992     }
993 
994     if (enqueueProperties.operation != EnqueueProperties::Operation::GpuKernel) {
995         command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
996     } else {
997         //store task data in event
998         std::vector<Surface *> allSurfaces;
999         Kernel *kernel = nullptr;
1000         for (auto &dispatchInfo : multiDispatchInfo) {
1001             if (kernel != dispatchInfo.getKernel()) {
1002                 kernel = dispatchInfo.getKernel();
1003             } else {
1004                 continue;
1005             }
1006             kernel->getResidency(allSurfaces);
1007         }
1008         for (auto &surface : CreateRange(surfaces, surfaceCount)) {
1009             allSurfaces.push_back(surface->duplicate());
1010         }
1011 
1012         PreemptionMode preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
1013         bool slmUsed = multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel();
1014         command = std::make_unique<CommandComputeKernel>(*this,
1015                                                          blockedCommandsData,
1016                                                          allSurfaces,
1017                                                          shouldFlushDC(commandType, printfHandler.get()),
1018                                                          slmUsed,
1019                                                          commandType,
1020                                                          std::move(printfHandler),
1021                                                          preemptionMode,
1022                                                          multiDispatchInfo.peekMainKernel(),
1023                                                          (uint32_t)multiDispatchInfo.size());
1024     }
1025     if (storeTimestampPackets) {
1026         command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
1027         command->setEventsRequest(eventsRequest);
1028     } else if (this->context->getRootDeviceIndices().size() > 1) {
1029         command->setEventsRequest(eventsRequest);
1030     }
1031 
1032     outEvent->setCommand(std::move(command));
1033 
1034     eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
1035     eventBuilder->addParentEvent(this->virtualEvent);
1036     eventBuilder->finalize();
1037 
1038     if (this->virtualEvent) {
1039         this->virtualEvent->decRefInternal();
1040     }
1041 
1042     this->virtualEvent = outEvent;
1043 }
1044 
1045 template <typename GfxFamily>
enqueueCommandWithoutKernel(Surface ** surfaces,size_t surfaceCount,LinearStream * commandStream,size_t commandStreamStart,bool & blocking,const EnqueueProperties & enqueueProperties,TimestampPacketDependencies & timestampPacketDependencies,EventsRequest & eventsRequest,EventBuilder & eventBuilder,uint32_t taskLevel,CsrDependencies & csrDeps,CommandStreamReceiver * bcsCsr)1046 CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
1047     Surface **surfaces,
1048     size_t surfaceCount,
1049     LinearStream *commandStream,
1050     size_t commandStreamStart,
1051     bool &blocking,
1052     const EnqueueProperties &enqueueProperties,
1053     TimestampPacketDependencies &timestampPacketDependencies,
1054     EventsRequest &eventsRequest,
1055     EventBuilder &eventBuilder,
1056     uint32_t taskLevel,
1057     CsrDependencies &csrDeps,
1058     CommandStreamReceiver *bcsCsr) {
1059 
1060     CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
1061     bool flushGpgpuCsr = true;
1062 
1063     if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false)) {
1064         flushGpgpuCsr = false;
1065     } else {
1066         csrDeps.makeResident(getGpgpuCommandStreamReceiver());
1067     }
1068 
1069     if (eventBuilder.getEvent() && isProfilingEnabled()) {
1070         eventBuilder.getEvent()->setSubmitTimeStamp();
1071         eventBuilder.getEvent()->setStartTimeStamp();
1072     }
1073 
1074     if (flushGpgpuCsr) {
1075         if (timestampPacketContainer) {
1076             timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
1077             timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
1078             timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
1079         }
1080 
1081         for (auto surface : CreateRange(surfaces, surfaceCount)) {
1082             surface->makeResident(getGpgpuCommandStreamReceiver());
1083         }
1084 
1085         auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1086         DispatchFlags dispatchFlags(
1087             {},                                                                  //csrDependencies
1088             &timestampPacketDependencies.barrierNodes,                           //barrierTimestampPacketNodes
1089             {},                                                                  //pipelineSelectArgs
1090             flushStamp->getStampReference(),                                     //flushStampReference
1091             getThrottle(),                                                       //throttle
1092             device->getPreemptionMode(),                                         //preemptionMode
1093             GrfConfig::NotApplicable,                                            //numGrfRequired
1094             L3CachingSettings::NotApplicable,                                    //l3CacheSettings
1095             ThreadArbitrationPolicy::NotPresent,                                 //threadArbitrationPolicy
1096             AdditionalKernelExecInfo::NotApplicable,                             //additionalKernelExecInfo
1097             KernelExecutionType::NotApplicable,                                  //kernelExecutionType
1098             MemoryCompressionState::NotApplicable,                               //memoryCompressionState
1099             getSliceCount(),                                                     //sliceCount
1100             blocking,                                                            //blocking
1101             false,                                                               //dcFlush
1102             false,                                                               //useSLM
1103             true,                                                                //guardCommandBufferWithPipeControl
1104             false,                                                               //GSBA32BitRequired
1105             false,                                                               //requiresCoherency
1106             false,                                                               //lowPriority
1107             (enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
1108             getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(),      //outOfOrderExecutionAllowed
1109             false,                                                               //epilogueRequired
1110             false,                                                               //usePerDssBackedBuffer
1111             false,                                                               //useSingleSubdevice
1112             false,                                                               //useGlobalAtomics
1113             context->containsMultipleSubDevices(rootDeviceIndex),                //areMultipleSubDevicesInContext
1114             false,                                                               //memoryMigrationRequired
1115             false);                                                              //textureCacheFlush
1116 
1117         if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
1118             eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
1119             dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
1120         }
1121 
1122         completionStamp = getGpgpuCommandStreamReceiver().flushTask(
1123             *commandStream,
1124             commandStreamStart,
1125             getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
1126             getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
1127             getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
1128             taskLevel,
1129             dispatchFlags,
1130             getDevice());
1131     }
1132 
1133     if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
1134         UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
1135         const auto newTaskCount = bcsCsr->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
1136         this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
1137     }
1138 
1139     return completionStamp;
1140 }
1141 
1142 template <typename GfxFamily>
computeOffsetsValueForRectCommands(size_t * bufferOffset,size_t * hostOffset,const size_t * bufferOrigin,const size_t * hostOrigin,const size_t * region,size_t bufferRowPitch,size_t bufferSlicePitch,size_t hostRowPitch,size_t hostSlicePitch)1143 void CommandQueueHw<GfxFamily>::computeOffsetsValueForRectCommands(size_t *bufferOffset,
1144                                                                    size_t *hostOffset,
1145                                                                    const size_t *bufferOrigin,
1146                                                                    const size_t *hostOrigin,
1147                                                                    const size_t *region,
1148                                                                    size_t bufferRowPitch,
1149                                                                    size_t bufferSlicePitch,
1150                                                                    size_t hostRowPitch,
1151                                                                    size_t hostSlicePitch) {
1152     size_t computedBufferRowPitch = bufferRowPitch ? bufferRowPitch : region[0];
1153     size_t computedBufferSlicePitch = bufferSlicePitch ? bufferSlicePitch : region[1] * computedBufferRowPitch;
1154     size_t computedHostRowPitch = hostRowPitch ? hostRowPitch : region[0];
1155     size_t computedHostSlicePitch = hostSlicePitch ? hostSlicePitch : region[1] * computedHostRowPitch;
1156     *bufferOffset = bufferOrigin[2] * computedBufferSlicePitch + bufferOrigin[1] * computedBufferRowPitch + bufferOrigin[0];
1157     *hostOffset = hostOrigin[2] * computedHostSlicePitch + hostOrigin[1] * computedHostRowPitch + hostOrigin[0];
1158 }
1159 
1160 template <typename GfxFamily>
calculateHostPtrSizeForImage(const size_t * region,size_t rowPitch,size_t slicePitch,Image * image)1161 size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) {
1162     auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
1163     auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel;
1164     auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch);
1165 
1166     return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
1167 }
1168 
1169 template <typename GfxFamily>
1170 template <uint32_t cmdType>
enqueueBlit(const MultiDispatchInfo & multiDispatchInfo,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,bool blocking,CommandStreamReceiver & bcsCsr)1171 void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
1172     auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
1173 
1174     EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
1175     EventBuilder eventBuilder;
1176 
1177     setupEvent(eventBuilder, eventsRequest.outEvent, cmdType);
1178     eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
1179 
1180     std::unique_ptr<KernelOperation> blockedCommandsData;
1181     TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
1182 
1183     auto blockQueue = false;
1184     auto taskLevel = 0u;
1185     obtainTaskLevelAndBlockedStatus(taskLevel, eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, blockQueue, cmdType);
1186     auto clearAllDependencies = queueDependenciesClearRequired();
1187 
1188     enqueueHandlerHook(cmdType, multiDispatchInfo);
1189     aubCaptureHook(blocking, clearAllDependencies, multiDispatchInfo);
1190 
1191     if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
1192         blocking = true;
1193     }
1194 
1195     TimestampPacketDependencies timestampPacketDependencies;
1196     BlitPropertiesContainer blitPropertiesContainer;
1197     CsrDependencies csrDeps;
1198 
1199     eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
1200     auto allocator = bcsCsr.getTimestampPacketAllocator();
1201 
1202     if (isCacheFlushForBcsRequired() && isGpgpuSubmissionForBcsRequired(blockQueue)) {
1203         timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
1204     }
1205 
1206     if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) {
1207         timestampPacketDependencies.barrierNodes.add(allocator->getTag());
1208     }
1209 
1210     obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
1211     csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
1212 
1213     LinearStream *gpgpuCommandStream = {};
1214     size_t gpgpuCommandStreamStart = {};
1215     if (isGpgpuSubmissionForBcsRequired(blockQueue)) {
1216         gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false);
1217         gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
1218     }
1219 
1220     if (eventBuilder.getEvent()) {
1221         eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
1222     }
1223 
1224     blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
1225                                                                     eventsRequest, gpgpuCommandStream, cmdType, blockQueue));
1226 
1227     CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
1228 
1229     const EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
1230 
1231     if (!blockQueue) {
1232         completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
1233                                                       enqueueProperties, timestampPacketDependencies, eventsRequest,
1234                                                       eventBuilder, taskLevel, csrDeps, &bcsCsr);
1235 
1236         if (eventBuilder.getEvent()) {
1237             eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
1238         }
1239 
1240         this->latestSentEnqueueType = enqueueProperties.operation;
1241     }
1242     updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
1243 
1244     if (blockQueue) {
1245         enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
1246     }
1247 
1248     timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
1249 
1250     queueOwnership.unlock();
1251     commandStreamReceiverOwnership.unlock();
1252 
1253     if (blocking) {
1254         waitForAllEngines(blockQueue, nullptr);
1255     }
1256 }
1257 
1258 template <typename GfxFamily>
1259 template <uint32_t cmdType, size_t surfaceCount>
dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo & dispatchInfo,Surface * (& surfaces)[surfaceCount],EBuiltInOps::Type builtInOperation,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,bool blocking,CommandStreamReceiver & csr)1260 void CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr) {
1261     const bool blit = EngineHelpers::isBcs(csr.getOsContext().getEngineType());
1262 
1263     if (blit) {
1264         enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
1265     } else {
1266         auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInOperation,
1267                                                                                 this->getClDevice());
1268         BuiltInOwnershipWrapper builtInLock(builder, this->context);
1269 
1270         builder.buildDispatchInfos(dispatchInfo);
1271 
1272         enqueueHandler<cmdType>(
1273             surfaces,
1274             blocking,
1275             dispatchInfo,
1276             numEventsInWaitList,
1277             eventWaitList,
1278             event);
1279     }
1280 }
1281 
1282 template <typename GfxFamily>
isBlitAuxTranslationRequired(const MultiDispatchInfo & multiDispatchInfo)1283 bool CommandQueueHw<GfxFamily>::isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo) {
1284     return multiDispatchInfo.getKernelObjsForAuxTranslation() &&
1285            (multiDispatchInfo.getKernelObjsForAuxTranslation()->size() > 0) &&
1286            (HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo()) == AuxTranslationMode::Blit);
1287 }
1288 
1289 } // namespace NEO
1290