1 /*
2 * Copyright (C) 2018-2021 Intel Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 */
7
8 #pragma once
9 #include "shared/source/built_ins/built_ins.h"
10 #include "shared/source/command_stream/command_stream_receiver.h"
11 #include "shared/source/helpers/array_count.h"
12 #include "shared/source/helpers/engine_node_helper.h"
13 #include "shared/source/helpers/local_work_size.h"
14 #include "shared/source/helpers/pipe_control_args.h"
15 #include "shared/source/memory_manager/internal_allocation_storage.h"
16 #include "shared/source/memory_manager/memory_manager.h"
17 #include "shared/source/memory_manager/surface.h"
18 #include "shared/source/os_interface/os_context.h"
19 #include "shared/source/program/sync_buffer_handler.h"
20 #include "shared/source/program/sync_buffer_handler.inl"
21 #include "shared/source/utilities/range.h"
22 #include "shared/source/utilities/tag_allocator.h"
23
24 #include "opencl/source/built_ins/builtins_dispatch_builder.h"
25 #include "opencl/source/builtin_kernels_simulation/scheduler_simulation.h"
26 #include "opencl/source/command_queue/command_queue_hw.h"
27 #include "opencl/source/command_queue/gpgpu_walker.h"
28 #include "opencl/source/command_queue/hardware_interface.h"
29 #include "opencl/source/event/event_builder.h"
30 #include "opencl/source/event/user_event.h"
31 #include "opencl/source/gtpin/gtpin_notify.h"
32 #include "opencl/source/helpers/cl_blit_properties.h"
33 #include "opencl/source/helpers/cl_hw_helper.h"
34 #include "opencl/source/helpers/cl_preemption_helper.h"
35 #include "opencl/source/helpers/dispatch_info_builder.h"
36 #include "opencl/source/helpers/enqueue_properties.h"
37 #include "opencl/source/helpers/hardware_commands_helper.h"
38 #include "opencl/source/helpers/task_information.h"
39 #include "opencl/source/mem_obj/buffer.h"
40 #include "opencl/source/mem_obj/image.h"
41 #include "opencl/source/memory_manager/migration_controller.h"
42 #include "opencl/source/program/block_kernel_manager.h"
43 #include "opencl/source/program/printf_handler.h"
44 #include "opencl/source/utilities/cl_logger.h"
45
46 #include <algorithm>
47 #include <new>
48
49 namespace NEO {
50
51 template <typename GfxFamily>
52 template <uint32_t commandType, size_t surfaceCount>
enqueueHandler(Surface * (& surfaces)[surfaceCount],bool blocking,Kernel * kernel,cl_uint workDim,const size_t globalOffsets[3],const size_t workItems[3],const size_t * localWorkSizesIn,const size_t * enqueuedWorkSizes,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)53 void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount],
54 bool blocking,
55 Kernel *kernel,
56 cl_uint workDim,
57 const size_t globalOffsets[3],
58 const size_t workItems[3],
59 const size_t *localWorkSizesIn,
60 const size_t *enqueuedWorkSizes,
61 cl_uint numEventsInWaitList,
62 const cl_event *eventWaitList,
63 cl_event *event) {
64 BuiltInOwnershipWrapper builtInLock;
65 KernelObjsForAuxTranslation kernelObjsForAuxTranslation;
66 MultiDispatchInfo multiDispatchInfo(kernel);
67
68 auto auxTranslationMode = AuxTranslationMode::None;
69
70 if (DebugManager.flags.ForceDispatchScheduler.get()) {
71 forceDispatchScheduler(multiDispatchInfo);
72 } else {
73
74 kernel->updateAuxTranslationRequired();
75 if (kernel->isAuxTranslationRequired()) {
76 kernel->fillWithKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
77 multiDispatchInfo.setKernelObjsForAuxTranslation(kernelObjsForAuxTranslation);
78
79 if (!kernelObjsForAuxTranslation.empty()) {
80 auxTranslationMode = HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo());
81 }
82 }
83
84 if (AuxTranslationMode::Builtin == auxTranslationMode) {
85 auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getClDevice());
86 builtInLock.takeOwnership(builder, this->context);
87
88 dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::AuxToNonAux);
89 }
90
91 if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
92 DispatchInfoBuilder<SplitDispatch::Dim::d3D, SplitDispatch::SplitMode::WalkerSplit> builder(getClDevice());
93 builder.setDispatchGeometry(workDim, workItems, enqueuedWorkSizes, globalOffsets, Vec3<size_t>{0, 0, 0}, localWorkSizesIn);
94 builder.setKernel(kernel);
95 builder.bake(multiDispatchInfo);
96 } else {
97 auto builder = kernel->getKernelInfo().builtinDispatchBuilder;
98 builder->buildDispatchInfos(multiDispatchInfo, kernel, workDim, workItems, enqueuedWorkSizes, globalOffsets);
99
100 if (multiDispatchInfo.size() == 0) {
101 return;
102 }
103 }
104
105 if (AuxTranslationMode::Builtin == auxTranslationMode) {
106 UNRECOVERABLE_IF(kernel->isParentKernel);
107 dispatchAuxTranslationBuiltin(multiDispatchInfo, AuxTranslationDirection::NonAuxToAux);
108 }
109 }
110
111 if (AuxTranslationMode::Blit == auxTranslationMode) {
112 setupBlitAuxTranslation(multiDispatchInfo);
113 }
114
115 enqueueHandler<commandType>(surfaces, blocking, multiDispatchInfo, numEventsInWaitList, eventWaitList, event);
116 }
117
118 template <typename GfxFamily>
forceDispatchScheduler(NEO::MultiDispatchInfo & multiDispatchInfo)119 void CommandQueueHw<GfxFamily>::forceDispatchScheduler(NEO::MultiDispatchInfo &multiDispatchInfo) {
120 SchedulerKernel &scheduler = getContext().getSchedulerKernel();
121
122 auto devQueue = this->getContext().getDefaultDeviceQueue();
123 DeviceQueueHw<GfxFamily> *devQueueHw = castToObjectOrAbort<DeviceQueueHw<GfxFamily>>(devQueue);
124
125 DispatchInfo dispatchInfo(devQueue->getClDevice(), &scheduler, 1, Vec3<size_t>(scheduler.getGws(), 1, 1), Vec3<size_t>(scheduler.getLws(), 1, 1), Vec3<size_t>(0, 0, 0));
126 Vec3<size_t> workGroupCount = generateWorkgroupsNumber(dispatchInfo.getGWS(), dispatchInfo.getEnqueuedWorkgroupSize());
127 dispatchInfo.setTotalNumberOfWorkgroups(workGroupCount);
128 dispatchInfo.setNumberOfWorkgroups(workGroupCount);
129
130 scheduler.createReflectionSurface();
131 GraphicsAllocation *reflectionSurface = scheduler.getKernelReflectionSurface();
132
133 devQueueHw->resetDeviceQueue();
134
135 scheduler.setArgs(devQueueHw->getQueueBuffer(),
136 devQueueHw->getStackBuffer(),
137 devQueueHw->getEventPoolBuffer(),
138 devQueueHw->getSlbBuffer(),
139 devQueueHw->getDshBuffer(),
140 reflectionSurface,
141 devQueueHw->getQueueStorageBuffer(),
142 this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation());
143
144 multiDispatchInfo.push(dispatchInfo);
145 }
146
147 template <typename GfxFamily>
148 template <uint32_t commandType>
enqueueHandler(Surface ** surfacesForResidency,size_t numSurfaceForResidency,bool blocking,const MultiDispatchInfo & multiDispatchInfo,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event)149 void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
150 size_t numSurfaceForResidency,
151 bool blocking,
152 const MultiDispatchInfo &multiDispatchInfo,
153 cl_uint numEventsInWaitList,
154 const cl_event *eventWaitList,
155 cl_event *event) {
156 if (multiDispatchInfo.empty() && !isCommandWithoutKernel(commandType)) {
157 enqueueHandler<CL_COMMAND_MARKER>(nullptr, 0, blocking, multiDispatchInfo,
158 numEventsInWaitList, eventWaitList, event);
159 if (event) {
160 castToObjectOrAbort<Event>(*event)->setCmdType(commandType);
161 }
162 return;
163 }
164
165 Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
166 auto devQueue = this->getContext().getDefaultDeviceQueue();
167 DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
168
169 TagNodeBase *hwTimeStamps = nullptr;
170 CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
171 auto commandStreamReceiverOwnership = computeCommandStreamReceiver.obtainUniqueOwnership();
172
173 EventBuilder eventBuilder;
174 setupEvent(eventBuilder, event, commandType);
175
176 bool isMarkerWithProfiling = (CL_COMMAND_MARKER == commandType) && (eventBuilder.getEvent() && eventBuilder.getEvent()->isProfilingEnabled());
177
178 std::unique_ptr<KernelOperation> blockedCommandsData;
179 std::unique_ptr<PrintfHandler> printfHandler;
180 TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
181
182 auto blockQueue = false;
183 auto taskLevel = 0u;
184 obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);
185
186 if (parentKernel && !blockQueue) {
187 while (!devQueueHw->isEMCriticalSectionFree())
188 ;
189 }
190
191 enqueueHandlerHook(commandType, multiDispatchInfo);
192
193 bool clearDependenciesForSubCapture = false;
194 aubCaptureHook(blocking, clearDependenciesForSubCapture, multiDispatchInfo);
195
196 bool clearAllDependencies = (queueDependenciesClearRequired() || clearDependenciesForSubCapture);
197
198 if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
199 blocking = true;
200 }
201
202 TimestampPacketDependencies timestampPacketDependencies;
203 EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
204 CsrDependencies csrDeps;
205 BlitPropertiesContainer blitPropertiesContainer;
206
207 if (this->context->getRootDeviceIndices().size() > 1) {
208 eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, computeCommandStreamReceiver);
209 }
210
211 bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
212
213 if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
214 if (!clearDependenciesForSubCapture) {
215 eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, computeCommandStreamReceiver, CsrDependencies::DependenciesType::OnCsr);
216 }
217
218 auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();
219
220 size_t nodesCount = 0u;
221 if (isCacheFlushCommand(commandType) || isMarkerWithProfiling) {
222 nodesCount = 1;
223 } else if (!multiDispatchInfo.empty()) {
224 nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
225 }
226
227 if (isCacheFlushForBcsRequired() && enqueueWithBlitAuxTranslation) {
228 // Cache flush for aux translation is always required (if supported)
229 timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
230 }
231
232 if (nodesCount > 0) {
233 obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, computeCommandStreamReceiver);
234 csrDeps.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes);
235 }
236 }
237
238 auto &commandStream = *obtainCommandStream<commandType>(csrDeps, false, blockQueue, multiDispatchInfo, eventsRequest,
239 blockedCommandsData, surfacesForResidency, numSurfaceForResidency, isMarkerWithProfiling);
240 auto commandStreamStart = commandStream.getUsed();
241
242 if (this->context->getRootDeviceIndices().size() > 1) {
243 TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
244 }
245
246 if (enqueueWithBlitAuxTranslation) {
247 processDispatchForBlitAuxTranslation(*getBcsForAuxTranslation(), multiDispatchInfo, blitPropertiesContainer,
248 timestampPacketDependencies, eventsRequest, blockQueue);
249 }
250
251 if (eventBuilder.getEvent() && computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
252 eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
253 eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes);
254 eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.auxToNonAuxNodes);
255 }
256
257 bool flushDependenciesForNonKernelCommand = false;
258
259 if (multiDispatchInfo.empty() == false) {
260 processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
261 hwTimeStamps, blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(),
262 timestampPacketDependencies);
263 } else if (isCacheFlushCommand(commandType)) {
264 processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
265 } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
266 if (CL_COMMAND_BARRIER == commandType) {
267 computeCommandStreamReceiver.requestStallingCommandsOnNextFlush();
268 }
269
270 for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
271 auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
272 if (waitlistEvent->getTimestampPacketNodes()) {
273 flushDependenciesForNonKernelCommand = true;
274 if (eventBuilder.getEvent()) {
275 eventBuilder.getEvent()->addTimestampPacketNodes(*waitlistEvent->getTimestampPacketNodes());
276 }
277 }
278 }
279
280 if (flushDependenciesForNonKernelCommand) {
281 TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps);
282 }
283
284 if (isMarkerWithProfiling) {
285 if (numEventsInWaitList == 0) {
286 computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
287 }
288 processDispatchForMarkerWithTimestampPacket(*this, &commandStream, eventsRequest, csrDeps);
289 }
290 } else if (isMarkerWithProfiling) {
291 processDispatchForMarker(*this, &commandStream, eventsRequest, csrDeps);
292 }
293
294 CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
295 const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType),
296 flushDependenciesForNonKernelCommand, isMarkerWithProfiling, &blitPropertiesContainer);
297
298 bool migratedMemory = false;
299
300 if (!blockQueue && multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->requiresMemoryMigration()) {
301 for (auto &arg : multiDispatchInfo.peekMainKernel()->getMemObjectsToMigrate()) {
302 MigrationController::handleMigration(*this->context, computeCommandStreamReceiver, arg.second);
303 migratedMemory = true;
304 }
305 }
306 if (!blockQueue) {
307 if (parentKernel) {
308 processDeviceEnqueue(devQueueHw, multiDispatchInfo, hwTimeStamps, blocking);
309 }
310
311 if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) {
312 csrDeps.makeResident(computeCommandStreamReceiver);
313
314 completionStamp = enqueueNonBlocked<commandType>(
315 surfacesForResidency,
316 numSurfaceForResidency,
317 commandStream,
318 commandStreamStart,
319 blocking,
320 clearDependenciesForSubCapture,
321 multiDispatchInfo,
322 enqueueProperties,
323 timestampPacketDependencies,
324 eventsRequest,
325 eventBuilder,
326 taskLevel,
327 printfHandler.get(),
328 getBcsForAuxTranslation());
329
330 if (parentKernel) {
331 computeCommandStreamReceiver.setMediaVFEStateDirty(true);
332
333 if (devQueueHw->getSchedulerReturnInstance() > 0) {
334 waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
335 this->runSchedulerSimulation(*devQueueHw, *parentKernel);
336 }
337 }
338 } else if (enqueueProperties.isFlushWithoutKernelRequired()) {
339 completionStamp = enqueueCommandWithoutKernel(
340 surfacesForResidency,
341 numSurfaceForResidency,
342 &commandStream,
343 commandStreamStart,
344 blocking,
345 enqueueProperties,
346 timestampPacketDependencies,
347 eventsRequest,
348 eventBuilder,
349 taskLevel,
350 csrDeps,
351 nullptr);
352 } else {
353 UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::EnqueueWithoutSubmission);
354
355 auto maxTaskCountCurrentRootDevice = this->taskCount;
356
357 for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
358 auto event = castToObject<Event>(eventWaitList[eventId]);
359
360 if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) {
361 maxTaskCountCurrentRootDevice = std::max(maxTaskCountCurrentRootDevice, event->peekTaskCount());
362 }
363 }
364
365 //inherit data from event_wait_list and previous packets
366 completionStamp.flushStamp = this->flushStamp->peekStamp();
367 completionStamp.taskCount = maxTaskCountCurrentRootDevice;
368 completionStamp.taskLevel = taskLevel;
369
370 if (eventBuilder.getEvent() && isProfilingEnabled()) {
371 eventBuilder.getEvent()->setSubmitTimeStamp();
372 eventBuilder.getEvent()->setStartTimeStamp();
373 }
374 }
375 if (eventBuilder.getEvent()) {
376 eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
377 }
378
379 this->latestSentEnqueueType = enqueueProperties.operation;
380 }
381 updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
382
383 if (blockQueue) {
384 if (parentKernel) {
385 size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
386 blockedCommandsData->surfaceStateHeapSizeEM = minSizeSSHForEM;
387 }
388
389 enqueueBlocked(commandType,
390 surfacesForResidency,
391 numSurfaceForResidency,
392 multiDispatchInfo,
393 timestampPacketDependencies,
394 blockedCommandsData,
395 enqueueProperties,
396 eventsRequest,
397 eventBuilder,
398 std::move(printfHandler),
399 nullptr);
400 }
401
402 if (deferredTimestampPackets.get()) {
403 timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
404 }
405
406 queueOwnership.unlock();
407 commandStreamReceiverOwnership.unlock();
408
409 if (blocking) {
410 auto &builtinOpParams = multiDispatchInfo.peekBuiltinOpParams();
411 if (builtinOpParams.userPtrForPostOperationCpuCopy) {
412 waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), false);
413 auto hostPtrAlloc = builtinOpParams.transferAllocation;
414 UNRECOVERABLE_IF(nullptr == hostPtrAlloc);
415 auto size = hostPtrAlloc->getUnderlyingBufferSize();
416 [[maybe_unused]] int cpuCopyStatus = memcpy_s(builtinOpParams.userPtrForPostOperationCpuCopy, size, hostPtrAlloc->getUnderlyingBuffer(), size);
417 DEBUG_BREAK_IF(cpuCopyStatus != 0);
418 waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
419 } else {
420 waitForAllEngines(blockQueue, (blockQueue ? nullptr : printfHandler.get()), true);
421 }
422 }
423 if (migratedMemory) {
424 computeCommandStreamReceiver.flushBatchedSubmissions();
425 }
426 }
427
428 template <typename GfxFamily>
429 template <uint32_t commandType>
processDispatchForKernels(const MultiDispatchInfo & multiDispatchInfo,std::unique_ptr<PrintfHandler> & printfHandler,Event * event,TagNodeBase * & hwTimeStamps,bool blockQueue,DeviceQueueHw<GfxFamily> * devQueueHw,CsrDependencies & csrDeps,KernelOperation * blockedCommandsData,TimestampPacketDependencies & timestampPacketDependencies)430 void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInfo &multiDispatchInfo,
431 std::unique_ptr<PrintfHandler> &printfHandler,
432 Event *event,
433 TagNodeBase *&hwTimeStamps,
434 bool blockQueue,
435 DeviceQueueHw<GfxFamily> *devQueueHw,
436 CsrDependencies &csrDeps,
437 KernelOperation *blockedCommandsData,
438 TimestampPacketDependencies ×tampPacketDependencies) {
439 TagNodeBase *hwPerfCounter = nullptr;
440 getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
441
442 printfHandler.reset(PrintfHandler::create(multiDispatchInfo, *device));
443 if (printfHandler) {
444 printfHandler->prepareDispatch(multiDispatchInfo);
445 }
446
447 if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
448 auto &gws = multiDispatchInfo.begin()->getGWS();
449 auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
450 size_t workGroupsCount = (gws.x * gws.y * gws.z) /
451 (lws.x * lws.y * lws.z);
452 device->getDevice().syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel());
453 }
454
455 if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
456 if (multiDispatchInfo.peekMainKernel()->isKernelDebugEnabled()) {
457 setupDebugSurface(multiDispatchInfo.peekMainKernel());
458 }
459 }
460
461 if (event && this->isProfilingEnabled()) {
462 // Get allocation for timestamps
463 hwTimeStamps = event->getHwTimeStampNode();
464 }
465
466 if (auto parentKernel = multiDispatchInfo.peekParentKernel()) {
467 parentKernel->createReflectionSurface();
468 parentKernel->patchDefaultDeviceQueue(context->getDefaultDeviceQueue());
469 parentKernel->patchEventPool(context->getDefaultDeviceQueue());
470 parentKernel->patchReflectionSurface(context->getDefaultDeviceQueue(), printfHandler.get());
471 if (!blockQueue) {
472 devQueueHw->resetDeviceQueue();
473 devQueueHw->acquireEMCriticalSection();
474 }
475 }
476
477 if (event && this->isPerfCountersEnabled()) {
478 hwPerfCounter = event->getHwPerfCounterNode();
479 }
480
481 HardwareInterface<GfxFamily>::dispatchWalker(
482 *this,
483 multiDispatchInfo,
484 csrDeps,
485 blockedCommandsData,
486 hwTimeStamps,
487 hwPerfCounter,
488 ×tampPacketDependencies,
489 timestampPacketContainer.get(),
490 commandType);
491
492 if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
493 for (auto &dispatchInfo : multiDispatchInfo) {
494 for (auto &patchInfoData : dispatchInfo.getKernel()->getPatchInfoDataList()) {
495 getGpgpuCommandStreamReceiver().getFlatBatchBufferHelper().setPatchInfoData(patchInfoData);
496 }
497 }
498 }
499
500 getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(), multiDispatchInfo.getRequiredPrivateScratchSize());
501 }
502
503 template <typename GfxFamily>
processDispatchForBlitEnqueue(CommandStreamReceiver & blitCommandStreamReceiver,const MultiDispatchInfo & multiDispatchInfo,TimestampPacketDependencies & timestampPacketDependencies,const EventsRequest & eventsRequest,LinearStream * commandStream,uint32_t commandType,bool queueBlocked)504 BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(CommandStreamReceiver &blitCommandStreamReceiver,
505 const MultiDispatchInfo &multiDispatchInfo,
506 TimestampPacketDependencies ×tampPacketDependencies,
507 const EventsRequest &eventsRequest, LinearStream *commandStream,
508 uint32_t commandType, bool queueBlocked) {
509 auto blitDirection = ClBlitProperties::obtainBlitDirection(commandType);
510
511 auto blitProperties = ClBlitProperties::constructProperties(blitDirection, blitCommandStreamReceiver,
512 multiDispatchInfo.peekBuiltinOpParams());
513 if (!queueBlocked) {
514 eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, blitCommandStreamReceiver,
515 CsrDependencies::DependenciesType::All);
516
517 blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.cacheFlushNodes);
518 blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes);
519 blitProperties.csrDependencies.timestampPacketContainer.push_back(×tampPacketDependencies.barrierNodes);
520 }
521
522 auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
523 blitProperties.outputTimestampPacket = currentTimestampPacketNode;
524
525 if (commandStream) {
526 if (timestampPacketDependencies.cacheFlushNodes.peekNodes().size() > 0) {
527 auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
528 const auto &hwInfo = device->getHardwareInfo();
529 PipeControlArgs args;
530 args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed(true, hwInfo);
531 MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
532 *commandStream,
533 GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
534 cacheFlushTimestampPacketGpuAddress,
535 0,
536 hwInfo,
537 args);
538 }
539
540 TimestampPacketHelper::programSemaphore<GfxFamily>(*commandStream, *currentTimestampPacketNode);
541 }
542 return blitProperties;
543 }
544
545 template <typename GfxFamily>
processDispatchForBlitAuxTranslation(CommandStreamReceiver & bcsCsr,const MultiDispatchInfo & multiDispatchInfo,BlitPropertiesContainer & blitPropertiesContainer,TimestampPacketDependencies & timestampPacketDependencies,const EventsRequest & eventsRequest,bool queueBlocked)546 void CommandQueueHw<GfxFamily>::processDispatchForBlitAuxTranslation(CommandStreamReceiver &bcsCsr,
547 const MultiDispatchInfo &multiDispatchInfo,
548 BlitPropertiesContainer &blitPropertiesContainer,
549 TimestampPacketDependencies ×tampPacketDependencies,
550 const EventsRequest &eventsRequest, bool queueBlocked) {
551 auto rootDeviceIndex = getDevice().getRootDeviceIndex();
552 auto nodesAllocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
553 auto numKernelObjs = multiDispatchInfo.getKernelObjsForAuxTranslation()->size();
554 blitPropertiesContainer.resize(numKernelObjs * 2);
555
556 auto bufferIndex = 0;
557 for (auto &kernelObj : *multiDispatchInfo.getKernelObjsForAuxTranslation()) {
558 GraphicsAllocation *allocation = nullptr;
559 if (kernelObj.type == KernelObjForAuxTranslation::Type::MEM_OBJ) {
560 auto buffer = static_cast<Buffer *>(kernelObj.object);
561 allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
562 } else {
563 DEBUG_BREAK_IF(kernelObj.type != KernelObjForAuxTranslation::Type::GFX_ALLOC);
564 allocation = static_cast<GraphicsAllocation *>(kernelObj.object);
565 }
566 {
567 // Aux to NonAux
568 blitPropertiesContainer[bufferIndex] = BlitProperties::constructPropertiesForAuxTranslation(
569 AuxTranslationDirection::AuxToNonAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
570 auto auxToNonAuxNode = nodesAllocator->getTag();
571 timestampPacketDependencies.auxToNonAuxNodes.add(auxToNonAuxNode);
572 }
573
574 {
575 // NonAux to Aux
576 blitPropertiesContainer[bufferIndex + numKernelObjs] = BlitProperties::constructPropertiesForAuxTranslation(
577 AuxTranslationDirection::NonAuxToAux, allocation, getGpgpuCommandStreamReceiver().getClearColorAllocation());
578 auto nonAuxToAuxNode = nodesAllocator->getTag();
579 timestampPacketDependencies.nonAuxToAuxNodes.add(nonAuxToAuxNode);
580 }
581 bufferIndex++;
582 }
583
584 if (!queueBlocked) {
585 CsrDependencies csrDeps;
586 eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
587 BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies,
588 *this->timestampPacketContainer, csrDeps,
589 getGpgpuCommandStreamReceiver(), bcsCsr);
590 }
591
592 eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
593 }
594
595 template <typename GfxFamily>
processDispatchForCacheFlush(Surface ** surfaces,size_t numSurfaces,LinearStream * commandStream,CsrDependencies & csrDeps)596 void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
597 size_t numSurfaces,
598 LinearStream *commandStream,
599 CsrDependencies &csrDeps) {
600
601 TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDeps);
602
603 uint64_t postSyncAddress = 0;
604 if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
605 auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0);
606 timestampPacketNodeForPostSync->setProfilingCapable(false);
607 postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
608 }
609
610 submitCacheFlush(surfaces, numSurfaces, commandStream, postSyncAddress);
611 }
612
613 template <typename GfxFamily>
processDispatchForMarker(CommandQueue & commandQueue,LinearStream * commandStream,EventsRequest & eventsRequest,CsrDependencies & csrDeps)614 void CommandQueueHw<GfxFamily>::processDispatchForMarker(CommandQueue &commandQueue,
615 LinearStream *commandStream,
616 EventsRequest &eventsRequest,
617 CsrDependencies &csrDeps) {
618 auto event = castToObjectOrAbort<Event>(*eventsRequest.outEvent);
619
620 TagNodeBase *hwTimeStamps = nullptr;
621 TagNodeBase *hwPerfCounter = nullptr;
622
623 hwTimeStamps = event->getHwTimeStampNode();
624
625 HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
626 HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
627 getGpgpuCommandStreamReceiver().makeResident(*hwTimeStamps->getBaseGraphicsAllocation());
628 }
629
630 template <typename GfxFamily>
processDispatchForMarkerWithTimestampPacket(CommandQueue & commandQueue,LinearStream * commandStream,EventsRequest & eventsRequest,CsrDependencies & csrDeps)631 void CommandQueueHw<GfxFamily>::processDispatchForMarkerWithTimestampPacket(CommandQueue &commandQueue,
632 LinearStream *commandStream,
633 EventsRequest &eventsRequest,
634 CsrDependencies &csrDeps) {
635 auto currentTimestampPacketNode = commandQueue.getTimestampPacketContainer()->peekNodes().at(0);
636
637 auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(*currentTimestampPacketNode);
638 auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(*currentTimestampPacketNode);
639
640 EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress);
641 EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress);
642
643 auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*currentTimestampPacketNode);
644 auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(*currentTimestampPacketNode);
645
646 EncodeStoreMMIO<GfxFamily>::encode(*commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress);
647 EncodeStoreMMIO<GfxFamily>::encode(*commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress);
648 }
649
650 template <typename GfxFamily>
processDeviceEnqueue(DeviceQueueHw<GfxFamily> * devQueueHw,const MultiDispatchInfo & multiDispatchInfo,TagNodeBase * hwTimeStamps,bool & blocking)651 void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *devQueueHw,
652 const MultiDispatchInfo &multiDispatchInfo,
653 TagNodeBase *hwTimeStamps,
654 bool &blocking) {
655 auto parentKernel = multiDispatchInfo.peekParentKernel();
656 size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
657 bool isCcsUsed = EngineHelpers::isCcs(gpgpuEngine->osContext->getEngineType());
658
659 uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
660 devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
661 *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
662 parentKernel,
663 (uint32_t)multiDispatchInfo.size(),
664 getGpgpuCommandStreamReceiver().getTagAllocation()->getGpuAddress(),
665 taskCount,
666 hwTimeStamps,
667 isCcsUsed);
668
669 SchedulerKernel &scheduler = getContext().getSchedulerKernel();
670
671 scheduler.setArgs(devQueueHw->getQueueBuffer(),
672 devQueueHw->getStackBuffer(),
673 devQueueHw->getEventPoolBuffer(),
674 devQueueHw->getSlbBuffer(),
675 devQueueHw->getDshBuffer(),
676 parentKernel->getKernelReflectionSurface(),
677 devQueueHw->getQueueStorageBuffer(),
678 this->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u).getGraphicsAllocation(),
679 devQueueHw->getDebugQueue());
680
681 auto preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
682 GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
683 *this->commandStream,
684 *devQueueHw,
685 preemptionMode,
686 scheduler,
687 &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
688 devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
689 isCcsUsed);
690
691 scheduler.makeResident(getGpgpuCommandStreamReceiver());
692
693 parentKernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(getGpgpuCommandStreamReceiver());
694
695 if (parentKernel->isAuxTranslationRequired()) {
696 blocking = true;
697 }
698 }
699
700 template <typename GfxFamily>
obtainTaskLevelAndBlockedStatus(unsigned int & taskLevel,cl_uint & numEventsInWaitList,const cl_event * & eventWaitList,bool & blockQueueStatus,unsigned int commandType)701 void CommandQueueHw<GfxFamily>::obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) {
702 auto isQueueBlockedStatus = isQueueBlocked();
703 taskLevel = getTaskLevelFromWaitList(this->taskLevel, numEventsInWaitList, eventWaitList);
704 blockQueueStatus = (taskLevel == CompletionStamp::notReady) || isQueueBlockedStatus;
705
706 auto taskLevelUpdateRequired = isTaskLevelUpdateRequired(taskLevel, eventWaitList, numEventsInWaitList, commandType);
707 if (taskLevelUpdateRequired) {
708 taskLevel++;
709 this->taskLevel = taskLevel;
710 }
711
712 DBG_LOG(EventsDebugEnable, "blockQueue", blockQueueStatus, "virtualEvent", virtualEvent, "taskLevel", taskLevel);
713 }
714
715 template <typename GfxFamily>
isTaskLevelUpdateRequired(const uint32_t & taskLevel,const cl_event * eventWaitList,const cl_uint & numEventsInWaitList,unsigned int commandType)716 bool CommandQueueHw<GfxFamily>::isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType) {
717 bool updateTaskLevel = true;
718 //if we are blocked by user event then no update
719 if (taskLevel == CompletionStamp::notReady) {
720 updateTaskLevel = false;
721 }
722 //if we are executing command without kernel then it will inherit state from
723 //previous commands, barrier is exception
724 if (isCommandWithoutKernel(commandType) && commandType != CL_COMMAND_BARRIER) {
725 updateTaskLevel = false;
726 }
727 //ooq special cases starts here
728 if (this->isOOQEnabled()) {
729 //if no wait list and barrier , do not update task level
730 if (eventWaitList == nullptr && commandType != CL_COMMAND_BARRIER) {
731 updateTaskLevel = false;
732 }
733 //if we have waitlist then deduce task level from waitlist and check if it is higher then current task level of queue
734 if (eventWaitList != nullptr) {
735 auto taskLevelFromEvents = getTaskLevelFromWaitList(0, numEventsInWaitList, eventWaitList);
736 taskLevelFromEvents++;
737 if (taskLevelFromEvents <= this->taskLevel) {
738 updateTaskLevel = false;
739 }
740 }
741 }
742 return updateTaskLevel;
743 }
744
745 template <typename GfxFamily>
746 template <uint32_t commandType>
enqueueNonBlocked(Surface ** surfaces,size_t surfaceCount,LinearStream & commandStream,size_t commandStreamStart,bool & blocking,bool clearDependenciesForSubCapture,const MultiDispatchInfo & multiDispatchInfo,const EnqueueProperties & enqueueProperties,TimestampPacketDependencies & timestampPacketDependencies,EventsRequest & eventsRequest,EventBuilder & eventBuilder,uint32_t taskLevel,PrintfHandler * printfHandler,CommandStreamReceiver * bcsCsr)747 CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
748 Surface **surfaces,
749 size_t surfaceCount,
750 LinearStream &commandStream,
751 size_t commandStreamStart,
752 bool &blocking,
753 bool clearDependenciesForSubCapture,
754 const MultiDispatchInfo &multiDispatchInfo,
755 const EnqueueProperties &enqueueProperties,
756 TimestampPacketDependencies ×tampPacketDependencies,
757 EventsRequest &eventsRequest,
758 EventBuilder &eventBuilder,
759 uint32_t taskLevel,
760 PrintfHandler *printfHandler,
761 CommandStreamReceiver *bcsCsr) {
762
763 UNRECOVERABLE_IF(multiDispatchInfo.empty());
764
765 auto implicitFlush = false;
766
767 if (printfHandler) {
768 blocking = true;
769 printfHandler->makeResident(getGpgpuCommandStreamReceiver());
770 }
771
772 if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
773 device->getDevice().syncBufferHandler->makeResident(getGpgpuCommandStreamReceiver());
774 }
775
776 if (timestampPacketContainer) {
777 timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
778 timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
779 timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
780 }
781
782 bool anyUncacheableArgs = false;
783 auto requiresCoherency = false;
784 for (auto surface : CreateRange(surfaces, surfaceCount)) {
785 surface->makeResident(getGpgpuCommandStreamReceiver());
786 requiresCoherency |= surface->IsCoherent;
787 if (!surface->allowsL3Caching()) {
788 anyUncacheableArgs = true;
789 }
790 }
791
792 auto mediaSamplerRequired = false;
793 uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
794 auto specialPipelineSelectMode = false;
795 Kernel *kernel = nullptr;
796 bool auxTranslationRequired = false;
797 bool useGlobalAtomics = false;
798
799 for (auto &dispatchInfo : multiDispatchInfo) {
800 if (kernel != dispatchInfo.getKernel()) {
801 kernel = dispatchInfo.getKernel();
802 } else {
803 continue;
804 }
805 kernel->makeResident(getGpgpuCommandStreamReceiver());
806 requiresCoherency |= kernel->requiresCoherency();
807 mediaSamplerRequired |= kernel->isVmeKernel();
808 auto numGrfRequiredByKernel = static_cast<uint32_t>(kernel->getKernelInfo().kernelDescriptor.kernelAttributes.numGrfRequired);
809 numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
810 specialPipelineSelectMode |= kernel->requiresSpecialPipelineSelectMode();
811 auxTranslationRequired |= kernel->isAuxTranslationRequired();
812 if (kernel->hasUncacheableStatelessArgs()) {
813 anyUncacheableArgs = true;
814 }
815
816 if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics) {
817 useGlobalAtomics = true;
818 }
819 }
820
821 if (mediaSamplerRequired) {
822 DEBUG_BREAK_IF(device->getDeviceInfo().preemptionSupported != false);
823 }
824
825 if (isProfilingEnabled() && eventBuilder.getEvent()) {
826 eventBuilder.getEvent()->setSubmitTimeStamp();
827
828 auto hwTimestampNode = eventBuilder.getEvent()->getHwTimeStampNode();
829 if (hwTimestampNode) {
830 getGpgpuCommandStreamReceiver().makeResident(*hwTimestampNode->getBaseGraphicsAllocation());
831 }
832
833 if (isPerfCountersEnabled()) {
834 getGpgpuCommandStreamReceiver().makeResident(*eventBuilder.getEvent()->getHwPerfCounterNode()->getBaseGraphicsAllocation());
835 }
836 }
837
838 IndirectHeap *dsh = nullptr;
839 IndirectHeap *ioh = nullptr;
840
841 if (multiDispatchInfo.peekParentKernel()) {
842 DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(this->getContext().getDefaultDeviceQueue());
843 DEBUG_BREAK_IF(pDevQueue == nullptr);
844 dsh = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
845 // In ExecutionModel IOH is the same as DSH to eliminate StateBaseAddress reprogramming for scheduler kernel and blocks.
846 ioh = dsh;
847 implicitFlush = true;
848 } else {
849 dsh = &getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u);
850 ioh = &getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u);
851 }
852
853 auto allocNeedsFlushDC = false;
854 if (!device->isFullRangeSvm()) {
855 if (std::any_of(getGpgpuCommandStreamReceiver().getResidencyAllocations().begin(), getGpgpuCommandStreamReceiver().getResidencyAllocations().end(), [](const auto allocation) { return allocation->isFlushL3Required(); })) {
856 allocNeedsFlushDC = true;
857 }
858 }
859
860 auto memoryCompressionState = getGpgpuCommandStreamReceiver().getMemoryCompressionState(auxTranslationRequired, device->getHardwareInfo());
861
862 DispatchFlags dispatchFlags(
863 {}, //csrDependencies
864 ×tampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
865 {}, //pipelineSelectArgs
866 this->flushStamp->getStampReference(), //flushStampReference
867 getThrottle(), //throttle
868 ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo), //preemptionMode
869 numGrfRequired, //numGrfRequired
870 L3CachingSettings::l3CacheOn, //l3CacheSettings
871 kernel->getThreadArbitrationPolicy(), //threadArbitrationPolicy
872 kernel->getAdditionalKernelExecInfo(), //additionalKernelExecInfo
873 kernel->getExecutionType(), //kernelExecutionType
874 memoryCompressionState, //memoryCompressionState
875 getSliceCount(), //sliceCount
876 blocking, //blocking
877 shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC, //dcFlush
878 multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel(), //useSLM
879 true, //guardCommandBufferWithPipeControl
880 commandType == CL_COMMAND_NDRANGE_KERNEL, //GSBA32BitRequired
881 requiresCoherency, //requiresCoherency
882 (QueuePriority::LOW == priority), //lowPriority
883 implicitFlush, //implicitFlush
884 !eventBuilder.getEvent() || getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
885 false, //epilogueRequired
886 false, //usePerDssBackedBuffer
887 kernel->isSingleSubdevicePreferred(), //useSingleSubdevice
888 useGlobalAtomics, //useGlobalAtomics
889 kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext
890 kernel->requiresMemoryMigration(), //memoryMigrationRequired
891 isTextureCacheFlushNeeded(commandType)); //textureCacheFlush
892
893 dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
894 dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
895
896 if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) {
897 eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
898 dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
899 }
900
901 DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);
902
903 if (anyUncacheableArgs) {
904 dispatchFlags.l3CacheSettings = L3CachingSettings::l3CacheOff;
905 } else if (!kernel->areStatelessWritesUsed()) {
906 dispatchFlags.l3CacheSettings = L3CachingSettings::l3AndL1On;
907 }
908
909 if (this->dispatchHints != 0) {
910 dispatchFlags.engineHints = this->dispatchHints;
911 dispatchFlags.epilogueRequired = true;
912 }
913
914 if (gtpinIsGTPinInitialized()) {
915 gtpinNotifyPreFlushTask(this);
916 }
917
918 if (enqueueProperties.blitPropertiesContainer->size() > 0) {
919 const auto newTaskCount = bcsCsr->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
920 this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
921 dispatchFlags.implicitFlush = true;
922 }
923
924 PRINT_DEBUG_STRING(DebugManager.flags.PrintDebugMessages.get(), stdout, "preemption = %d.\n", static_cast<int>(dispatchFlags.preemptionMode));
925 CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
926 commandStream,
927 commandStreamStart,
928 *dsh,
929 *ioh,
930 getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
931 taskLevel,
932 dispatchFlags,
933 getDevice());
934
935 if (gtpinIsGTPinInitialized()) {
936 gtpinNotifyFlushTask(completionStamp.taskCount);
937 }
938
939 return completionStamp;
940 }
941
942 template <typename GfxFamily>
enqueueBlocked(uint32_t commandType,Surface ** surfaces,size_t surfaceCount,const MultiDispatchInfo & multiDispatchInfo,TimestampPacketDependencies & timestampPacketDependencies,std::unique_ptr<KernelOperation> & blockedCommandsData,const EnqueueProperties & enqueueProperties,EventsRequest & eventsRequest,EventBuilder & externalEventBuilder,std::unique_ptr<PrintfHandler> && printfHandler,CommandStreamReceiver * bcsCsr)943 void CommandQueueHw<GfxFamily>::enqueueBlocked(
944 uint32_t commandType,
945 Surface **surfaces,
946 size_t surfaceCount,
947 const MultiDispatchInfo &multiDispatchInfo,
948 TimestampPacketDependencies ×tampPacketDependencies,
949 std::unique_ptr<KernelOperation> &blockedCommandsData,
950 const EnqueueProperties &enqueueProperties,
951 EventsRequest &eventsRequest,
952 EventBuilder &externalEventBuilder,
953 std::unique_ptr<PrintfHandler> &&printfHandler,
954 CommandStreamReceiver *bcsCsr) {
955
956 TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
957
958 //store previous virtual event as it will add dependecies to new virtual event
959 if (this->virtualEvent) {
960 DBG_LOG(EventsDebugEnable, "enqueueBlocked", "previousVirtualEvent", this->virtualEvent);
961 }
962
963 EventBuilder internalEventBuilder;
964 EventBuilder *eventBuilder;
965 // check if event will be exposed externally
966 if (externalEventBuilder.getEvent()) {
967 externalEventBuilder.getEvent()->incRefInternal();
968 eventBuilder = &externalEventBuilder;
969 DBG_LOG(EventsDebugEnable, "enqueueBlocked", "output event as virtualEvent", virtualEvent);
970 } else {
971 // it will be an internal event
972 internalEventBuilder.create<VirtualEvent>(this, context);
973 eventBuilder = &internalEventBuilder;
974 DBG_LOG(EventsDebugEnable, "enqueueBlocked", "new virtualEvent", eventBuilder->getEvent());
975 }
976 auto outEvent = eventBuilder->getEvent();
977
978 //update queue taskCount
979 taskCount = outEvent->getCompletionStamp();
980
981 std::unique_ptr<Command> command;
982 bool storeTimestampPackets = false;
983
984 if (blockedCommandsData) {
985 if (enqueueProperties.blitPropertiesContainer) {
986 blockedCommandsData->blitPropertiesContainer = *enqueueProperties.blitPropertiesContainer;
987 blockedCommandsData->bcsCsr = bcsCsr;
988 blockedCommandsData->blitEnqueue = true;
989 }
990
991 storeTimestampPackets = (timestampPacketContainer != nullptr);
992 }
993
994 if (enqueueProperties.operation != EnqueueProperties::Operation::GpuKernel) {
995 command = std::make_unique<CommandWithoutKernel>(*this, blockedCommandsData);
996 } else {
997 //store task data in event
998 std::vector<Surface *> allSurfaces;
999 Kernel *kernel = nullptr;
1000 for (auto &dispatchInfo : multiDispatchInfo) {
1001 if (kernel != dispatchInfo.getKernel()) {
1002 kernel = dispatchInfo.getKernel();
1003 } else {
1004 continue;
1005 }
1006 kernel->getResidency(allSurfaces);
1007 }
1008 for (auto &surface : CreateRange(surfaces, surfaceCount)) {
1009 allSurfaces.push_back(surface->duplicate());
1010 }
1011
1012 PreemptionMode preemptionMode = ClPreemptionHelper::taskPreemptionMode(getDevice(), multiDispatchInfo);
1013 bool slmUsed = multiDispatchInfo.usesSlm() || multiDispatchInfo.peekParentKernel();
1014 command = std::make_unique<CommandComputeKernel>(*this,
1015 blockedCommandsData,
1016 allSurfaces,
1017 shouldFlushDC(commandType, printfHandler.get()),
1018 slmUsed,
1019 commandType,
1020 std::move(printfHandler),
1021 preemptionMode,
1022 multiDispatchInfo.peekMainKernel(),
1023 (uint32_t)multiDispatchInfo.size());
1024 }
1025 if (storeTimestampPackets) {
1026 command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
1027 command->setEventsRequest(eventsRequest);
1028 } else if (this->context->getRootDeviceIndices().size() > 1) {
1029 command->setEventsRequest(eventsRequest);
1030 }
1031
1032 outEvent->setCommand(std::move(command));
1033
1034 eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));
1035 eventBuilder->addParentEvent(this->virtualEvent);
1036 eventBuilder->finalize();
1037
1038 if (this->virtualEvent) {
1039 this->virtualEvent->decRefInternal();
1040 }
1041
1042 this->virtualEvent = outEvent;
1043 }
1044
1045 template <typename GfxFamily>
enqueueCommandWithoutKernel(Surface ** surfaces,size_t surfaceCount,LinearStream * commandStream,size_t commandStreamStart,bool & blocking,const EnqueueProperties & enqueueProperties,TimestampPacketDependencies & timestampPacketDependencies,EventsRequest & eventsRequest,EventBuilder & eventBuilder,uint32_t taskLevel,CsrDependencies & csrDeps,CommandStreamReceiver * bcsCsr)1046 CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
1047 Surface **surfaces,
1048 size_t surfaceCount,
1049 LinearStream *commandStream,
1050 size_t commandStreamStart,
1051 bool &blocking,
1052 const EnqueueProperties &enqueueProperties,
1053 TimestampPacketDependencies ×tampPacketDependencies,
1054 EventsRequest &eventsRequest,
1055 EventBuilder &eventBuilder,
1056 uint32_t taskLevel,
1057 CsrDependencies &csrDeps,
1058 CommandStreamReceiver *bcsCsr) {
1059
1060 CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
1061 bool flushGpgpuCsr = true;
1062
1063 if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired(false)) {
1064 flushGpgpuCsr = false;
1065 } else {
1066 csrDeps.makeResident(getGpgpuCommandStreamReceiver());
1067 }
1068
1069 if (eventBuilder.getEvent() && isProfilingEnabled()) {
1070 eventBuilder.getEvent()->setSubmitTimeStamp();
1071 eventBuilder.getEvent()->setStartTimeStamp();
1072 }
1073
1074 if (flushGpgpuCsr) {
1075 if (timestampPacketContainer) {
1076 timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
1077 timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
1078 timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
1079 }
1080
1081 for (auto surface : CreateRange(surfaces, surfaceCount)) {
1082 surface->makeResident(getGpgpuCommandStreamReceiver());
1083 }
1084
1085 auto rootDeviceIndex = getDevice().getRootDeviceIndex();
1086 DispatchFlags dispatchFlags(
1087 {}, //csrDependencies
1088 ×tampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
1089 {}, //pipelineSelectArgs
1090 flushStamp->getStampReference(), //flushStampReference
1091 getThrottle(), //throttle
1092 device->getPreemptionMode(), //preemptionMode
1093 GrfConfig::NotApplicable, //numGrfRequired
1094 L3CachingSettings::NotApplicable, //l3CacheSettings
1095 ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
1096 AdditionalKernelExecInfo::NotApplicable, //additionalKernelExecInfo
1097 KernelExecutionType::NotApplicable, //kernelExecutionType
1098 MemoryCompressionState::NotApplicable, //memoryCompressionState
1099 getSliceCount(), //sliceCount
1100 blocking, //blocking
1101 false, //dcFlush
1102 false, //useSLM
1103 true, //guardCommandBufferWithPipeControl
1104 false, //GSBA32BitRequired
1105 false, //requiresCoherency
1106 false, //lowPriority
1107 (enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
1108 getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
1109 false, //epilogueRequired
1110 false, //usePerDssBackedBuffer
1111 false, //useSingleSubdevice
1112 false, //useGlobalAtomics
1113 context->containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
1114 false, //memoryMigrationRequired
1115 false); //textureCacheFlush
1116
1117 if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
1118 eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
1119 dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
1120 }
1121
1122 completionStamp = getGpgpuCommandStreamReceiver().flushTask(
1123 *commandStream,
1124 commandStreamStart,
1125 getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
1126 getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
1127 getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
1128 taskLevel,
1129 dispatchFlags,
1130 getDevice());
1131 }
1132
1133 if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
1134 UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
1135 const auto newTaskCount = bcsCsr->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
1136 this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
1137 }
1138
1139 return completionStamp;
1140 }
1141
1142 template <typename GfxFamily>
computeOffsetsValueForRectCommands(size_t * bufferOffset,size_t * hostOffset,const size_t * bufferOrigin,const size_t * hostOrigin,const size_t * region,size_t bufferRowPitch,size_t bufferSlicePitch,size_t hostRowPitch,size_t hostSlicePitch)1143 void CommandQueueHw<GfxFamily>::computeOffsetsValueForRectCommands(size_t *bufferOffset,
1144 size_t *hostOffset,
1145 const size_t *bufferOrigin,
1146 const size_t *hostOrigin,
1147 const size_t *region,
1148 size_t bufferRowPitch,
1149 size_t bufferSlicePitch,
1150 size_t hostRowPitch,
1151 size_t hostSlicePitch) {
1152 size_t computedBufferRowPitch = bufferRowPitch ? bufferRowPitch : region[0];
1153 size_t computedBufferSlicePitch = bufferSlicePitch ? bufferSlicePitch : region[1] * computedBufferRowPitch;
1154 size_t computedHostRowPitch = hostRowPitch ? hostRowPitch : region[0];
1155 size_t computedHostSlicePitch = hostSlicePitch ? hostSlicePitch : region[1] * computedHostRowPitch;
1156 *bufferOffset = bufferOrigin[2] * computedBufferSlicePitch + bufferOrigin[1] * computedBufferRowPitch + bufferOrigin[0];
1157 *hostOffset = hostOrigin[2] * computedHostSlicePitch + hostOrigin[1] * computedHostRowPitch + hostOrigin[0];
1158 }
1159
1160 template <typename GfxFamily>
calculateHostPtrSizeForImage(const size_t * region,size_t rowPitch,size_t slicePitch,Image * image)1161 size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image) {
1162 auto bytesPerPixel = image->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
1163 auto dstRowPitch = rowPitch ? rowPitch : region[0] * bytesPerPixel;
1164 auto dstSlicePitch = slicePitch ? slicePitch : ((image->getImageDesc().image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ? 1 : region[1]) * dstRowPitch);
1165
1166 return Image::calculateHostPtrSize(region, dstRowPitch, dstSlicePitch, bytesPerPixel, image->getImageDesc().image_type);
1167 }
1168
1169 template <typename GfxFamily>
1170 template <uint32_t cmdType>
enqueueBlit(const MultiDispatchInfo & multiDispatchInfo,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,bool blocking,CommandStreamReceiver & bcsCsr)1171 void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
1172 auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
1173
1174 EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
1175 EventBuilder eventBuilder;
1176
1177 setupEvent(eventBuilder, eventsRequest.outEvent, cmdType);
1178 eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
1179
1180 std::unique_ptr<KernelOperation> blockedCommandsData;
1181 TakeOwnershipWrapper<CommandQueueHw<GfxFamily>> queueOwnership(*this);
1182
1183 auto blockQueue = false;
1184 auto taskLevel = 0u;
1185 obtainTaskLevelAndBlockedStatus(taskLevel, eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, blockQueue, cmdType);
1186 auto clearAllDependencies = queueDependenciesClearRequired();
1187
1188 enqueueHandlerHook(cmdType, multiDispatchInfo);
1189 aubCaptureHook(blocking, clearAllDependencies, multiDispatchInfo);
1190
1191 if (DebugManager.flags.MakeEachEnqueueBlocking.get()) {
1192 blocking = true;
1193 }
1194
1195 TimestampPacketDependencies timestampPacketDependencies;
1196 BlitPropertiesContainer blitPropertiesContainer;
1197 CsrDependencies csrDeps;
1198
1199 eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
1200 auto allocator = bcsCsr.getTimestampPacketAllocator();
1201
1202 if (isCacheFlushForBcsRequired() && isGpgpuSubmissionForBcsRequired(blockQueue)) {
1203 timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
1204 }
1205
1206 if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) {
1207 timestampPacketDependencies.barrierNodes.add(allocator->getTag());
1208 }
1209
1210 obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
1211 csrDeps.timestampPacketContainer.push_back(×tampPacketDependencies.previousEnqueueNodes);
1212
1213 LinearStream *gpgpuCommandStream = {};
1214 size_t gpgpuCommandStreamStart = {};
1215 if (isGpgpuSubmissionForBcsRequired(blockQueue)) {
1216 gpgpuCommandStream = obtainCommandStream<cmdType>(csrDeps, true, blockQueue, multiDispatchInfo, eventsRequest, blockedCommandsData, nullptr, 0, false);
1217 gpgpuCommandStreamStart = gpgpuCommandStream->getUsed();
1218 }
1219
1220 if (eventBuilder.getEvent()) {
1221 eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
1222 }
1223
1224 blitPropertiesContainer.push_back(processDispatchForBlitEnqueue(bcsCsr, multiDispatchInfo, timestampPacketDependencies,
1225 eventsRequest, gpgpuCommandStream, cmdType, blockQueue));
1226
1227 CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0};
1228
1229 const EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer);
1230
1231 if (!blockQueue) {
1232 completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
1233 enqueueProperties, timestampPacketDependencies, eventsRequest,
1234 eventBuilder, taskLevel, csrDeps, &bcsCsr);
1235
1236 if (eventBuilder.getEvent()) {
1237 eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
1238 }
1239
1240 this->latestSentEnqueueType = enqueueProperties.operation;
1241 }
1242 updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
1243
1244 if (blockQueue) {
1245 enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr);
1246 }
1247
1248 timestampPacketDependencies.moveNodesToNewContainer(*deferredTimestampPackets);
1249
1250 queueOwnership.unlock();
1251 commandStreamReceiverOwnership.unlock();
1252
1253 if (blocking) {
1254 waitForAllEngines(blockQueue, nullptr);
1255 }
1256 }
1257
1258 template <typename GfxFamily>
1259 template <uint32_t cmdType, size_t surfaceCount>
dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo & dispatchInfo,Surface * (& surfaces)[surfaceCount],EBuiltInOps::Type builtInOperation,cl_uint numEventsInWaitList,const cl_event * eventWaitList,cl_event * event,bool blocking,CommandStreamReceiver & csr)1260 void CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr) {
1261 const bool blit = EngineHelpers::isBcs(csr.getOsContext().getEngineType());
1262
1263 if (blit) {
1264 enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
1265 } else {
1266 auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInOperation,
1267 this->getClDevice());
1268 BuiltInOwnershipWrapper builtInLock(builder, this->context);
1269
1270 builder.buildDispatchInfos(dispatchInfo);
1271
1272 enqueueHandler<cmdType>(
1273 surfaces,
1274 blocking,
1275 dispatchInfo,
1276 numEventsInWaitList,
1277 eventWaitList,
1278 event);
1279 }
1280 }
1281
1282 template <typename GfxFamily>
isBlitAuxTranslationRequired(const MultiDispatchInfo & multiDispatchInfo)1283 bool CommandQueueHw<GfxFamily>::isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo) {
1284 return multiDispatchInfo.getKernelObjsForAuxTranslation() &&
1285 (multiDispatchInfo.getKernelObjsForAuxTranslation()->size() > 0) &&
1286 (HwHelperHw<GfxFamily>::get().getAuxTranslationMode(device->getHardwareInfo()) == AuxTranslationMode::Blit);
1287 }
1288
1289 } // namespace NEO
1290