/*
 * Copyright (C) 2020-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/built_ins/built_ins.h"
#include "shared/source/built_ins/sip.h"
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/thread_arbitration_policy.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/residency_container.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "shared/source/unified_memory/unified_memory.h"
#include "shared/source/utilities/software_tags_manager.h"

#include "level_zero/core/source/cmdlist/cmdlist.h"
#include "level_zero/core/source/cmdlist/cmdlist_hw.h"
#include "level_zero/core/source/cmdqueue/cmdqueue_hw.h"
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/driver/driver_handle_imp.h"
#include "level_zero/core/source/fence/fence.h"
#include "level_zero/tools/source/metrics/metric.h"

#include <limits>
#include <thread>

namespace L0 {

template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandQueueHw<gfxCoreFamily>::createFence(const ze_fence_desc_t *desc,
                                                       ze_fence_handle_t *phFence) {
    *phFence = Fence::create(this, desc);
    return ZE_RESULT_SUCCESS;
}

template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandQueueHw<gfxCoreFamily>::destroy() {
    if (commandStream) {
        delete commandStream;
        commandStream = nullptr;
    }
    buffers.destroy(this->getDevice());
    delete this;
    return ZE_RESULT_SUCCESS;
}

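// Submits numCommandLists command lists to the hardware in a single batch buffer.
// The flow below: validate the lists against the queue, estimate the space the
// submission needs (preemption, debugger, fence and tag writes), make all
// allocations resident, program preamble state (pipeline select, GSBA, SIP,
// preemption), chain the per-list command buffers with MI_BATCH_BUFFER_START,
// signal the optional fence, write the task count, and hand the batch buffer
// to the command stream receiver (CSR).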
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
    uint32_t numCommandLists,
    ze_command_list_handle_t *phCommandLists,
    ze_fence_handle_t hFence,
    bool performMigration) {

    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
    using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;

    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

    auto lockCSR = csr->obtainUniqueOwnership();

    auto anyCommandListWithCooperativeKernels = false;
    auto anyCommandListWithoutCooperativeKernels = false;

    bool cachedMOCSAllowed = true;

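    // First pass over the command lists: reject lists whose type (copy vs.
    // compute) does not match the queue or whose partition count exceeds the
    // active sub-devices, and record cooperative-kernel and MOCS requirements.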
    for (auto i = 0u; i < numCommandLists; i++) {
        auto commandList = CommandList::fromHandle(phCommandLists[i]);
        if (peekIsCopyOnlyCommandQueue() != commandList->isCopyOnly()) {
            return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE;
        }

        if (this->activeSubDevices < commandList->partitionCount) {
            return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE;
        }

        if (commandList->containsCooperativeKernels()) {
            anyCommandListWithCooperativeKernels = true;
        } else {
            anyCommandListWithoutCooperativeKernels = true;
        }

        // If any command list contains commands that require uncached MOCS,
        // the whole submission must be programmed with uncached MOCS.
        if (commandList->requiresQueueUncachedMocs) {
            cachedMOCSAllowed = false;
        }
    }

    bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get();
    if (anyCommandListWithCooperativeKernels && anyCommandListWithoutCooperativeKernels &&
        (!isMixingRegularAndCooperativeKernelsAllowed)) {
        return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE;
    }

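    // Estimate the extra stream space and residency slots needed for preemption
    // mode changes, SIP/debugger programming and post-sync writes.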
    size_t spaceForResidency = 0;
    size_t preemptionSize = 0;
    size_t debuggerCmdsSize = 0;
    constexpr size_t residencyContainerSpaceForPreemption = 2;
    constexpr size_t residencyContainerSpaceForFence = 1;
    constexpr size_t residencyContainerSpaceForTagWrite = 1;

    NEO::Device *neoDevice = device->getNEODevice();
    auto devicePreemption = device->getDevicePreemptionMode();
    const bool initialPreemptionMode = commandQueuePreemptionMode == NEO::PreemptionMode::Initial;
    NEO::PreemptionMode cmdQueuePreemption = commandQueuePreemptionMode;
    if (initialPreemptionMode) {
        cmdQueuePreemption = devicePreemption;
    }
    NEO::PreemptionMode statePreemption = cmdQueuePreemption;

    const bool stateSipRequired = (initialPreemptionMode && devicePreemption == NEO::PreemptionMode::MidThread) ||
                                  (neoDevice->getDebugger() && NEO::Debugger::isDebugEnabled(internalUsage));

    if (initialPreemptionMode) {
        preemptionSize += NEO::PreemptionHelper::getRequiredPreambleSize<GfxFamily>(*neoDevice);
    }

    if (stateSipRequired) {
        preemptionSize += NEO::PreemptionHelper::getRequiredStateSipCmdSize<GfxFamily>(*neoDevice, csr->isRcs());
    }

    preemptionSize += NEO::PreemptionHelper::getRequiredCmdStreamSize<GfxFamily>(devicePreemption, commandQueuePreemptionMode);

    if (NEO::Debugger::isDebugEnabled(internalUsage) && !commandQueueDebugCmdsProgrammed) {
        debuggerCmdsSize += NEO::PreambleHelper<GfxFamily>::getKernelDebuggingCommandsSize(neoDevice->getSourceLevelDebugger() != nullptr);
    }

    if (devicePreemption == NEO::PreemptionMode::MidThread) {
        spaceForResidency += residencyContainerSpaceForPreemption;
    }

    bool directSubmissionEnabled = isCopyOnlyCommandQueue ? csr->isBlitterDirectSubmissionEnabled() : csr->isDirectSubmissionEnabled();
    bool programActivePartitionConfig = csr->isProgramActivePartitionConfigRequired();

    L0::Fence *fence = nullptr;

    device->activateMetricGroups();

    size_t totalCmdBuffers = 0;
    uint32_t perThreadScratchSpaceSize = 0;
    uint32_t perThreadPrivateScratchSize = 0;
    NEO::PageFaultManager *pageFaultManager = nullptr;
    if (performMigration) {
        pageFaultManager = device->getDriverHandle()->getMemoryManager()->getPageFaultManager();
        if (pageFaultManager == nullptr) {
            performMigration = false;
        }
    }
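
    // Second pass: add indirect and per-list allocations to the residency
    // container, accumulate command-buffer counts, preemption-mode transitions
    // and scratch requirements, then make each list resident (migrating shared
    // allocations when requested).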
    for (auto i = 0u; i < numCommandLists; i++) {
        auto commandList = CommandList::fromHandle(phCommandLists[i]);

        bool indirectAllocationsAllowed = commandList->hasIndirectAllocationsAllowed();
        if (indirectAllocationsAllowed) {
            UnifiedMemoryControls unifiedMemoryControls = commandList->getUnifiedMemoryControls();

            auto svmAllocsManager = device->getDriverHandle()->getSvmAllocsManager();
            svmAllocsManager->addInternalAllocationsToResidencyContainer(neoDevice->getRootDeviceIndex(),
                                                                         commandList->commandContainer.getResidencyContainer(),
                                                                         unifiedMemoryControls.generateMask());
        }

        totalCmdBuffers += commandList->commandContainer.getCmdBufferAllocations().size();
        spaceForResidency += commandList->commandContainer.getResidencyContainer().size();
        auto commandListPreemption = commandList->getCommandListPreemptionMode();
        if (statePreemption != commandListPreemption) {
            if (preemptionCmdSyncProgramming) {
                preemptionSize += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
            }
            preemptionSize += NEO::PreemptionHelper::getRequiredCmdStreamSize<GfxFamily>(commandListPreemption, statePreemption);
            statePreemption = commandListPreemption;
        }

        perThreadScratchSpaceSize = std::max(perThreadScratchSpaceSize, commandList->getCommandListPerThreadScratchSize());

        perThreadPrivateScratchSize = std::max(perThreadPrivateScratchSize, commandList->getCommandListPerThreadPrivateScratchSize());

        if (commandList->getCommandListPerThreadScratchSize() != 0 || commandList->getCommandListPerThreadPrivateScratchSize() != 0) {
            if (commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE) != nullptr) {
                heapContainer.push_back(commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE)->getGraphicsAllocation());
            }
            for (auto element : commandList->commandContainer.sshAllocations) {
                heapContainer.push_back(element);
            }
        }

        partitionCount = std::max(partitionCount, commandList->partitionCount);
        commandList->csr = csr;
        commandList->makeResidentAndMigrate(performMigration);
    }

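    // Space needed in the queue's own stream: one MI_BATCH_BUFFER_START per
    // chained command buffer plus the preamble/epilogue commands counted below.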
    size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof(MI_BATCH_BUFFER_START);
    linearStreamSizeEstimate += csr->getCmdsSizeForHardwareContext();

    if (directSubmissionEnabled) {
        linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_START);
    } else {
        linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_END);
    }

    auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(csr);
    if (programActivePartitionConfig) {
        linearStreamSizeEstimate += csrHw->getCmdSizeForActivePartitionConfig();
    }

    const auto &hwInfo = this->device->getHwInfo();
    if (hFence) {
        fence = Fence::fromHandle(hFence);
        spaceForResidency += residencyContainerSpaceForFence;
        linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
    }

    spaceForResidency += residencyContainerSpaceForTagWrite;

    csr->getResidencyAllocations().reserve(spaceForResidency);

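    // Set up scratch space; this can dirty the GSBA and front-end (VFE) state,
    // which then has to be reprogrammed in the preamble.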
    auto scratchSpaceController = csr->getScratchSpaceController();
    bool gsbaStateDirty = false;
    bool frontEndStateDirty = false;
    handleScratchSpace(heapContainer,
                       scratchSpaceController,
                       gsbaStateDirty, frontEndStateDirty,
                       perThreadScratchSpaceSize, perThreadPrivateScratchSize);

    auto &streamProperties = csr->getStreamProperties();
    const auto &hwInfoConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily);
    auto disableOverdispatch = hwInfoConfig.isDisableOverdispatchAvailable(hwInfo);
    auto isEngineInstanced = csr->getOsContext().isEngineInstanced();
    bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get();
    if (!isPatchingVfeStateAllowed) {
        streamProperties.frontEndState.setProperties(anyCommandListWithCooperativeKernels, disableOverdispatch,
                                                     isEngineInstanced, hwInfo);
    } else {
        streamProperties.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced);
    }
    frontEndStateDirty |= streamProperties.frontEndState.isDirty();

    gsbaStateDirty |= csr->getGSBAStateDirty();
    frontEndStateDirty |= csr->getMediaVFEStateDirty();
    if (!isCopyOnlyCommandQueue) {

        if (!gpgpuEnabled) {
            linearStreamSizeEstimate += estimatePipelineSelect();
        }

        linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, numCommandLists, phCommandLists);

        if (gsbaStateDirty) {
            linearStreamSizeEstimate += estimateStateBaseAddressCmdSize();
        }

        linearStreamSizeEstimate += preemptionSize + debuggerCmdsSize;
    }

    if (NEO::DebugManager.flags.EnableSWTags.get()) {
        linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags<GfxFamily>();
    }

    linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);

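    // Carve the estimated space out of the queue's command stream, aligned to
    // the minimum command-buffer alignment; the remainder is zero-padded below.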
    size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
    size_t padding = alignedSize - linearStreamSizeEstimate;
    reserveLinearStreamSize(alignedSize);
    NEO::LinearStream child(commandStream->getSpace(alignedSize), alignedSize);

    const auto globalFenceAllocation = csr->getGlobalFenceAllocation();
    if (globalFenceAllocation) {
        csr->makeResident(*globalFenceAllocation);
    }

    const auto workPartitionAllocation = csr->getWorkPartitionAllocation();
    if (workPartitionAllocation) {
        csr->makeResident(*workPartitionAllocation);
    }

    if (NEO::DebugManager.flags.EnableSWTags.get()) {
        NEO::SWTagsManager *tagsManager = neoDevice->getRootDeviceEnvironment().tagsManager.get();
        UNRECOVERABLE_IF(tagsManager == nullptr);
        csr->makeResident(*tagsManager->getBXMLHeapAllocation());
        csr->makeResident(*tagsManager->getSWTagHeapAllocation());
        tagsManager->insertBXMLHeapAddress<GfxFamily>(child);
        tagsManager->insertSWTagHeapAddress<GfxFamily>(child);
    }

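    // Emit any context-specific commands the CSR requires; with an attached L0
    // debugger, also keep its SBA tracking buffer resident.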
    csr->programHardwareContext(child);

    if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getL0Debugger()) {
        csr->makeResident(*device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId()));
    }

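    // Preamble for compute queues: pipeline select, kernel-debugging commands,
    // state base address, and CSR base address / SIP for mid-thread preemption.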
    if (!isCopyOnlyCommandQueue) {
        if (!gpgpuEnabled) {
            programPipelineSelect(child);
        }

        if (NEO::Debugger::isDebugEnabled(internalUsage) && !commandQueueDebugCmdsProgrammed && neoDevice->getSourceLevelDebugger()) {
            NEO::PreambleHelper<GfxFamily>::programKernelDebugging(&child);
            commandQueueDebugCmdsProgrammed = true;
        }

        if (gsbaStateDirty) {
            auto indirectHeap = CommandList::fromHandle(phCommandLists[0])->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
            programStateBaseAddress(scratchSpaceController->calculateNewGSH(), indirectHeap->getGraphicsAllocation()->isAllocatedInLocalMemoryPool(), child, cachedMOCSAllowed);
        }

        if (initialPreemptionMode) {
            NEO::PreemptionHelper::programCsrBaseAddress<GfxFamily>(child, *neoDevice, csr->getPreemptionAllocation());
        }

        if (stateSipRequired) {
            NEO::PreemptionHelper::programStateSip<GfxFamily>(child, *neoDevice);
        }

        if (cmdQueuePreemption != commandQueuePreemptionMode) {
            NEO::PreemptionHelper::programCmdStream<GfxFamily>(child,
                                                               cmdQueuePreemption,
                                                               commandQueuePreemptionMode,
                                                               csr->getPreemptionAllocation());
        }

        statePreemption = cmdQueuePreemption;

        const bool sipKernelUsed = devicePreemption == NEO::PreemptionMode::MidThread ||
                                   (neoDevice->getDebugger() != nullptr && NEO::Debugger::isDebugEnabled(internalUsage));

        if (devicePreemption == NEO::PreemptionMode::MidThread) {
            csr->makeResident(*csr->getPreemptionAllocation());
        }

        if (sipKernelUsed) {
            auto sipIsa = NEO::SipKernel::getSipKernel(*neoDevice).getSipAllocation();
            csr->makeResident(*sipIsa);
        }

        if (NEO::Debugger::isDebugEnabled(internalUsage) && neoDevice->getDebugger()) {
            UNRECOVERABLE_IF(device->getDebugSurface() == nullptr);
            csr->makeResident(*device->getDebugSurface());
        }
    }

    if (programActivePartitionConfig) {
        csrHw->programActivePartitionConfig(child);
    }

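    // Third pass: program per-list preemption transitions and front-end (VFE)
    // state, then chain into each list's command buffers via
    // MI_BATCH_BUFFER_START.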
    for (auto i = 0u; i < numCommandLists; ++i) {
        auto commandList = CommandList::fromHandle(phCommandLists[i]);
        auto cmdBufferAllocations = commandList->commandContainer.getCmdBufferAllocations();
        auto cmdBufferCount = cmdBufferAllocations.size();

        auto commandListPreemption = commandList->getCommandListPreemptionMode();
        if (statePreemption != commandListPreemption) {
            if (NEO::DebugManager.flags.EnableSWTags.get()) {
                neoDevice->getRootDeviceEnvironment().tagsManager->insertTag<GfxFamily, NEO::SWTags::PipeControlReasonTag>(
                    child,
                    *neoDevice,
                    "CommandList Preemption Mode update", 0u);
            }

            if (preemptionCmdSyncProgramming) {
                NEO::PipeControlArgs args;
                NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(child, args);
            }
            NEO::PreemptionHelper::programCmdStream<GfxFamily>(child,
                                                               commandListPreemption,
                                                               statePreemption,
                                                               csr->getPreemptionAllocation());
            statePreemption = commandListPreemption;
        }

        if (!isCopyOnlyCommandQueue) {
            bool programVfe = frontEndStateDirty;
            if (isPatchingVfeStateAllowed) {
                auto &requiredStreamState = commandList->getRequiredStreamState();
                streamProperties.frontEndState.setProperties(requiredStreamState.frontEndState);
                programVfe |= streamProperties.frontEndState.isDirty();
            }

            if (programVfe) {
                programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child);
                frontEndStateDirty = false;
            }

            if (isPatchingVfeStateAllowed) {
                auto &finalStreamState = commandList->getFinalStreamState();
                streamProperties.frontEndState.setProperties(finalStreamState.frontEndState);
            }
        }

        patchCommands(*commandList, scratchSpaceController->getScratchPatchAddress());

        for (size_t iter = 0; iter < cmdBufferCount; iter++) {
            auto allocation = cmdBufferAllocations[iter];
            NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&child, allocation->getGpuAddress(), true);
        }

        printfFunctionContainer.insert(printfFunctionContainer.end(),
                                       commandList->getPrintfFunctionContainer().begin(),
                                       commandList->getPrintfFunctionContainer().end());
    }

    if (performMigration) {
        auto commandList = CommandList::fromHandle(phCommandLists[0]);
        commandList->migrateSharedAllocations();
    }

    if (stateSipRequired) {
        NEO::PreemptionHelper::programStateSipEndWa<GfxFamily>(child, *neoDevice);
    }

    commandQueuePreemptionMode = statePreemption;

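    // Signal the fence, if one was passed in: an MI_FLUSH_DW post-sync write on
    // copy queues, a PIPE_CONTROL post-sync write otherwise.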
    if (hFence) {
        csr->makeResident(fence->getAllocation());
        if (isCopyOnlyCommandQueue) {
            NEO::MiFlushArgs args;
            args.commandWithPostSync = true;
            NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(child, fence->getGpuAddress(), Fence::STATE_SIGNALED, args, hwInfo);
        } else {
            NEO::PipeControlArgs args;
            args.dcFlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed(true, hwInfo);
            if (partitionCount > 1) {
                args.workloadPartitionOffset = true;
            }
            fence->setPartitionCount(partitionCount);
            NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
                child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
                fence->getGpuAddress(),
                Fence::STATE_SIGNALED,
                hwInfo,
                args);
        }
    }

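    // Write the updated task count to the CSR's tag allocation (skipped inside
    // dispatchTaskCountWrite() when the CSR updates the tag from wait).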
    dispatchTaskCountWrite(child, true);

    csr->makeResident(*csr->getTagAllocation());
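
    // Terminate the batch buffer. With direct submission the trailing
    // BB_START's location is handed to submitBatchBuffer() as endingCmd,
    // presumably so the submission path can patch it; otherwise emit BB_END.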
    void *endingCmd = nullptr;
    if (directSubmissionEnabled) {
        endingCmd = child.getSpace(0);
        NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&child, 0ull, false);
    } else {
        auto buffer = child.getSpaceForCmd<MI_BATCH_BUFFER_END>();
        *buffer = GfxFamily::cmdInitBatchBufferEnd;
    }

    if (padding) {
        void *paddingPtr = child.getSpace(padding);
        memset(paddingPtr, 0, padding);
    }

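    // Submit to the CSR; for synchronous queues, block until completion.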
    auto ret = submitBatchBuffer(ptrDiff(child.getCpuBase(), commandStream->getCpuBase()), csr->getResidencyAllocations(), endingCmd,
                                 anyCommandListWithCooperativeKernels);

    this->taskCount = csr->peekTaskCount();

    csr->makeSurfacePackNonResident(csr->getResidencyAllocations());

    if (getSynchronousMode() == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS) {
        this->synchronize(std::numeric_limits<uint64_t>::max());
    }

    this->heapContainer.clear();

    csr->pollForCompletion();
    if (ret) {
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    return ZE_RESULT_SUCCESS;
}

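// Programs the front end (MEDIA_VFE_STATE) with the current scratch address and
// per-thread scratch size, then clears the CSR's media-VFE dirty flag.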
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    UNRECOVERABLE_IF(csr == nullptr);
    auto &hwInfo = device->getHwInfo();
    auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    auto engineGroupType = hwHelper.getEngineGroupType(csr->getOsContext().getEngineType(),
                                                       csr->getOsContext().getEngineUsage(), hwInfo);
    auto pVfeState = NEO::PreambleHelper<GfxFamily>::getSpaceForVfeState(&commandStream, hwInfo, engineGroupType);
    NEO::PreambleHelper<GfxFamily>::programVfeState(pVfeState,
                                                    hwInfo,
                                                    perThreadScratchSpaceSize,
                                                    scratchAddress,
                                                    device->getMaxNumHwThreads(),
                                                    csr->getStreamProperties());
    csr->setMediaVFEStateDirty(false);
}

template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSize() {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    return NEO::PreambleHelper<GfxFamily>::getVFECommandsSize();
}

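// When patching VFE state in command lists is allowed, replay each list's
// required and final front-end properties on a copy of the CSR stream
// properties to count how many VFE programming commands the execute path will
// actually emit.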
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandLists(
    bool isFrontEndStateDirty, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists) {

    auto singleFrontEndCmdSize = estimateFrontEndCmdSize();
    bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get();
    if (!isPatchingVfeStateAllowed) {
        return isFrontEndStateDirty ? singleFrontEndCmdSize : 0;
    }

    auto streamPropertiesCopy = csr->getStreamProperties();
    size_t estimatedSize = 0;

    for (auto i = 0u; i < numCommandLists; i++) {
        auto commandList = CommandList::fromHandle(phCommandLists[i]);
        auto &requiredStreamState = commandList->getRequiredStreamState();
        streamPropertiesCopy.frontEndState.setProperties(requiredStreamState.frontEndState);

        if (isFrontEndStateDirty || streamPropertiesCopy.frontEndState.isDirty()) {
            estimatedSize += singleFrontEndCmdSize;
            isFrontEndStateDirty = false;
        }
        auto &finalStreamState = commandList->getFinalStreamState();
        streamPropertiesCopy.frontEndState.setProperties(finalStreamState.frontEndState);
    }

    return estimatedSize;
}

template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelect() {

    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    return NEO::PreambleHelper<GfxFamily>::getCmdSizeForPipelineSelect(device->getHwInfo());
}

template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::programPipelineSelect(NEO::LinearStream &commandStream) {
    NEO::PipelineSelectArgs args = {0, 0};
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    NEO::PreambleHelper<GfxFamily>::programPipelineSelect(&commandStream, args, device->getHwInfo());
    gpgpuEnabled = true;
}

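// Writes the post-submission task count: an MI_FLUSH_DW post-sync write on copy
// queues, a PIPE_CONTROL post-sync write otherwise. Skipped entirely when the
// CSR updates the tag from a wait instead.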
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::dispatchTaskCountWrite(NEO::LinearStream &commandStream, bool flushDataCache) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;

    UNRECOVERABLE_IF(csr == nullptr);

    if (csr->isUpdateTagFromWaitEnabled()) {
        return;
    }

    auto taskCountToWrite = csr->peekTaskCount() + 1;
    auto gpuAddress = static_cast<uint64_t>(csr->getTagAllocation()->getGpuAddress());

    const auto &hwInfo = this->device->getHwInfo();
    if (isCopyOnlyCommandQueue) {
        NEO::MiFlushArgs args;
        args.commandWithPostSync = true;
        args.notifyEnable = csr->isUsedNotifyEnableForPostSync();
        NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, gpuAddress, taskCountToWrite, args, hwInfo);
    } else {
        NEO::PipeControlArgs args;
        args.dcFlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed(true, hwInfo);
        if (partitionCount > 1) {
            args.workloadPartitionOffset = true;
        }
        args.notifyEnable = csr->isUsedNotifyEnableForPostSync();
        NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
            commandStream,
            POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
            gpuAddress,
            taskCountToWrite,
            hwInfo,
            args);
    }
}

template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandQueueHw<gfxCoreFamily>::getPreemptionCmdProgramming() {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    return NEO::PreemptionHelper::getRequiredCmdStreamSize<GfxFamily>(NEO::PreemptionMode::MidThread, NEO::PreemptionMode::Initial) > 0u;
}

} // namespace L0