1 /*
2  * Copyright (C) 2018-2021 Intel Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  */
7 
8 #pragma once
9 #include "shared/source/command_stream/aub_subcapture_status.h"
10 #include "shared/source/command_stream/csr_definitions.h"
11 #include "shared/source/command_stream/csr_properties_flags.h"
12 #include "shared/source/command_stream/linear_stream.h"
13 #include "shared/source/command_stream/stream_properties.h"
14 #include "shared/source/command_stream/submissions_aggregator.h"
15 #include "shared/source/command_stream/thread_arbitration_policy.h"
16 #include "shared/source/helpers/aligned_memory.h"
17 #include "shared/source/helpers/blit_commands_helper.h"
18 #include "shared/source/helpers/common_types.h"
19 #include "shared/source/helpers/completion_stamp.h"
20 #include "shared/source/helpers/flat_batch_buffer_helper.h"
21 #include "shared/source/helpers/options.h"
22 #include "shared/source/helpers/pipe_control_args.h"
23 #include "shared/source/indirect_heap/indirect_heap.h"
24 #include "shared/source/kernel/grf_config.h"
25 #include "shared/source/os_interface/os_thread.h"
26 #include "shared/source/utilities/spinlock.h"
27 
28 #include <cstddef>
29 #include <cstdint>
30 
31 namespace NEO {
// Forward declarations of NEO types named by the declarations below, kept in
// alphabetical order.
class AllocationsList;
class Device;
class ExecutionEnvironment;
class ExperimentalCommandBuffer;
class GmmPageTableMngr;
class GraphicsAllocation;
class HostPtrSurface;
class HwPerfCounter;
class HwTimeStamps;
class IndirectHeap;
class InternalAllocationStorage;
class LinearStream;
class MemoryManager;
class MultiGraphicsAllocation;
class OSInterface;
class OsContext;
class ScratchSpaceController;
class TagAllocatorBase;

// Forward declarations of class templates.
template <typename TagType>
class TagAllocator;

template <typename SizeType>
class TimestampPackets;
56 
// Selects how recorded work is handed over to the hardware.
enum class DispatchMode {
    // Use whatever the given device defaults to.
    DeviceDefault = 0,
    // Every submission goes to the HW right away.
    ImmediateDispatch = 1,
    // An async thread dispatches and merges batch buffers depending on load (not implemented).
    AdaptiveDispatch = 2,
    // Batched; an implicit flush happens after n commands (not implemented).
    BatchedDispatchWithCounter = 3,
    // Batched; nothing is submitted until an explicit clFlush.
    BatchedDispatch = 4
};
64 
65 class CommandStreamReceiver {
66   public:
67     enum class SamplerCacheFlushState {
68         samplerCacheFlushNotRequired,
69         samplerCacheFlushBefore, //add sampler cache flush before Walker with redescribed image
70         samplerCacheFlushAfter   //add sampler cache flush after Walker with redescribed image
71     };
72 
73     using MutexType = std::recursive_mutex;
74     CommandStreamReceiver(ExecutionEnvironment &executionEnvironment,
75                           uint32_t rootDeviceIndex,
76                           const DeviceBitfield deviceBitfield);
77     virtual ~CommandStreamReceiver();
78 
79     virtual bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) = 0;
80 
81     virtual CompletionStamp flushTask(LinearStream &commandStream, size_t commandStreamStart,
82                                       const IndirectHeap &dsh, const IndirectHeap &ioh, const IndirectHeap &ssh,
83                                       uint32_t taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0;
84 
85     virtual bool flushBatchedSubmissions() = 0;
86     MOCKABLE_VIRTUAL int submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
pollForCompletion()87     virtual void pollForCompletion() {}
88     virtual void programHardwareContext(LinearStream &cmdStream) = 0;
89     virtual size_t getCmdsSizeForHardwareContext() const = 0;
90 
91     void makeResident(MultiGraphicsAllocation &gfxAllocation);
92     MOCKABLE_VIRTUAL void makeResident(GraphicsAllocation &gfxAllocation);
93     virtual void makeNonResident(GraphicsAllocation &gfxAllocation);
94     MOCKABLE_VIRTUAL void makeSurfacePackNonResident(ResidencyContainer &allocationsForResidency);
processResidency(const ResidencyContainer & allocationsForResidency,uint32_t handleId)95     virtual void processResidency(const ResidencyContainer &allocationsForResidency, uint32_t handleId) {}
96     virtual void processEviction();
97     void makeResidentHostPtrAllocation(GraphicsAllocation *gfxAllocation);
98 
99     MOCKABLE_VIRTUAL void ensureCommandBufferAllocation(LinearStream &commandStream, size_t minimumRequiredSize, size_t additionalAllocationSize);
100 
101     MemoryManager *getMemoryManager() const;
102 
103     ResidencyContainer &getResidencyAllocations();
104     ResidencyContainer &getEvictionAllocations();
105 
createPageTableManager()106     virtual GmmPageTableMngr *createPageTableManager() { return nullptr; }
107     bool needsPageTableManager() const;
108 
109     MOCKABLE_VIRTUAL void waitForTaskCount(uint32_t requiredTaskCount);
110     void waitForTaskCountAndCleanAllocationList(uint32_t requiredTaskCount, uint32_t allocationUsage);
111     MOCKABLE_VIRTUAL void waitForTaskCountAndCleanTemporaryAllocationList(uint32_t requiredTaskCount);
112 
113     LinearStream &getCS(size_t minRequiredSize = 1024u);
114     OSInterface *getOSInterface() const;
peekExecutionEnvironment()115     ExecutionEnvironment &peekExecutionEnvironment() const { return executionEnvironment; };
116 
117     MOCKABLE_VIRTUAL void setTagAllocation(GraphicsAllocation *allocation);
getTagAllocation()118     GraphicsAllocation *getTagAllocation() const {
119         return tagAllocation;
120     }
getTagsMultiAllocation()121     MultiGraphicsAllocation *getTagsMultiAllocation() const {
122         return tagsMultiAllocation;
123     }
124     MultiGraphicsAllocation &createTagsMultiAllocation();
getTagAddress()125     volatile uint32_t *getTagAddress() const { return tagAddress; }
getDebugPauseStateGPUAddress()126     uint64_t getDebugPauseStateGPUAddress() const { return tagAllocation->getGpuAddress() + debugPauseStateAddressOffset; }
127 
waitForFlushStamp(FlushStamp & flushStampToWait)128     virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) { return true; };
129 
peekTaskCount()130     uint32_t peekTaskCount() const { return taskCount; }
131 
peekTaskLevel()132     uint32_t peekTaskLevel() const { return taskLevel; }
133     FlushStamp obtainCurrentFlushStamp() const;
134 
peekLatestSentTaskCount()135     uint32_t peekLatestSentTaskCount() const { return latestSentTaskCount; }
136 
peekLatestFlushedTaskCount()137     uint32_t peekLatestFlushedTaskCount() const { return latestFlushedTaskCount; }
138 
enableNTo1SubmissionModel()139     void enableNTo1SubmissionModel() { this->nTo1SubmissionModelEnabled = true; }
isNTo1SubmissionModelEnabled()140     bool isNTo1SubmissionModelEnabled() const { return this->nTo1SubmissionModelEnabled; }
overrideDispatchPolicy(DispatchMode overrideValue)141     void overrideDispatchPolicy(DispatchMode overrideValue) { this->dispatchMode = overrideValue; }
142 
setMediaVFEStateDirty(bool dirty)143     void setMediaVFEStateDirty(bool dirty) { mediaVfeStateDirty = dirty; }
getMediaVFEStateDirty()144     bool getMediaVFEStateDirty() { return mediaVfeStateDirty; }
145 
setGSBAStateDirty(bool dirty)146     void setGSBAStateDirty(bool dirty) { GSBAStateDirty = dirty; }
getGSBAStateDirty()147     bool getGSBAStateDirty() { return GSBAStateDirty; }
148 
149     void setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize);
150     GraphicsAllocation *getScratchAllocation();
getDebugSurfaceAllocation()151     GraphicsAllocation *getDebugSurfaceAllocation() const { return debugSurface; }
152     GraphicsAllocation *allocateDebugSurface(size_t size);
getPreemptionAllocation()153     GraphicsAllocation *getPreemptionAllocation() const { return preemptionAllocation; }
getGlobalFenceAllocation()154     GraphicsAllocation *getGlobalFenceAllocation() const { return globalFenceAllocation; }
getWorkPartitionAllocation()155     GraphicsAllocation *getWorkPartitionAllocation() const { return workPartitionAllocation; }
156 
requestStallingCommandsOnNextFlush()157     void requestStallingCommandsOnNextFlush() { stallingCommandsOnNextFlushRequired = true; }
isStallingCommandsOnNextFlushRequired()158     bool isStallingCommandsOnNextFlushRequired() const { return stallingCommandsOnNextFlushRequired; }
159 
160     virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
161     virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
162     bool baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
163     bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait);
downloadAllocations()164     virtual void downloadAllocations(){};
165 
setSamplerCacheFlushRequired(SamplerCacheFlushState value)166     void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
167 
getFlatBatchBufferHelper()168     FlatBatchBufferHelper &getFlatBatchBufferHelper() const { return *flatBatchBufferHelper; }
overwriteFlatBatchBufferHelper(FlatBatchBufferHelper * newHelper)169     void overwriteFlatBatchBufferHelper(FlatBatchBufferHelper *newHelper) { flatBatchBufferHelper.reset(newHelper); }
170 
171     MOCKABLE_VIRTUAL void initProgrammingFlags();
172     virtual AubSubCaptureStatus checkAndActivateAubSubCapture(const std::string &kernelName);
173     void programForAubSubCapture(bool wasActiveInPreviousEnqueue, bool isActive);
174     virtual void addAubComment(const char *comment);
175 
176     IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize);
177     void allocateHeapMemory(IndirectHeap::Type heapType, size_t minRequiredSize, IndirectHeap *&indirectHeap);
178     void releaseIndirectHeap(IndirectHeap::Type heapType);
179 
180     virtual enum CommandStreamReceiverType getType() = 0;
181     void setExperimentalCmdBuffer(std::unique_ptr<ExperimentalCommandBuffer> &&cmdBuffer);
182 
183     bool initializeTagAllocation();
184     MOCKABLE_VIRTUAL bool createWorkPartitionAllocation(const Device &device);
185     MOCKABLE_VIRTUAL bool createGlobalFenceAllocation();
186     MOCKABLE_VIRTUAL bool createPreemptionAllocation();
187     MOCKABLE_VIRTUAL bool createPerDssBackedBuffer(Device &device);
188     MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainUniqueOwnership();
189 
peekTimestampPacketWriteEnabled()190     bool peekTimestampPacketWriteEnabled() const { return timestampPacketWriteEnabled; }
191 
192     size_t defaultSshSize;
193     bool canUse4GbHeaps = true;
194 
195     AllocationsList &getTemporaryAllocations();
196     AllocationsList &getAllocationsForReuse();
getInternalAllocationStorage()197     InternalAllocationStorage *getInternalAllocationStorage() const { return internalAllocationStorage.get(); }
198     MOCKABLE_VIRTUAL bool createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush);
199     virtual size_t getPreferredTagPoolSize() const;
setupContext(OsContext & osContext)200     virtual void setupContext(OsContext &osContext) { this->osContext = &osContext; }
getOsContext()201     OsContext &getOsContext() const { return *osContext; }
202 
203     TagAllocatorBase *getEventTsAllocator();
204     TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize);
205     virtual TagAllocatorBase *getTimestampPacketAllocator() = 0;
206 
207     virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation);
208 
209     virtual bool isMultiOsContextCapable() const = 0;
210 
211     virtual MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired, const HardwareInfo &hwInfo) const = 0;
212 
setLatestSentTaskCount(uint32_t latestSentTaskCount)213     void setLatestSentTaskCount(uint32_t latestSentTaskCount) {
214         this->latestSentTaskCount = latestSentTaskCount;
215     }
setLatestFlushedTaskCount(uint32_t latestFlushedTaskCount)216     void setLatestFlushedTaskCount(uint32_t latestFlushedTaskCount) {
217         this->latestFlushedTaskCount = latestFlushedTaskCount;
218     }
219 
220     virtual uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) = 0;
221 
222     virtual void flushTagUpdate() = 0;
223     virtual void flushNonKernelTask(GraphicsAllocation *eventAlloc, uint64_t immediateGpuAddress, uint64_t immediateData, PipeControlArgs &args, bool isWaitOnEvents, bool isStartOfDispatch, bool isEndOfDispatch) = 0;
224     virtual void updateTagFromWait() = 0;
225     virtual bool isUpdateTagFromWaitEnabled() = 0;
226 
getScratchSpaceController()227     ScratchSpaceController *getScratchSpaceController() const {
228         return scratchSpaceController.get();
229     }
230 
registerInstructionCacheFlush()231     void registerInstructionCacheFlush() {
232         auto mutex = obtainUniqueOwnership();
233         requiresInstructionCacheFlush = true;
234     }
235 
isLocalMemoryEnabled()236     bool isLocalMemoryEnabled() const { return localMemoryEnabled; }
237 
getRootDeviceIndex()238     uint32_t getRootDeviceIndex() { return rootDeviceIndex; }
239 
240     void startControllingDirectSubmissions();
241 
isAnyDirectSubmissionEnabled()242     bool isAnyDirectSubmissionEnabled() {
243         return this->isDirectSubmissionEnabled() || isBlitterDirectSubmissionEnabled();
244     }
245 
initDirectSubmission(Device & device,OsContext & osContext)246     virtual bool initDirectSubmission(Device &device, OsContext &osContext) {
247         return true;
248     }
249 
isDirectSubmissionEnabled()250     virtual bool isDirectSubmissionEnabled() const {
251         return false;
252     }
253 
isBlitterDirectSubmissionEnabled()254     virtual bool isBlitterDirectSubmissionEnabled() const {
255         return false;
256     }
257 
stopDirectSubmission()258     virtual void stopDirectSubmission() {}
259 
isStaticWorkPartitioningEnabled()260     bool isStaticWorkPartitioningEnabled() const {
261         return staticWorkPartitioningEnabled;
262     }
263 
264     uint64_t getWorkPartitionAllocationGpuAddress() const;
265 
266     bool isRcs() const;
267 
initializeDefaultsForInternalEngine()268     virtual void initializeDefaultsForInternalEngine(){};
269 
270     virtual GraphicsAllocation *getClearColorAllocation() = 0;
271 
272     virtual void postInitFlagsSetup() = 0;
273 
isUsedNotifyEnableForPostSync()274     bool isUsedNotifyEnableForPostSync() const {
275         return useNotifyEnableForPostSync;
276     }
277 
getStreamProperties()278     NEO::StreamProperties &getStreamProperties() {
279         return this->streamProperties;
280     }
281 
setActivePartitions(uint32_t newPartitionCount)282     inline void setActivePartitions(uint32_t newPartitionCount) {
283         activePartitions = newPartitionCount;
284     }
285 
getActivePartitions()286     inline uint32_t getActivePartitions() const {
287         return activePartitions;
288     }
289 
290     bool skipResourceCleanup() const;
291 
isProgramActivePartitionConfigRequired()292     inline bool isProgramActivePartitionConfigRequired() const {
293         return this->isDirectSubmissionEnabled() ? false : this->activePartitionsConfig != this->activePartitions;
294     }
295 
296     std::unique_ptr<GmmPageTableMngr> pageTableManager;
297 
getPostSyncWriteOffset()298     inline uint32_t getPostSyncWriteOffset() const {
299         return postSyncWriteOffset;
300     }
301 
isMultiTileOperationEnabled()302     inline bool isMultiTileOperationEnabled() const {
303         return (activePartitions > 1) && staticWorkPartitioningEnabled;
304     }
305 
306     virtual void programComputeBarrierCommand(LinearStream &cmdStream) = 0;
307     virtual size_t getCmdsSizeForComputeBarrierCommand() const = 0;
308 
309   protected:
310     void cleanupResources();
311     void printDeviceIndex();
312     void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation);
313     bool checkImplicitFlushForGpuIdle();
314     MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainHostPtrSurfaceCreationLock();
315 
316     std::unique_ptr<FlushStampTracker> flushStamp;
317     std::unique_ptr<SubmissionAggregator> submissionAggregator;
318     std::unique_ptr<FlatBatchBufferHelper> flatBatchBufferHelper;
319     std::unique_ptr<ExperimentalCommandBuffer> experimentalCmdBuffer;
320     std::unique_ptr<InternalAllocationStorage> internalAllocationStorage;
321     std::unique_ptr<KmdNotifyHelper> kmdNotifyHelper;
322     std::unique_ptr<ScratchSpaceController> scratchSpaceController;
323     std::unique_ptr<TagAllocatorBase> profilingTimeStampAllocator;
324     std::unique_ptr<TagAllocatorBase> perfCounterAllocator;
325     std::unique_ptr<TagAllocatorBase> timestampPacketAllocator;
326     std::unique_ptr<Thread> userPauseConfirmation;
327 
328     ResidencyContainer residencyAllocations;
329     ResidencyContainer evictionAllocations;
330     MutexType ownershipMutex;
331     MutexType hostPtrSurfaceCreationMutex;
332     ExecutionEnvironment &executionEnvironment;
333 
334     LinearStream commandStream;
335     StreamProperties streamProperties{};
336 
337     // offset for debug state is 1kbyte, tag writes can use multiple offsets for multiple partitions and each offset can vary per platform
338     const uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte;
339     uint64_t totalMemoryUsed = 0u;
340 
341     volatile uint32_t *tagAddress = nullptr;
342     volatile DebugPauseState *debugPauseStateAddress = nullptr;
343     SpinLock debugPauseStateLock;
344     static void *asyncDebugBreakConfirmation(void *arg);
345     std::function<void()> debugConfirmationFunction = []() { std::cin.get(); };
346 
347     GraphicsAllocation *tagAllocation = nullptr;
348     GraphicsAllocation *globalFenceAllocation = nullptr;
349     GraphicsAllocation *preemptionAllocation = nullptr;
350     GraphicsAllocation *debugSurface = nullptr;
351     GraphicsAllocation *perDssBackedBuffer = nullptr;
352     GraphicsAllocation *clearColorAllocation = nullptr;
353     GraphicsAllocation *workPartitionAllocation = nullptr;
354 
355     MultiGraphicsAllocation *tagsMultiAllocation = nullptr;
356 
357     IndirectHeap *indirectHeap[IndirectHeap::NUM_TYPES];
358     OsContext *osContext = nullptr;
359 
360     // current taskLevel.  Used for determining if a PIPE_CONTROL is needed.
361     std::atomic<uint32_t> taskLevel{0};
362     std::atomic<uint32_t> latestSentTaskCount{0};
363     std::atomic<uint32_t> latestFlushedTaskCount{0};
364     // taskCount - # of tasks submitted
365     std::atomic<uint32_t> taskCount{0};
366 
367     DispatchMode dispatchMode = DispatchMode::ImmediateDispatch;
368     SamplerCacheFlushState samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired;
369     PreemptionMode lastPreemptionMode = PreemptionMode::Initial;
370 
371     uint32_t lastSentL3Config = 0;
372     uint32_t latestSentStatelessMocsConfig = 0;
373     uint32_t lastSentNumGrfRequired = GrfConfig::DefaultGrfNumber;
374     uint64_t lastSentSliceCount = QueueSliceCount::defaultSliceCount;
375 
376     uint32_t requiredScratchSize = 0;
377     uint32_t requiredPrivateScratchSize = 0;
378     uint32_t lastAdditionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
379     KernelExecutionType lastKernelExecutionType = KernelExecutionType::Default;
380     MemoryCompressionState lastMemoryCompressionState = MemoryCompressionState::NotApplicable;
381     uint32_t activePartitions = 1;
382     uint32_t activePartitionsConfig = 1;
383     uint32_t postSyncWriteOffset = 0;
384 
385     const uint32_t rootDeviceIndex;
386     const DeviceBitfield deviceBitfield;
387 
388     int8_t lastMediaSamplerConfig = -1;
389 
390     bool isPreambleSent = false;
391     bool isStateSipSent = false;
392     bool isEnginePrologueSent = false;
393     bool isPerDssBackedBufferSent = false;
394     bool GSBAFor32BitProgrammed = false;
395     bool GSBAStateDirty = true;
396     bool bindingTableBaseAddressRequired = false;
397     bool mediaVfeStateDirty = true;
398     bool lastVmeSubslicesConfig = false;
399     bool stallingCommandsOnNextFlushRequired = false;
400     bool timestampPacketWriteEnabled = false;
401     bool staticWorkPartitioningEnabled = false;
402     bool nTo1SubmissionModelEnabled = false;
403     bool lastSpecialPipelineSelectMode = false;
404     bool requiresInstructionCacheFlush = false;
405 
406     bool localMemoryEnabled = false;
407     bool pageTableManagerInitialized = false;
408 
409     bool useNewResourceImplicitFlush = false;
410     bool newResources = false;
411     bool useGpuIdleImplicitFlush = false;
412     bool lastSentUseGlobalAtomics = false;
413     bool useNotifyEnableForPostSync = false;
414 };
415 
416 typedef CommandStreamReceiver *(*CommandStreamReceiverCreateFunc)(bool withAubDump,
417                                                                   ExecutionEnvironment &executionEnvironment,
418                                                                   uint32_t rootDeviceIndex,
419                                                                   const DeviceBitfield deviceBitfield);
420 } // namespace NEO
421