1 /* 2 * Copyright (C) 2018-2021 Intel Corporation 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 */ 7 8 #pragma once 9 #include "shared/source/command_stream/aub_subcapture_status.h" 10 #include "shared/source/command_stream/csr_definitions.h" 11 #include "shared/source/command_stream/csr_properties_flags.h" 12 #include "shared/source/command_stream/linear_stream.h" 13 #include "shared/source/command_stream/stream_properties.h" 14 #include "shared/source/command_stream/submissions_aggregator.h" 15 #include "shared/source/command_stream/thread_arbitration_policy.h" 16 #include "shared/source/helpers/aligned_memory.h" 17 #include "shared/source/helpers/blit_commands_helper.h" 18 #include "shared/source/helpers/common_types.h" 19 #include "shared/source/helpers/completion_stamp.h" 20 #include "shared/source/helpers/flat_batch_buffer_helper.h" 21 #include "shared/source/helpers/options.h" 22 #include "shared/source/helpers/pipe_control_args.h" 23 #include "shared/source/indirect_heap/indirect_heap.h" 24 #include "shared/source/kernel/grf_config.h" 25 #include "shared/source/os_interface/os_thread.h" 26 #include "shared/source/utilities/spinlock.h" 27 28 #include <cstddef> 29 #include <cstdint> 30 31 namespace NEO { 32 class AllocationsList; 33 class Device; 34 class ExecutionEnvironment; 35 class ExperimentalCommandBuffer; 36 class GmmPageTableMngr; 37 class GraphicsAllocation; 38 class HostPtrSurface; 39 class IndirectHeap; 40 class InternalAllocationStorage; 41 class LinearStream; 42 class MemoryManager; 43 class MultiGraphicsAllocation; 44 class OsContext; 45 class OSInterface; 46 class ScratchSpaceController; 47 class HwPerfCounter; 48 class HwTimeStamps; 49 class TagAllocatorBase; 50 51 template <typename TSize> 52 class TimestampPackets; 53 54 template <typename T1> 55 class TagAllocator; 56 57 enum class DispatchMode { 58 DeviceDefault = 0, //default for given device 59 ImmediateDispatch, //everything is submitted to the HW immediately 60 AdaptiveDispatch, //dispatching is handled to async thread, which combines batch buffers basing on load (not implemented) 61 BatchedDispatchWithCounter, //dispatching is batched, after n commands there is implicit flush (not implemented) 62 BatchedDispatch // dispatching is batched, explicit clFlush is required 63 }; 64 65 class CommandStreamReceiver { 66 public: 67 enum class SamplerCacheFlushState { 68 samplerCacheFlushNotRequired, 69 samplerCacheFlushBefore, //add sampler cache flush before Walker with redescribed image 70 samplerCacheFlushAfter //add sampler cache flush after Walker with redescribed image 71 }; 72 73 using MutexType = std::recursive_mutex; 74 CommandStreamReceiver(ExecutionEnvironment &executionEnvironment, 75 uint32_t rootDeviceIndex, 76 const DeviceBitfield deviceBitfield); 77 virtual ~CommandStreamReceiver(); 78 79 virtual bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) = 0; 80 81 virtual CompletionStamp flushTask(LinearStream &commandStream, size_t commandStreamStart, 82 const IndirectHeap &dsh, const IndirectHeap &ioh, const IndirectHeap &ssh, 83 uint32_t taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0; 84 85 virtual bool flushBatchedSubmissions() = 0; 86 MOCKABLE_VIRTUAL int submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency); pollForCompletion()87 virtual void pollForCompletion() {} 88 virtual void programHardwareContext(LinearStream &cmdStream) = 0; 89 virtual size_t getCmdsSizeForHardwareContext() const = 0; 90 91 void makeResident(MultiGraphicsAllocation &gfxAllocation); 92 MOCKABLE_VIRTUAL void makeResident(GraphicsAllocation &gfxAllocation); 93 virtual void makeNonResident(GraphicsAllocation &gfxAllocation); 94 MOCKABLE_VIRTUAL void makeSurfacePackNonResident(ResidencyContainer &allocationsForResidency); processResidency(const ResidencyContainer & allocationsForResidency,uint32_t handleId)95 virtual void processResidency(const ResidencyContainer &allocationsForResidency, uint32_t handleId) {} 96 virtual void processEviction(); 97 void makeResidentHostPtrAllocation(GraphicsAllocation *gfxAllocation); 98 99 MOCKABLE_VIRTUAL void ensureCommandBufferAllocation(LinearStream &commandStream, size_t minimumRequiredSize, size_t additionalAllocationSize); 100 101 MemoryManager *getMemoryManager() const; 102 103 ResidencyContainer &getResidencyAllocations(); 104 ResidencyContainer &getEvictionAllocations(); 105 createPageTableManager()106 virtual GmmPageTableMngr *createPageTableManager() { return nullptr; } 107 bool needsPageTableManager() const; 108 109 MOCKABLE_VIRTUAL void waitForTaskCount(uint32_t requiredTaskCount); 110 void waitForTaskCountAndCleanAllocationList(uint32_t requiredTaskCount, uint32_t allocationUsage); 111 MOCKABLE_VIRTUAL void waitForTaskCountAndCleanTemporaryAllocationList(uint32_t requiredTaskCount); 112 113 LinearStream &getCS(size_t minRequiredSize = 1024u); 114 OSInterface *getOSInterface() const; peekExecutionEnvironment()115 ExecutionEnvironment &peekExecutionEnvironment() const { return executionEnvironment; }; 116 117 MOCKABLE_VIRTUAL void setTagAllocation(GraphicsAllocation *allocation); getTagAllocation()118 GraphicsAllocation *getTagAllocation() const { 119 return tagAllocation; 120 } getTagsMultiAllocation()121 MultiGraphicsAllocation *getTagsMultiAllocation() const { 122 return tagsMultiAllocation; 123 } 124 MultiGraphicsAllocation &createTagsMultiAllocation(); getTagAddress()125 volatile uint32_t *getTagAddress() const { return tagAddress; } getDebugPauseStateGPUAddress()126 uint64_t getDebugPauseStateGPUAddress() const { return tagAllocation->getGpuAddress() + debugPauseStateAddressOffset; } 127 waitForFlushStamp(FlushStamp & flushStampToWait)128 virtual bool waitForFlushStamp(FlushStamp &flushStampToWait) { return true; }; 129 peekTaskCount()130 uint32_t peekTaskCount() const { return taskCount; } 131 peekTaskLevel()132 uint32_t peekTaskLevel() const { return taskLevel; } 133 FlushStamp obtainCurrentFlushStamp() const; 134 peekLatestSentTaskCount()135 uint32_t peekLatestSentTaskCount() const { return latestSentTaskCount; } 136 peekLatestFlushedTaskCount()137 uint32_t peekLatestFlushedTaskCount() const { return latestFlushedTaskCount; } 138 enableNTo1SubmissionModel()139 void enableNTo1SubmissionModel() { this->nTo1SubmissionModelEnabled = true; } isNTo1SubmissionModelEnabled()140 bool isNTo1SubmissionModelEnabled() const { return this->nTo1SubmissionModelEnabled; } overrideDispatchPolicy(DispatchMode overrideValue)141 void overrideDispatchPolicy(DispatchMode overrideValue) { this->dispatchMode = overrideValue; } 142 setMediaVFEStateDirty(bool dirty)143 void setMediaVFEStateDirty(bool dirty) { mediaVfeStateDirty = dirty; } getMediaVFEStateDirty()144 bool getMediaVFEStateDirty() { return mediaVfeStateDirty; } 145 setGSBAStateDirty(bool dirty)146 void setGSBAStateDirty(bool dirty) { GSBAStateDirty = dirty; } getGSBAStateDirty()147 bool getGSBAStateDirty() { return GSBAStateDirty; } 148 149 void setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize); 150 GraphicsAllocation *getScratchAllocation(); getDebugSurfaceAllocation()151 GraphicsAllocation *getDebugSurfaceAllocation() const { return debugSurface; } 152 GraphicsAllocation *allocateDebugSurface(size_t size); getPreemptionAllocation()153 GraphicsAllocation *getPreemptionAllocation() const { return preemptionAllocation; } getGlobalFenceAllocation()154 GraphicsAllocation *getGlobalFenceAllocation() const { return globalFenceAllocation; } getWorkPartitionAllocation()155 GraphicsAllocation *getWorkPartitionAllocation() const { return workPartitionAllocation; } 156 requestStallingCommandsOnNextFlush()157 void requestStallingCommandsOnNextFlush() { stallingCommandsOnNextFlushRequired = true; } isStallingCommandsOnNextFlushRequired()158 bool isStallingCommandsOnNextFlushRequired() const { return stallingCommandsOnNextFlushRequired; } 159 160 virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0; 161 virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); 162 bool baseWaitFunction(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait); 163 bool testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait); downloadAllocations()164 virtual void downloadAllocations(){}; 165 setSamplerCacheFlushRequired(SamplerCacheFlushState value)166 void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; } 167 getFlatBatchBufferHelper()168 FlatBatchBufferHelper &getFlatBatchBufferHelper() const { return *flatBatchBufferHelper; } overwriteFlatBatchBufferHelper(FlatBatchBufferHelper * newHelper)169 void overwriteFlatBatchBufferHelper(FlatBatchBufferHelper *newHelper) { flatBatchBufferHelper.reset(newHelper); } 170 171 MOCKABLE_VIRTUAL void initProgrammingFlags(); 172 virtual AubSubCaptureStatus checkAndActivateAubSubCapture(const std::string &kernelName); 173 void programForAubSubCapture(bool wasActiveInPreviousEnqueue, bool isActive); 174 virtual void addAubComment(const char *comment); 175 176 IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize); 177 void allocateHeapMemory(IndirectHeap::Type heapType, size_t minRequiredSize, IndirectHeap *&indirectHeap); 178 void releaseIndirectHeap(IndirectHeap::Type heapType); 179 180 virtual enum CommandStreamReceiverType getType() = 0; 181 void setExperimentalCmdBuffer(std::unique_ptr<ExperimentalCommandBuffer> &&cmdBuffer); 182 183 bool initializeTagAllocation(); 184 MOCKABLE_VIRTUAL bool createWorkPartitionAllocation(const Device &device); 185 MOCKABLE_VIRTUAL bool createGlobalFenceAllocation(); 186 MOCKABLE_VIRTUAL bool createPreemptionAllocation(); 187 MOCKABLE_VIRTUAL bool createPerDssBackedBuffer(Device &device); 188 MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainUniqueOwnership(); 189 peekTimestampPacketWriteEnabled()190 bool peekTimestampPacketWriteEnabled() const { return timestampPacketWriteEnabled; } 191 192 size_t defaultSshSize; 193 bool canUse4GbHeaps = true; 194 195 AllocationsList &getTemporaryAllocations(); 196 AllocationsList &getAllocationsForReuse(); getInternalAllocationStorage()197 InternalAllocationStorage *getInternalAllocationStorage() const { return internalAllocationStorage.get(); } 198 MOCKABLE_VIRTUAL bool createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush); 199 virtual size_t getPreferredTagPoolSize() const; setupContext(OsContext & osContext)200 virtual void setupContext(OsContext &osContext) { this->osContext = &osContext; } getOsContext()201 OsContext &getOsContext() const { return *osContext; } 202 203 TagAllocatorBase *getEventTsAllocator(); 204 TagAllocatorBase *getEventPerfCountAllocator(const uint32_t tagSize); 205 virtual TagAllocatorBase *getTimestampPacketAllocator() = 0; 206 207 virtual bool expectMemory(const void *gfxAddress, const void *srcAddress, size_t length, uint32_t compareOperation); 208 209 virtual bool isMultiOsContextCapable() const = 0; 210 211 virtual MemoryCompressionState getMemoryCompressionState(bool auxTranslationRequired, const HardwareInfo &hwInfo) const = 0; 212 setLatestSentTaskCount(uint32_t latestSentTaskCount)213 void setLatestSentTaskCount(uint32_t latestSentTaskCount) { 214 this->latestSentTaskCount = latestSentTaskCount; 215 } setLatestFlushedTaskCount(uint32_t latestFlushedTaskCount)216 void setLatestFlushedTaskCount(uint32_t latestFlushedTaskCount) { 217 this->latestFlushedTaskCount = latestFlushedTaskCount; 218 } 219 220 virtual uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled, Device &device) = 0; 221 222 virtual void flushTagUpdate() = 0; 223 virtual void flushNonKernelTask(GraphicsAllocation *eventAlloc, uint64_t immediateGpuAddress, uint64_t immediateData, PipeControlArgs &args, bool isWaitOnEvents, bool isStartOfDispatch, bool isEndOfDispatch) = 0; 224 virtual void updateTagFromWait() = 0; 225 virtual bool isUpdateTagFromWaitEnabled() = 0; 226 getScratchSpaceController()227 ScratchSpaceController *getScratchSpaceController() const { 228 return scratchSpaceController.get(); 229 } 230 registerInstructionCacheFlush()231 void registerInstructionCacheFlush() { 232 auto mutex = obtainUniqueOwnership(); 233 requiresInstructionCacheFlush = true; 234 } 235 isLocalMemoryEnabled()236 bool isLocalMemoryEnabled() const { return localMemoryEnabled; } 237 getRootDeviceIndex()238 uint32_t getRootDeviceIndex() { return rootDeviceIndex; } 239 240 void startControllingDirectSubmissions(); 241 isAnyDirectSubmissionEnabled()242 bool isAnyDirectSubmissionEnabled() { 243 return this->isDirectSubmissionEnabled() || isBlitterDirectSubmissionEnabled(); 244 } 245 initDirectSubmission(Device & device,OsContext & osContext)246 virtual bool initDirectSubmission(Device &device, OsContext &osContext) { 247 return true; 248 } 249 isDirectSubmissionEnabled()250 virtual bool isDirectSubmissionEnabled() const { 251 return false; 252 } 253 isBlitterDirectSubmissionEnabled()254 virtual bool isBlitterDirectSubmissionEnabled() const { 255 return false; 256 } 257 stopDirectSubmission()258 virtual void stopDirectSubmission() {} 259 isStaticWorkPartitioningEnabled()260 bool isStaticWorkPartitioningEnabled() const { 261 return staticWorkPartitioningEnabled; 262 } 263 264 uint64_t getWorkPartitionAllocationGpuAddress() const; 265 266 bool isRcs() const; 267 initializeDefaultsForInternalEngine()268 virtual void initializeDefaultsForInternalEngine(){}; 269 270 virtual GraphicsAllocation *getClearColorAllocation() = 0; 271 272 virtual void postInitFlagsSetup() = 0; 273 isUsedNotifyEnableForPostSync()274 bool isUsedNotifyEnableForPostSync() const { 275 return useNotifyEnableForPostSync; 276 } 277 getStreamProperties()278 NEO::StreamProperties &getStreamProperties() { 279 return this->streamProperties; 280 } 281 setActivePartitions(uint32_t newPartitionCount)282 inline void setActivePartitions(uint32_t newPartitionCount) { 283 activePartitions = newPartitionCount; 284 } 285 getActivePartitions()286 inline uint32_t getActivePartitions() const { 287 return activePartitions; 288 } 289 290 bool skipResourceCleanup() const; 291 isProgramActivePartitionConfigRequired()292 inline bool isProgramActivePartitionConfigRequired() const { 293 return this->isDirectSubmissionEnabled() ? false : this->activePartitionsConfig != this->activePartitions; 294 } 295 296 std::unique_ptr<GmmPageTableMngr> pageTableManager; 297 getPostSyncWriteOffset()298 inline uint32_t getPostSyncWriteOffset() const { 299 return postSyncWriteOffset; 300 } 301 isMultiTileOperationEnabled()302 inline bool isMultiTileOperationEnabled() const { 303 return (activePartitions > 1) && staticWorkPartitioningEnabled; 304 } 305 306 virtual void programComputeBarrierCommand(LinearStream &cmdStream) = 0; 307 virtual size_t getCmdsSizeForComputeBarrierCommand() const = 0; 308 309 protected: 310 void cleanupResources(); 311 void printDeviceIndex(); 312 void checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation); 313 bool checkImplicitFlushForGpuIdle(); 314 MOCKABLE_VIRTUAL std::unique_lock<MutexType> obtainHostPtrSurfaceCreationLock(); 315 316 std::unique_ptr<FlushStampTracker> flushStamp; 317 std::unique_ptr<SubmissionAggregator> submissionAggregator; 318 std::unique_ptr<FlatBatchBufferHelper> flatBatchBufferHelper; 319 std::unique_ptr<ExperimentalCommandBuffer> experimentalCmdBuffer; 320 std::unique_ptr<InternalAllocationStorage> internalAllocationStorage; 321 std::unique_ptr<KmdNotifyHelper> kmdNotifyHelper; 322 std::unique_ptr<ScratchSpaceController> scratchSpaceController; 323 std::unique_ptr<TagAllocatorBase> profilingTimeStampAllocator; 324 std::unique_ptr<TagAllocatorBase> perfCounterAllocator; 325 std::unique_ptr<TagAllocatorBase> timestampPacketAllocator; 326 std::unique_ptr<Thread> userPauseConfirmation; 327 328 ResidencyContainer residencyAllocations; 329 ResidencyContainer evictionAllocations; 330 MutexType ownershipMutex; 331 MutexType hostPtrSurfaceCreationMutex; 332 ExecutionEnvironment &executionEnvironment; 333 334 LinearStream commandStream; 335 StreamProperties streamProperties{}; 336 337 // offset for debug state is 1kbyte, tag writes can use multiple offsets for multiple partitions and each offset can vary per platform 338 const uint64_t debugPauseStateAddressOffset = MemoryConstants::kiloByte; 339 uint64_t totalMemoryUsed = 0u; 340 341 volatile uint32_t *tagAddress = nullptr; 342 volatile DebugPauseState *debugPauseStateAddress = nullptr; 343 SpinLock debugPauseStateLock; 344 static void *asyncDebugBreakConfirmation(void *arg); 345 std::function<void()> debugConfirmationFunction = []() { std::cin.get(); }; 346 347 GraphicsAllocation *tagAllocation = nullptr; 348 GraphicsAllocation *globalFenceAllocation = nullptr; 349 GraphicsAllocation *preemptionAllocation = nullptr; 350 GraphicsAllocation *debugSurface = nullptr; 351 GraphicsAllocation *perDssBackedBuffer = nullptr; 352 GraphicsAllocation *clearColorAllocation = nullptr; 353 GraphicsAllocation *workPartitionAllocation = nullptr; 354 355 MultiGraphicsAllocation *tagsMultiAllocation = nullptr; 356 357 IndirectHeap *indirectHeap[IndirectHeap::NUM_TYPES]; 358 OsContext *osContext = nullptr; 359 360 // current taskLevel. Used for determining if a PIPE_CONTROL is needed. 361 std::atomic<uint32_t> taskLevel{0}; 362 std::atomic<uint32_t> latestSentTaskCount{0}; 363 std::atomic<uint32_t> latestFlushedTaskCount{0}; 364 // taskCount - # of tasks submitted 365 std::atomic<uint32_t> taskCount{0}; 366 367 DispatchMode dispatchMode = DispatchMode::ImmediateDispatch; 368 SamplerCacheFlushState samplerCacheFlushRequired = SamplerCacheFlushState::samplerCacheFlushNotRequired; 369 PreemptionMode lastPreemptionMode = PreemptionMode::Initial; 370 371 uint32_t lastSentL3Config = 0; 372 uint32_t latestSentStatelessMocsConfig = 0; 373 uint32_t lastSentNumGrfRequired = GrfConfig::DefaultGrfNumber; 374 uint64_t lastSentSliceCount = QueueSliceCount::defaultSliceCount; 375 376 uint32_t requiredScratchSize = 0; 377 uint32_t requiredPrivateScratchSize = 0; 378 uint32_t lastAdditionalKernelExecInfo = AdditionalKernelExecInfo::NotSet; 379 KernelExecutionType lastKernelExecutionType = KernelExecutionType::Default; 380 MemoryCompressionState lastMemoryCompressionState = MemoryCompressionState::NotApplicable; 381 uint32_t activePartitions = 1; 382 uint32_t activePartitionsConfig = 1; 383 uint32_t postSyncWriteOffset = 0; 384 385 const uint32_t rootDeviceIndex; 386 const DeviceBitfield deviceBitfield; 387 388 int8_t lastMediaSamplerConfig = -1; 389 390 bool isPreambleSent = false; 391 bool isStateSipSent = false; 392 bool isEnginePrologueSent = false; 393 bool isPerDssBackedBufferSent = false; 394 bool GSBAFor32BitProgrammed = false; 395 bool GSBAStateDirty = true; 396 bool bindingTableBaseAddressRequired = false; 397 bool mediaVfeStateDirty = true; 398 bool lastVmeSubslicesConfig = false; 399 bool stallingCommandsOnNextFlushRequired = false; 400 bool timestampPacketWriteEnabled = false; 401 bool staticWorkPartitioningEnabled = false; 402 bool nTo1SubmissionModelEnabled = false; 403 bool lastSpecialPipelineSelectMode = false; 404 bool requiresInstructionCacheFlush = false; 405 406 bool localMemoryEnabled = false; 407 bool pageTableManagerInitialized = false; 408 409 bool useNewResourceImplicitFlush = false; 410 bool newResources = false; 411 bool useGpuIdleImplicitFlush = false; 412 bool lastSentUseGlobalAtomics = false; 413 bool useNotifyEnableForPostSync = false; 414 }; 415 416 typedef CommandStreamReceiver *(*CommandStreamReceiverCreateFunc)(bool withAubDump, 417 ExecutionEnvironment &executionEnvironment, 418 uint32_t rootDeviceIndex, 419 const DeviceBitfield deviceBitfield); 420 } // namespace NEO 421