1 //////////////////////////////////////////////////////////////////////////////// 2 // 3 // The University of Illinois/NCSA 4 // Open Source License (NCSA) 5 // 6 // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. 7 // 8 // Developed by: 9 // 10 // AMD Research and AMD HSA Software Development 11 // 12 // Advanced Micro Devices, Inc. 13 // 14 // www.amd.com 15 // 16 // Permission is hereby granted, free of charge, to any person obtaining a copy 17 // of this software and associated documentation files (the "Software"), to 18 // deal with the Software without restriction, including without limitation 19 // the rights to use, copy, modify, merge, publish, distribute, sublicense, 20 // and/or sell copies of the Software, and to permit persons to whom the 21 // Software is furnished to do so, subject to the following conditions: 22 // 23 // - Redistributions of source code must retain the above copyright notice, 24 // this list of conditions and the following disclaimers. 25 // - Redistributions in binary form must reproduce the above copyright 26 // notice, this list of conditions and the following disclaimers in 27 // the documentation and/or other materials provided with the distribution. 28 // - Neither the names of Advanced Micro Devices, Inc, 29 // nor the names of its contributors may be used to endorse or promote 30 // products derived from this Software without specific prior written 31 // permission. 32 // 33 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 34 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 35 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 36 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 37 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 38 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 39 // DEALINGS WITH THE SOFTWARE. 40 // 41 //////////////////////////////////////////////////////////////////////////////// 42 43 // AMD specific HSA backend. 44 45 #ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ 46 #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_ 47 48 #include <vector> 49 #include <map> 50 51 #include "hsakmt.h" 52 53 #include "core/inc/runtime.h" 54 #include "core/inc/agent.h" 55 #include "core/inc/blit.h" 56 #include "core/inc/signal.h" 57 #include "core/inc/cache.h" 58 #include "core/util/small_heap.h" 59 #include "core/util/locks.h" 60 #include "core/util/lazy_ptr.h" 61 62 namespace amd { 63 class MemoryRegion; 64 65 // @brief Contains scratch memory information. 66 struct ScratchInfo { 67 void* queue_base; 68 size_t size; 69 size_t size_per_thread; 70 ptrdiff_t queue_process_offset; 71 bool large; 72 bool retry; 73 }; 74 75 // @brief Interface to represent a GPU agent. 76 class GpuAgentInt : public core::Agent { 77 public: 78 // @brief Constructor GpuAgentInt(uint32_t node_id)79 GpuAgentInt(uint32_t node_id) 80 : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {} 81 82 // @brief Ensure blits are ready (performance hint). PreloadBlits()83 virtual void PreloadBlits() {} 84 85 // @brief Initialization hook invoked after tools library has loaded, 86 // to allow tools interception of interface functions. 87 // 88 // @retval HSA_STATUS_SUCCESS if initialization is successful. 89 virtual hsa_status_t PostToolsInit() = 0; 90 91 // @brief Invoke the user provided callback for each region accessible by 92 // this agent. 93 // 94 // @param [in] include_peer If true, the callback will be also invoked on each 95 // peer memory region accessible by this agent. If false, only invoke the 96 // callback on memory region owned by this agent. 97 // @param [in] callback User provided callback function. 98 // @param [in] data User provided pointer as input for @p callback. 99 // 100 // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed 101 // region returns ::HSA_STATUS_SUCCESS. 102 virtual hsa_status_t VisitRegion(bool include_peer, 103 hsa_status_t (*callback)(hsa_region_t region, 104 void* data), 105 void* data) const = 0; 106 107 // @brief Carve scratch memory from scratch pool. 108 // 109 // @param [in/out] scratch Structure to be populated with the carved memory 110 // information. 111 virtual void AcquireQueueScratch(ScratchInfo& scratch) = 0; 112 113 // @brief Release scratch memory back to scratch pool. 114 // 115 // @param [in/out] scratch Scratch memory previously acquired with call to 116 // ::AcquireQueueScratch. 117 virtual void ReleaseQueueScratch(ScratchInfo& base) = 0; 118 119 // @brief Translate the kernel start and end dispatch timestamp from agent 120 // domain to host domain. 121 // 122 // @param [in] signal Pointer to signal that provides the dispatch timing. 123 // @param [out] time Structure to be populated with the host domain value. 124 virtual void TranslateTime(core::Signal* signal, 125 hsa_amd_profiling_dispatch_time_t& time) = 0; 126 127 // @brief Translate the async copy start and end timestamp from agent 128 // domain to host domain. 129 // 130 // @param [in] signal Pointer to signal that provides the async copy timing. 131 // @param [out] time Structure to be populated with the host domain value. TranslateTime(core::Signal * signal,hsa_amd_profiling_async_copy_time_t & time)132 virtual void TranslateTime(core::Signal* signal, 133 hsa_amd_profiling_async_copy_time_t& time) { 134 return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time); 135 } 136 137 // @brief Translate timestamp agent domain to host domain. 138 // 139 // @param [out] time Timestamp in agent domain. 140 virtual uint64_t TranslateTime(uint64_t tick) = 0; 141 142 // @brief Invalidate caches on the agent which may hold code object data. 143 virtual void InvalidateCodeCaches() = 0; 144 145 // @brief Sets the coherency type of this agent. 146 // 147 // @param [in] type New coherency type. 148 // 149 // @retval true The new coherency type is set successfuly. 150 virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0; 151 152 // @brief Returns the current coherency type of this agent. 153 // 154 // @retval Coherency type. 155 virtual hsa_amd_coherency_type_t current_coherency_type() const = 0; 156 157 // @brief Query if agent represent Kaveri GPU. 158 // 159 // @retval true if agent is Kaveri GPU. 160 virtual bool is_kv_device() const = 0; 161 162 // @brief Query the agent HSA profile. 163 // 164 // @retval HSA profile. 165 virtual hsa_profile_t profile() const = 0; 166 167 // @brief Query the agent memory bus width in bit. 168 // 169 // @retval Bus width in bit. 170 virtual uint32_t memory_bus_width() const = 0; 171 172 // @brief Query the agent memory maximum frequency in MHz. 173 // 174 // @retval Bus width in MHz. 175 virtual uint32_t memory_max_frequency() const = 0; 176 }; 177 178 class GpuAgent : public GpuAgentInt { 179 public: 180 // @brief GPU agent constructor. 181 // 182 // @param [in] node Node id. Each CPU in different socket will get distinct 183 // id. 184 // @param [in] node_props Node property. 185 GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props); 186 187 // @brief GPU agent destructor. 188 ~GpuAgent(); 189 190 // @brief Ensure blits are ready (performance hint). 191 void PreloadBlits() override; 192 193 // @brief Override from core::Agent. 194 hsa_status_t PostToolsInit() override; 195 196 uint16_t GetMicrocodeVersion() const; 197 198 uint16_t GetSdmaMicrocodeVersion() const; 199 200 // @brief Assembles SP3 shader source into ISA or AQL code object. 201 // 202 // @param [in] src_sp3 SP3 shader source text representation. 203 // @param [in] func_name Name of the SP3 function to assemble. 204 // @param [in] assemble_target ISA or AQL assembly target. 205 // @param [out] code_buf Code object buffer. 206 // @param [out] code_buf_size Size of code object buffer in bytes. 207 enum class AssembleTarget { ISA, AQL }; 208 209 void AssembleShader(const char* src_sp3, const char* func_name, 210 AssembleTarget assemble_target, void*& code_buf, 211 size_t& code_buf_size) const; 212 213 // @brief Frees code object created by AssembleShader. 214 // 215 // @param [in] code_buf Code object buffer. 216 // @param [in] code_buf_size Size of code object buffer in bytes. 217 void ReleaseShader(void* code_buf, size_t code_buf_size) const; 218 219 // @brief Override from core::Agent. 220 hsa_status_t VisitRegion(bool include_peer, 221 hsa_status_t (*callback)(hsa_region_t region, 222 void* data), 223 void* data) const override; 224 225 // @brief Override from core::Agent. 226 hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region, 227 void* data), 228 void* data) const override; 229 230 // @brief Override from core::Agent. 231 hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data), 232 void* value) const override; 233 234 // @brief Override from core::Agent. 235 hsa_status_t DmaCopy(void* dst, const void* src, size_t size) override; 236 237 // @brief Override from core::Agent. 238 hsa_status_t DmaCopy(void* dst, core::Agent& dst_agent, const void* src, 239 core::Agent& src_agent, size_t size, 240 std::vector<core::Signal*>& dep_signals, 241 core::Signal& out_signal) override; 242 243 // @brief Override from core::Agent. 244 hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override; 245 246 // @brief Get the next available end timestamp object. 247 uint64_t* ObtainEndTsObject(); 248 249 // @brief Override from core::Agent. 250 hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override; 251 252 // @brief Override from core::Agent. 253 hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type, 254 core::HsaEventCallback event_callback, void* data, 255 uint32_t private_segment_size, 256 uint32_t group_segment_size, 257 core::Queue** queue) override; 258 259 // @brief Override from amd::GpuAgentInt. 260 void AcquireQueueScratch(ScratchInfo& scratch) override; 261 262 // @brief Override from amd::GpuAgentInt. 263 void ReleaseQueueScratch(ScratchInfo& scratch) override; 264 265 // @brief Register signal for notification when scratch may become available. 266 // @p signal is notified by OR'ing with @p value. AddScratchNotifier(hsa_signal_t signal,hsa_signal_value_t value)267 void AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) { 268 ScopedAcquire<KernelMutex> lock(&scratch_lock_); 269 scratch_notifiers_[signal] = value; 270 } 271 272 // @brief Deregister scratch notification signal. RemoveScratchNotifier(hsa_signal_t signal)273 void RemoveScratchNotifier(hsa_signal_t signal) { 274 ScopedAcquire<KernelMutex> lock(&scratch_lock_); 275 scratch_notifiers_.erase(signal); 276 } 277 278 // @brief Override from amd::GpuAgentInt. 279 void TranslateTime(core::Signal* signal, 280 hsa_amd_profiling_dispatch_time_t& time) override; 281 282 // @brief Override from amd::GpuAgentInt. 283 uint64_t TranslateTime(uint64_t tick) override; 284 285 // @brief Override from amd::GpuAgentInt. 286 void InvalidateCodeCaches() override; 287 288 // @brief Override from amd::GpuAgentInt. 289 bool current_coherency_type(hsa_amd_coherency_type_t type) override; 290 current_coherency_type()291 hsa_amd_coherency_type_t current_coherency_type() const override { 292 return current_coherency_type_; 293 } 294 295 // Getter & setters. 296 297 // @brief Returns node property. properties()298 __forceinline const HsaNodeProperties& properties() const { 299 return properties_; 300 } 301 302 // @brief Returns number of data caches. num_cache()303 __forceinline size_t num_cache() const { return cache_props_.size(); } 304 305 // @brief Returns data cache property. 306 // 307 // @param [in] idx Cache level. cache_prop(int idx)308 __forceinline const HsaCacheProperties& cache_prop(int idx) const { 309 return cache_props_[idx]; 310 } 311 312 // @brief Override from core::Agent. regions()313 const std::vector<const core::MemoryRegion*>& regions() const override { 314 return regions_; 315 } 316 317 // @brief Override from core::Agent. isa()318 const core::Isa* isa() const override { return isa_; } 319 320 // @brief Override from amd::GpuAgentInt. is_kv_device()321 __forceinline bool is_kv_device() const override { return is_kv_device_; } 322 323 // @brief Override from amd::GpuAgentInt. profile()324 __forceinline hsa_profile_t profile() const override { return profile_; } 325 326 // @brief Override from amd::GpuAgentInt. memory_bus_width()327 __forceinline uint32_t memory_bus_width() const override { 328 return memory_bus_width_; 329 } 330 331 // @brief Override from amd::GpuAgentInt. memory_max_frequency()332 __forceinline uint32_t memory_max_frequency() const override { 333 return memory_max_frequency_; 334 } 335 336 protected: 337 static const uint32_t minAqlSize_ = 0x1000; // 4KB min 338 static const uint32_t maxAqlSize_ = 0x20000; // 8MB max 339 340 // @brief Create a queue through HSA API to allow tools to intercept. 341 core::Queue* CreateInterceptibleQueue(); 342 343 // @brief Create SDMA blit object. 344 // 345 // @retval NULL if SDMA blit creation and initialization failed. 346 core::Blit* CreateBlitSdma(bool h2d); 347 348 // @brief Create Kernel blit object using provided compute queue. 349 // 350 // @retval NULL if Kernel blit creation and initialization failed. 351 core::Blit* CreateBlitKernel(core::Queue* queue); 352 353 // @brief Invoke the user provided callback for every region in @p regions. 354 // 355 // @param [in] regions Array of region object. 356 // @param [in] callback User provided callback function. 357 // @param [in] data User provided pointer as input for @p callback. 358 // 359 // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed 360 // region returns ::HSA_STATUS_SUCCESS. 361 hsa_status_t VisitRegion( 362 const std::vector<const core::MemoryRegion*>& regions, 363 hsa_status_t (*callback)(hsa_region_t region, void* data), 364 void* data) const; 365 366 // @brief Update ::t1_ tick count. 367 void SyncClocks(); 368 369 // @brief Binds the second-level trap handler to this node. 370 void BindTrapHandler(); 371 372 // @brief Override from core::Agent. 373 hsa_status_t EnableDmaProfiling(bool enable) override; 374 375 // @brief Node properties. 376 const HsaNodeProperties properties_; 377 378 // @brief Current coherency type. 379 hsa_amd_coherency_type_t current_coherency_type_; 380 381 // @brief Maximum number of queues that can be created. 382 uint32_t max_queues_; 383 384 // @brief Object to manage scratch memory. 385 SmallHeap scratch_pool_; 386 387 // @brief Current short duration scratch memory size. 388 size_t scratch_used_large_; 389 390 // @brief Notifications for scratch release. 391 std::map<hsa_signal_t, hsa_signal_value_t> scratch_notifiers_; 392 393 // @brief Default scratch size per queue. 394 size_t queue_scratch_len_; 395 396 // @brief Default scratch size per work item. 397 size_t scratch_per_thread_; 398 399 // @brief Blit interfaces for each data path. 400 enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount }; 401 402 lazy_ptr<core::Blit> blits_[BlitCount]; 403 404 // @brief AQL queues for cache management and blit compute usage. 405 enum QueueEnum { 406 QueueUtility, // Cache management and device to {host,device} blit compute 407 QueueBlitOnly, // Host to device blit 408 QueueCount 409 }; 410 411 lazy_ptr<core::Queue> queues_[QueueCount]; 412 413 // @brief Mutex to protect the update to coherency type. 414 KernelMutex coherency_lock_; 415 416 // @brief Mutex to protect access to scratch pool. 417 KernelMutex scratch_lock_; 418 419 // @brief Mutex to protect access to ::t1_. 420 KernelMutex t1_lock_; 421 422 // @brief Mutex to protect access to blit objects. 423 KernelMutex blit_lock_; 424 425 // @brief GPU tick on initialization. 426 HsaClockCounters t0_; 427 428 HsaClockCounters t1_; 429 430 // @brief Array of GPU cache property. 431 std::vector<HsaCacheProperties> cache_props_; 432 433 // @brief Array of HSA cache objects. 434 std::vector<std::unique_ptr<core::Cache>> caches_; 435 436 // @brief Array of regions owned by this agent. 437 std::vector<const core::MemoryRegion*> regions_; 438 439 MemoryRegion* local_region_; 440 441 core::Isa* isa_; 442 443 // @brief HSA profile. 444 hsa_profile_t profile_; 445 446 bool is_kv_device_; 447 448 void* trap_code_buf_; 449 450 size_t trap_code_buf_size_; 451 452 // @brief The GPU memory bus width in bit. 453 uint32_t memory_bus_width_; 454 455 // @brief The GPU memory maximum frequency in MHz. 456 uint32_t memory_max_frequency_; 457 458 private: 459 // @brief Query the driver to get the region list owned by this agent. 460 void InitRegionList(); 461 462 // @brief Reserve memory for scratch pool to be used by AQL queue of this 463 // agent. 464 void InitScratchPool(); 465 466 // @brief Query the driver to get the cache properties. 467 void InitCacheList(); 468 469 // @brief Create internal queues and blits. 470 void InitDma(); 471 472 // @brief Initialize memory pool for end timestamp object. 473 // @retval True if the memory pool for end timestamp object is initialized. 474 bool InitEndTsPool(); 475 476 // @brief Alternative aperture base address. Only on KV. 477 uintptr_t ape1_base_; 478 479 // @brief Alternative aperture size. Only on KV. 480 size_t ape1_size_; 481 482 // Each end ts is 32 bytes. 483 static const size_t kTsSize = 32; 484 485 // Number of element in the pool. 486 uint32_t end_ts_pool_size_; 487 488 std::atomic<uint32_t> end_ts_pool_counter_; 489 490 std::atomic<uint64_t*> end_ts_base_addr_; 491 492 DISALLOW_COPY_AND_ASSIGN(GpuAgent); 493 }; 494 495 } // namespace 496 497 #endif // header guard 498