1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // The University of Illinois/NCSA
4 // Open Source License (NCSA)
5 //
6 // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
7 //
8 // Developed by:
9 //
10 //                 AMD Research and AMD HSA Software Development
11 //
12 //                 Advanced Micro Devices, Inc.
13 //
14 //                 www.amd.com
15 //
16 // Permission is hereby granted, free of charge, to any person obtaining a copy
17 // of this software and associated documentation files (the "Software"), to
18 // deal with the Software without restriction, including without limitation
19 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
20 // and/or sell copies of the Software, and to permit persons to whom the
21 // Software is furnished to do so, subject to the following conditions:
22 //
23 //  - Redistributions of source code must retain the above copyright notice,
24 //    this list of conditions and the following disclaimers.
25 //  - Redistributions in binary form must reproduce the above copyright
26 //    notice, this list of conditions and the following disclaimers in
27 //    the documentation and/or other materials provided with the distribution.
28 //  - Neither the names of Advanced Micro Devices, Inc,
29 //    nor the names of its contributors may be used to endorse or promote
30 //    products derived from this Software without specific prior written
31 //    permission.
32 //
33 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
36 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
37 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
38 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
39 // DEALINGS WITH THE SOFTWARE.
40 //
41 ////////////////////////////////////////////////////////////////////////////////
42 
43 // AMD specific HSA backend.
44 
45 #ifndef HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
46 #define HSA_RUNTIME_CORE_INC_AMD_GPU_AGENT_H_
47 
#include <atomic>
#include <map>
#include <memory>
#include <vector>
50 
51 #include "hsakmt.h"
52 
53 #include "core/inc/runtime.h"
54 #include "core/inc/agent.h"
55 #include "core/inc/blit.h"
56 #include "core/inc/signal.h"
57 #include "core/inc/cache.h"
58 #include "core/util/small_heap.h"
59 #include "core/util/locks.h"
60 #include "core/util/lazy_ptr.h"
61 
62 namespace amd {
63 class MemoryRegion;
64 
// @brief Describes a piece of scratch memory carved from an agent's scratch
// pool. Filled in by GpuAgentInt::AcquireQueueScratch and handed back through
// GpuAgentInt::ReleaseQueueScratch.
//
// NOTE(review): the per-field notes below are inferred from the field names
// only; confirm them against the scratch pool implementation.
struct ScratchInfo {
  // Presumably the base address of the queue's scratch allocation.
  void* queue_base;

  // Presumably the total size of the allocation, in bytes.
  size_t size;

  // Presumably the scratch size requested per work item, in bytes.
  size_t size_per_thread;

  // Presumably the offset of queue_base relative to the process scratch base.
  ptrdiff_t queue_process_offset;

  // Presumably set when the allocation is a large (short duration) one.
  bool large;

  // Presumably set when the request could not be satisfied and should be
  // retried later.
  bool retry;
};
74 
75 // @brief Interface to represent a GPU agent.
76 class GpuAgentInt : public core::Agent {
77  public:
78   // @brief Constructor
GpuAgentInt(uint32_t node_id)79   GpuAgentInt(uint32_t node_id)
80       : core::Agent(node_id, core::Agent::DeviceType::kAmdGpuDevice) {}
81 
82   // @brief Ensure blits are ready (performance hint).
PreloadBlits()83   virtual void PreloadBlits() {}
84 
85   // @brief Initialization hook invoked after tools library has loaded,
86   // to allow tools interception of interface functions.
87   //
88   // @retval HSA_STATUS_SUCCESS if initialization is successful.
89   virtual hsa_status_t PostToolsInit() = 0;
90 
91   // @brief Invoke the user provided callback for each region accessible by
92   // this agent.
93   //
94   // @param [in] include_peer If true, the callback will be also invoked on each
95   // peer memory region accessible by this agent. If false, only invoke the
96   // callback on memory region owned by this agent.
97   // @param [in] callback User provided callback function.
98   // @param [in] data User provided pointer as input for @p callback.
99   //
100   // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
101   // region returns ::HSA_STATUS_SUCCESS.
102   virtual hsa_status_t VisitRegion(bool include_peer,
103                                    hsa_status_t (*callback)(hsa_region_t region,
104                                                             void* data),
105                                    void* data) const = 0;
106 
107   // @brief Carve scratch memory from scratch pool.
108   //
109   // @param [in/out] scratch Structure to be populated with the carved memory
110   // information.
111   virtual void AcquireQueueScratch(ScratchInfo& scratch) = 0;
112 
113   // @brief Release scratch memory back to scratch pool.
114   //
115   // @param [in/out] scratch Scratch memory previously acquired with call to
116   // ::AcquireQueueScratch.
117   virtual void ReleaseQueueScratch(ScratchInfo& base) = 0;
118 
119   // @brief Translate the kernel start and end dispatch timestamp from agent
120   // domain to host domain.
121   //
122   // @param [in] signal Pointer to signal that provides the dispatch timing.
123   // @param [out] time Structure to be populated with the host domain value.
124   virtual void TranslateTime(core::Signal* signal,
125                              hsa_amd_profiling_dispatch_time_t& time) = 0;
126 
127   // @brief Translate the async copy start and end timestamp from agent
128   // domain to host domain.
129   //
130   // @param [in] signal Pointer to signal that provides the async copy timing.
131   // @param [out] time Structure to be populated with the host domain value.
TranslateTime(core::Signal * signal,hsa_amd_profiling_async_copy_time_t & time)132   virtual void TranslateTime(core::Signal* signal,
133                              hsa_amd_profiling_async_copy_time_t& time) {
134     return TranslateTime(signal, (hsa_amd_profiling_dispatch_time_t&)time);
135   }
136 
137   // @brief Translate timestamp agent domain to host domain.
138   //
139   // @param [out] time Timestamp in agent domain.
140   virtual uint64_t TranslateTime(uint64_t tick) = 0;
141 
142   // @brief Invalidate caches on the agent which may hold code object data.
143   virtual void InvalidateCodeCaches() = 0;
144 
145   // @brief Sets the coherency type of this agent.
146   //
147   // @param [in] type New coherency type.
148   //
149   // @retval true The new coherency type is set successfuly.
150   virtual bool current_coherency_type(hsa_amd_coherency_type_t type) = 0;
151 
152   // @brief Returns the current coherency type of this agent.
153   //
154   // @retval Coherency type.
155   virtual hsa_amd_coherency_type_t current_coherency_type() const = 0;
156 
157   // @brief Query if agent represent Kaveri GPU.
158   //
159   // @retval true if agent is Kaveri GPU.
160   virtual bool is_kv_device() const = 0;
161 
162   // @brief Query the agent HSA profile.
163   //
164   // @retval HSA profile.
165   virtual hsa_profile_t profile() const = 0;
166 
167   // @brief Query the agent memory bus width in bit.
168   //
169   // @retval Bus width in bit.
170   virtual uint32_t memory_bus_width() const = 0;
171 
172   // @brief Query the agent memory maximum frequency in MHz.
173   //
174   // @retval Bus width in MHz.
175   virtual uint32_t memory_max_frequency() const = 0;
176 };
177 
178 class GpuAgent : public GpuAgentInt {
179  public:
180   // @brief GPU agent constructor.
181   //
182   // @param [in] node Node id. Each CPU in different socket will get distinct
183   // id.
184   // @param [in] node_props Node property.
185   GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props);
186 
187   // @brief GPU agent destructor.
188   ~GpuAgent();
189 
190   // @brief Ensure blits are ready (performance hint).
191   void PreloadBlits() override;
192 
193   // @brief Override from core::Agent.
194   hsa_status_t PostToolsInit() override;
195 
196   uint16_t GetMicrocodeVersion() const;
197 
198   uint16_t GetSdmaMicrocodeVersion() const;
199 
200   // @brief Assembles SP3 shader source into ISA or AQL code object.
201   //
202   // @param [in] src_sp3 SP3 shader source text representation.
203   // @param [in] func_name Name of the SP3 function to assemble.
204   // @param [in] assemble_target ISA or AQL assembly target.
205   // @param [out] code_buf Code object buffer.
206   // @param [out] code_buf_size Size of code object buffer in bytes.
207   enum class AssembleTarget { ISA, AQL };
208 
209   void AssembleShader(const char* src_sp3, const char* func_name,
210                       AssembleTarget assemble_target, void*& code_buf,
211                       size_t& code_buf_size) const;
212 
213   // @brief Frees code object created by AssembleShader.
214   //
215   // @param [in] code_buf Code object buffer.
216   // @param [in] code_buf_size Size of code object buffer in bytes.
217   void ReleaseShader(void* code_buf, size_t code_buf_size) const;
218 
219   // @brief Override from core::Agent.
220   hsa_status_t VisitRegion(bool include_peer,
221                            hsa_status_t (*callback)(hsa_region_t region,
222                                                     void* data),
223                            void* data) const override;
224 
225   // @brief Override from core::Agent.
226   hsa_status_t IterateRegion(hsa_status_t (*callback)(hsa_region_t region,
227                                                       void* data),
228                              void* data) const override;
229 
230   // @brief Override from core::Agent.
231   hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
232                             void* value) const override;
233 
234   // @brief Override from core::Agent.
235   hsa_status_t DmaCopy(void* dst, const void* src, size_t size) override;
236 
237   // @brief Override from core::Agent.
238   hsa_status_t DmaCopy(void* dst, core::Agent& dst_agent, const void* src,
239                        core::Agent& src_agent, size_t size,
240                        std::vector<core::Signal*>& dep_signals,
241                        core::Signal& out_signal) override;
242 
243   // @brief Override from core::Agent.
244   hsa_status_t DmaFill(void* ptr, uint32_t value, size_t count) override;
245 
246   // @brief Get the next available end timestamp object.
247   uint64_t* ObtainEndTsObject();
248 
249   // @brief Override from core::Agent.
250   hsa_status_t GetInfo(hsa_agent_info_t attribute, void* value) const override;
251 
252   // @brief Override from core::Agent.
253   hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type,
254                            core::HsaEventCallback event_callback, void* data,
255                            uint32_t private_segment_size,
256                            uint32_t group_segment_size,
257                            core::Queue** queue) override;
258 
259   // @brief Override from amd::GpuAgentInt.
260   void AcquireQueueScratch(ScratchInfo& scratch) override;
261 
262   // @brief Override from amd::GpuAgentInt.
263   void ReleaseQueueScratch(ScratchInfo& scratch) override;
264 
265   // @brief Register signal for notification when scratch may become available.
266   // @p signal is notified by OR'ing with @p value.
AddScratchNotifier(hsa_signal_t signal,hsa_signal_value_t value)267   void AddScratchNotifier(hsa_signal_t signal, hsa_signal_value_t value) {
268     ScopedAcquire<KernelMutex> lock(&scratch_lock_);
269     scratch_notifiers_[signal] = value;
270   }
271 
272   // @brief Deregister scratch notification signal.
RemoveScratchNotifier(hsa_signal_t signal)273   void RemoveScratchNotifier(hsa_signal_t signal) {
274     ScopedAcquire<KernelMutex> lock(&scratch_lock_);
275     scratch_notifiers_.erase(signal);
276   }
277 
278   // @brief Override from amd::GpuAgentInt.
279   void TranslateTime(core::Signal* signal,
280                      hsa_amd_profiling_dispatch_time_t& time) override;
281 
282   // @brief Override from amd::GpuAgentInt.
283   uint64_t TranslateTime(uint64_t tick) override;
284 
285   // @brief Override from amd::GpuAgentInt.
286   void InvalidateCodeCaches() override;
287 
288   // @brief Override from amd::GpuAgentInt.
289   bool current_coherency_type(hsa_amd_coherency_type_t type) override;
290 
current_coherency_type()291   hsa_amd_coherency_type_t current_coherency_type() const override {
292     return current_coherency_type_;
293   }
294 
295   // Getter & setters.
296 
297   // @brief Returns node property.
properties()298   __forceinline const HsaNodeProperties& properties() const {
299     return properties_;
300   }
301 
302   // @brief Returns number of data caches.
num_cache()303   __forceinline size_t num_cache() const { return cache_props_.size(); }
304 
305   // @brief Returns data cache property.
306   //
307   // @param [in] idx Cache level.
cache_prop(int idx)308   __forceinline const HsaCacheProperties& cache_prop(int idx) const {
309     return cache_props_[idx];
310   }
311 
312   // @brief Override from core::Agent.
regions()313   const std::vector<const core::MemoryRegion*>& regions() const override {
314     return regions_;
315   }
316 
317   // @brief Override from core::Agent.
isa()318   const core::Isa* isa() const override { return isa_; }
319 
320   // @brief Override from amd::GpuAgentInt.
is_kv_device()321   __forceinline bool is_kv_device() const override { return is_kv_device_; }
322 
323   // @brief Override from amd::GpuAgentInt.
profile()324   __forceinline hsa_profile_t profile() const override { return profile_; }
325 
326   // @brief Override from amd::GpuAgentInt.
memory_bus_width()327   __forceinline uint32_t memory_bus_width() const override {
328     return memory_bus_width_;
329   }
330 
331   // @brief Override from amd::GpuAgentInt.
memory_max_frequency()332   __forceinline uint32_t memory_max_frequency() const override {
333     return memory_max_frequency_;
334   }
335 
336  protected:
337   static const uint32_t minAqlSize_ = 0x1000;   // 4KB min
338   static const uint32_t maxAqlSize_ = 0x20000;  // 8MB max
339 
340   // @brief Create a queue through HSA API to allow tools to intercept.
341   core::Queue* CreateInterceptibleQueue();
342 
343   // @brief Create SDMA blit object.
344   //
345   // @retval NULL if SDMA blit creation and initialization failed.
346   core::Blit* CreateBlitSdma(bool h2d);
347 
348   // @brief Create Kernel blit object using provided compute queue.
349   //
350   // @retval NULL if Kernel blit creation and initialization failed.
351   core::Blit* CreateBlitKernel(core::Queue* queue);
352 
353   // @brief Invoke the user provided callback for every region in @p regions.
354   //
355   // @param [in] regions Array of region object.
356   // @param [in] callback User provided callback function.
357   // @param [in] data User provided pointer as input for @p callback.
358   //
359   // @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
360   // region returns ::HSA_STATUS_SUCCESS.
361   hsa_status_t VisitRegion(
362       const std::vector<const core::MemoryRegion*>& regions,
363       hsa_status_t (*callback)(hsa_region_t region, void* data),
364       void* data) const;
365 
366   // @brief Update ::t1_ tick count.
367   void SyncClocks();
368 
369   // @brief Binds the second-level trap handler to this node.
370   void BindTrapHandler();
371 
372   // @brief Override from core::Agent.
373   hsa_status_t EnableDmaProfiling(bool enable) override;
374 
375   // @brief Node properties.
376   const HsaNodeProperties properties_;
377 
378   // @brief Current coherency type.
379   hsa_amd_coherency_type_t current_coherency_type_;
380 
381   // @brief Maximum number of queues that can be created.
382   uint32_t max_queues_;
383 
384   // @brief Object to manage scratch memory.
385   SmallHeap scratch_pool_;
386 
387   // @brief Current short duration scratch memory size.
388   size_t scratch_used_large_;
389 
390   // @brief Notifications for scratch release.
391   std::map<hsa_signal_t, hsa_signal_value_t> scratch_notifiers_;
392 
393   // @brief Default scratch size per queue.
394   size_t queue_scratch_len_;
395 
396   // @brief Default scratch size per work item.
397   size_t scratch_per_thread_;
398 
399   // @brief Blit interfaces for each data path.
400   enum BlitEnum { BlitHostToDev, BlitDevToHost, BlitDevToDev, BlitCount };
401 
402   lazy_ptr<core::Blit> blits_[BlitCount];
403 
404   // @brief AQL queues for cache management and blit compute usage.
405   enum QueueEnum {
406     QueueUtility,   // Cache management and device to {host,device} blit compute
407     QueueBlitOnly,  // Host to device blit
408     QueueCount
409   };
410 
411   lazy_ptr<core::Queue> queues_[QueueCount];
412 
413   // @brief Mutex to protect the update to coherency type.
414   KernelMutex coherency_lock_;
415 
416   // @brief Mutex to protect access to scratch pool.
417   KernelMutex scratch_lock_;
418 
419   // @brief Mutex to protect access to ::t1_.
420   KernelMutex t1_lock_;
421 
422   // @brief Mutex to protect access to blit objects.
423   KernelMutex blit_lock_;
424 
425   // @brief GPU tick on initialization.
426   HsaClockCounters t0_;
427 
428   HsaClockCounters t1_;
429 
430   // @brief Array of GPU cache property.
431   std::vector<HsaCacheProperties> cache_props_;
432 
433   // @brief Array of HSA cache objects.
434   std::vector<std::unique_ptr<core::Cache>> caches_;
435 
436   // @brief Array of regions owned by this agent.
437   std::vector<const core::MemoryRegion*> regions_;
438 
439   MemoryRegion* local_region_;
440 
441   core::Isa* isa_;
442 
443   // @brief HSA profile.
444   hsa_profile_t profile_;
445 
446   bool is_kv_device_;
447 
448   void* trap_code_buf_;
449 
450   size_t trap_code_buf_size_;
451 
452   // @brief The GPU memory bus width in bit.
453   uint32_t memory_bus_width_;
454 
455   // @brief The GPU memory maximum frequency in MHz.
456   uint32_t memory_max_frequency_;
457 
458  private:
459   // @brief Query the driver to get the region list owned by this agent.
460   void InitRegionList();
461 
462   // @brief Reserve memory for scratch pool to be used by AQL queue of this
463   // agent.
464   void InitScratchPool();
465 
466   // @brief Query the driver to get the cache properties.
467   void InitCacheList();
468 
469   // @brief Create internal queues and blits.
470   void InitDma();
471 
472   // @brief Initialize memory pool for end timestamp object.
473   // @retval True if the memory pool for end timestamp object is initialized.
474   bool InitEndTsPool();
475 
476   // @brief Alternative aperture base address. Only on KV.
477   uintptr_t ape1_base_;
478 
479   // @brief Alternative aperture size. Only on KV.
480   size_t ape1_size_;
481 
482   // Each end ts is 32 bytes.
483   static const size_t kTsSize = 32;
484 
485   // Number of element in the pool.
486   uint32_t end_ts_pool_size_;
487 
488   std::atomic<uint32_t> end_ts_pool_counter_;
489 
490   std::atomic<uint64_t*> end_ts_base_addr_;
491 
492   DISALLOW_COPY_AND_ASSIGN(GpuAgent);
493 };
494 
495 }  // namespace
496 
497 #endif  // header guard
498