1 /*===--------------------------------------------------------------------------
2  *              ATMI (Asynchronous Task and Memory Interface)
3  *
4  * This file is distributed under the MIT License. See LICENSE.txt for details.
5  *===------------------------------------------------------------------------*/
6 #ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
7 #define SRC_RUNTIME_INCLUDE_INTERNAL_H_
8 #include <inttypes.h>
9 #include <pthread.h>
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 
15 #include <atomic>
16 #include <cstring>
17 #include <deque>
18 #include <map>
19 #include <queue>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "hsa.h"
25 #include "hsa_ext_amd.h"
26 #include "hsa_ext_finalize.h"
27 
28 #include "atmi.h"
29 #include "atmi_runtime.h"
30 #include "rt.h"
31 
// 1024 * 16 = 16384. Presumably the upper bound on the number of kernels the
// runtime tracks; nothing in this header uses it, so confirm at the use sites.
#define MAX_NUM_KERNELS (1024 * 16)
33 
// Implicit (runtime-supplied) kernel argument record.
// Field order and types define the memory layout, so they must not be
// reordered or resized. The two `char` counters are each followed by an
// `unsigned long`, so the compiler inserts padding after them.
// NOTE(review): the `*_ptr` fields and `cpu_worker_signals` presumably carry
// addresses stored as integers -- confirm against the code that fills them.
struct atmi_implicit_args_s {
  // Three per-dimension offsets (x, y, z).
  unsigned long offset_x;
  unsigned long offset_y;
  unsigned long offset_z;
  unsigned long hostcall_ptr;
  // GPU queue count followed by the GPU queue pointer value.
  char num_gpu_queues;
  unsigned long gpu_queue_ptr;
  // CPU queue count followed by the CPU-side pointer values.
  char num_cpu_queues;
  unsigned long cpu_worker_signals;
  unsigned long cpu_queue_ptr;
  unsigned long kernarg_template_ptr;
};
typedef struct atmi_implicit_args_s atmi_implicit_args_t;
46 
47 #ifdef __cplusplus
48 extern "C" {
49 #endif
50 
/* check(msg, status): if `status` is not HSA_STATUS_SUCCESS, print
 * "<msg> failed." to stdout and terminate the process with exit(1).
 *   msg    - token(s) that are stringized for the message; never evaluated.
 *   status - expression compared against HSA_STATUS_SUCCESS. It is
 *            parenthesized so that low-precedence operators inside it
 *            (e.g. `a ^ b`, `c ? x : y`) are compared as a whole.
 * The empty `else` keeps an `else` written after the macro from binding to the
 * macro's own `if` (same guard as ErrorCheck uses). */
#define check(msg, status)                                                     \
  if ((status) != HSA_STATUS_SUCCESS) {                                        \
    printf("%s failed.\n", #msg);                                              \
    exit(1);                                                                   \
  } else {                                                                     \
  }
56 
/* DEBUG_PRINT(fmt, ...): printf-style diagnostic.
 *  - With DEBUG defined: when core::Runtime::getInstance().getDebugMode() is
 *    true, writes "[file:line] " followed by the formatted text to stderr.
 *    `fmt` must be a string literal because it is concatenated with the
 *    prefix.
 *  - Without DEBUG: expands to an empty statement; arguments are not
 *    evaluated.
 * Both variants are a single `do { } while (false)` statement, so the macro
 * behaves the same in either build when used as `DEBUG_PRINT(...);` inside an
 * unbraced if/else. */
#ifdef DEBUG
#define DEBUG_PRINT(fmt, ...)                                                  \
  do {                                                                         \
    if (core::Runtime::getInstance().getDebugMode()) {                         \
      fprintf(stderr, "[%s:%d] " fmt, __FILE__, __LINE__, ##__VA_ARGS__);      \
    }                                                                          \
  } while (false)
#else
#define DEBUG_PRINT(...)                                                       \
  do {                                                                         \
  } while (false)
#endif
67 
/* Fallback definition of hsa_signal_t: compiled only when the
 * HSA_RUNTIME_INC_HSA_H_ guard macro is not defined. It is a struct holding a
 * single 64-bit handle. */
#ifndef HSA_RUNTIME_INC_HSA_H_
struct hsa_signal_s {
  uint64_t handle;
};
typedef struct hsa_signal_s hsa_signal_t;
#endif
73 
/* All global values live in this one structure. At present it holds four
 * boolean flags; judging by their names each records that one initialization
 * stage has completed (NOTE(review): confirm where they are set). */
struct atl_context_s {
  bool struct_initialized;
  bool g_hsa_initialized;
  bool g_gpu_initialized;
  bool g_tasks_initialized;
};
typedef struct atl_context_s atl_context_t;

/* The context object and a pointer to a context; both are defined outside this
 * header. */
extern atl_context_t atlc;
extern atl_context_t *atlc_p;
83 
84 #ifdef __cplusplus
85 }
86 #endif
87 
88 /* ---------------------------------------------------------------------------------
89  * Simulated CPU Data Structures and API
90  * ---------------------------------------------------------------------------------
91  */
92 
// Alias for HSA_WAIT_STATE_BLOCKED.
// NOTE(review): presumably the wait-state hint passed to HSA signal waits;
// no use is visible in this header.
#define ATMI_WAIT_STATE HSA_WAIT_STATE_BLOCKED
94 
95 // ---------------------- Kernel Start -------------
// Per-kernel record: a 64-bit kernel object value, three segment sizes, the
// argument count, and three per-argument vectors.
// NOTE(review): the vectors presumably hold one entry per argument (num_args
// entries each) -- confirm where the record is populated.
struct atl_kernel_info_s {
  uint64_t kernel_object;
  // Segment sizes.
  uint32_t group_segment_size;
  uint32_t private_segment_size;
  uint32_t kernel_segment_size;
  // Argument description.
  uint32_t num_args;
  std::vector<uint64_t> arg_alignments;
  std::vector<uint64_t> arg_offsets;
  std::vector<uint64_t> arg_sizes;
};
typedef struct atl_kernel_info_s atl_kernel_info_t;
106 
// Symbol record: a 64-bit address and a 32-bit size.
struct atl_symbol_info_s {
  uint64_t addr;
  uint32_t size;
};
typedef struct atl_symbol_info_s atl_symbol_info_t;
111 
// Lookup tables defined outside this header. Each is a vector of maps from a
// name string to a kernel-info or symbol-info record.
// NOTE(review): the vector index is presumably a device/GPU id -- confirm at
// the definition.
extern std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
extern std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
114 
115 // ---------------------- Kernel End -------------
116 
// Timestamp defined outside this header.
// NOTE(review): by its name it records when the context was initialized --
// confirm where it is written.
extern struct timespec context_init_time;
118 
// Forward declarations only: these core classes are defined elsewhere.
// Declaring them here lets code name the types through pointers or references
// without the full class definitions.
namespace core {
class TaskgroupImpl;
class TaskImpl;
class Kernel;
class KernelImpl;
} // namespace core
125 
126 struct SignalPoolT {
SignalPoolTSignalPoolT127   SignalPoolT() {
128     // If no signals are created, and none can be created later,
129     // will ultimately fail at pop()
130 
131     unsigned N = 1024; // default max pool size from atmi
132     for (unsigned i = 0; i < N; i++) {
133       hsa_signal_t new_signal;
134       hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
135       if (err != HSA_STATUS_SUCCESS) {
136         break;
137       }
138       state.push(new_signal);
139     }
140     DEBUG_PRINT("Signal Pool Initial Size: %lu\n", state.size());
141   }
142   SignalPoolT(const SignalPoolT &) = delete;
143   SignalPoolT(SignalPoolT &&) = delete;
~SignalPoolTSignalPoolT144   ~SignalPoolT() {
145     size_t N = state.size();
146     for (size_t i = 0; i < N; i++) {
147       hsa_signal_t signal = state.front();
148       state.pop();
149       hsa_status_t rc = hsa_signal_destroy(signal);
150       if (rc != HSA_STATUS_SUCCESS) {
151         DEBUG_PRINT("Signal pool destruction failed\n");
152       }
153     }
154   }
sizeSignalPoolT155   size_t size() {
156     lock l(&mutex);
157     return state.size();
158   }
pushSignalPoolT159   void push(hsa_signal_t s) {
160     lock l(&mutex);
161     state.push(s);
162   }
popSignalPoolT163   hsa_signal_t pop(void) {
164     lock l(&mutex);
165     if (!state.empty()) {
166       hsa_signal_t res = state.front();
167       state.pop();
168       return res;
169     }
170 
171     // Pool empty, attempt to create another signal
172     hsa_signal_t new_signal;
173     hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
174     if (err == HSA_STATUS_SUCCESS) {
175       return new_signal;
176     }
177 
178     // Fail
179     return {0};
180   }
181 
182 private:
183   static pthread_mutex_t mutex;
184   std::queue<hsa_signal_t> state;
185   struct lock {
lockSignalPoolT::lock186     lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
~lockSignalPoolT::lock187     ~lock() { pthread_mutex_unlock(m); }
188     pthread_mutex_t *m;
189   };
190 };
191 
// List of HSA memory pools, defined outside this header.
// NOTE(review): by its name these are the GPU kernel-argument pools; the
// indexing (per GPU?) is not visible here -- confirm at the definition.
extern std::vector<hsa_amd_memory_pool_t> atl_gpu_kernarg_pools;
193 
194 namespace core {
// Declared here, defined elsewhere; takes no arguments and reports an
// atmi_status_t. NOTE(review): presumably initializes the GPU part of the ATMI
// context -- confirm at the definition.
atmi_status_t atl_init_gpu_context();

// Declared here, defined elsewhere; both take no arguments and report an
// hsa_status_t. NOTE(review): presumably HSA runtime start-up and shutdown --
// confirm at the definitions.
hsa_status_t init_hsa();
hsa_status_t finalize_hsa();
199 /*
200  * Generic utils
201  */
// Alignment helpers. All four clear or round past the low address bits with a
// mask built from `alignment - 1`, so the result is a multiple of `alignment`
// only when `alignment` is a power of two (the precondition for every
// overload).

// Rounds the integral `value` down to a multiple of `alignment`.
template <typename T> inline T alignDown(T value, size_t alignment) {
  return static_cast<T>(value & ~(alignment - 1));
}

// Rounds the address held in `value` down to a multiple of `alignment` bytes.
template <typename T> inline T *alignDown(T *value, size_t alignment) {
  return reinterpret_cast<T *>(
      alignDown(reinterpret_cast<uintptr_t>(value), alignment));
}

// Rounds the integral `value` up to a multiple of `alignment`. The
// intermediate `value + alignment - 1` is narrowed back to T before masking,
// so values within `alignment - 1` of T's maximum wrap around.
template <typename T> inline T alignUp(T value, size_t alignment) {
  return alignDown(static_cast<T>(value + alignment - 1), alignment);
}

// Rounds the address held in `value` up to a multiple of `alignment` bytes.
template <typename T> inline T *alignUp(T *value, size_t alignment) {
  // The bias is added to the integer address, not to the T*: pointer
  // arithmetic would advance by (alignment - 1) * sizeof(T) bytes and overshoot
  // the next boundary for any T wider than one byte.
  return reinterpret_cast<T *>(alignDown(
      reinterpret_cast<uintptr_t>(value) + (alignment - 1), alignment));
}
218 
// The three functions below are declared here and defined elsewhere.

// Takes an address, a byte size and a memory place.
// NOTE(review): presumably records the allocation for later lookup -- confirm.
extern void register_allocation(void *addr, size_t size,
                                atmi_mem_place_t place);
// Returns the HSA memory pool corresponding to `place`.
extern hsa_amd_memory_pool_t
get_memory_pool_by_mem_place(atmi_mem_place_t place);
// NOTE(review): by its name, reports whether ATMI has been initialized.
extern bool atl_is_atmi_initialized();
224 
// Takes a signal value and an opaque argument and returns bool.
// NOTE(review): the shape matches a signal-handler callback; what the return
// value means to the caller is not visible here -- confirm at the definition.
bool handle_group_signal(hsa_signal_value_t value, void *arg);

// NOTE(review): by its name, writes `header` and `rest` into the packet word at
// `packet` with release ordering -- confirm at the definition.
void packet_store_release(uint32_t *packet, uint16_t header, uint16_t rest);
// Builds a 16-bit packet header value from the packet type, a barrier flag and
// acquire/release fence scopes. Both scopes default to
// ATMI_FENCE_SCOPE_SYSTEM when omitted.
uint16_t
create_header(hsa_packet_type_t type, int barrier,
              atmi_task_fence_scope_t acq_fence = ATMI_FENCE_SCOPE_SYSTEM,
              atmi_task_fence_scope_t rel_fence = ATMI_FENCE_SCOPE_SYSTEM);

// NOTE(review): by its name, makes the memory at `ptr` accessible to every GPU
// agent -- confirm at the definition.
void allow_access_to_all_gpu_agents(void *ptr);
234 } // namespace core
235 
// Map an HSA / ATMI status code to a C string. The error-check macros in this
// header print that string with "%s". Both are defined outside this header.
const char *get_error_string(hsa_status_t err);
const char *get_atmi_error_string(atmi_status_t err);
238 
/* ATMIErrorCheck(msg, status): if `status` is not ATMI_STATUS_SUCCESS, print
 * "[file:line] <msg> failed: <error string>" to stdout and call exit(1).
 *   msg    - token(s) stringized into the message; never evaluated.
 *   status - atmi_status_t expression. It is evaluated exactly once into a
 *            local, so passing a function call does not run the call a second
 *            time while the message is built.
 * Expands to a single compound statement; a trailing `;` is optional. */
#define ATMIErrorCheck(msg, status)                                            \
  {                                                                            \
    const atmi_status_t atmi_checked_status_ = (status);                       \
    if (atmi_checked_status_ != ATMI_STATUS_SUCCESS) {                         \
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg,              \
             get_atmi_error_string(atmi_checked_status_));                     \
      exit(1);                                                                 \
    }                                                                          \
  }
247 
/* ErrorCheck(msg, status): if `status` is not HSA_STATUS_SUCCESS, print
 * "[file:line] <msg> failed: <error string>" to stdout and call exit(1).
 *   msg    - token(s) stringized into the message; never evaluated.
 *   status - hsa_status_t expression. It is evaluated exactly once into a
 *            local, so passing a function call does not run the call a second
 *            time while the message is built.
 * Expands to a single compound statement; a trailing `;` is optional. */
#define ErrorCheck(msg, status)                                                \
  {                                                                            \
    const hsa_status_t atmi_checked_status_ = (status);                        \
    if (atmi_checked_status_ != HSA_STATUS_SUCCESS) {                          \
      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg,              \
             get_error_string(atmi_checked_status_));                          \
      exit(1);                                                                 \
    }                                                                          \
  }
256 
/* ErrorCheckAndContinue(msg, status): for use inside a loop body. If `status`
 * is not HSA_STATUS_SUCCESS, report the failure through DEBUG_PRINT and
 * `continue` with the next iteration of the enclosing loop.
 *   msg    - token(s) stringized into the message; never evaluated.
 *   status - hsa_status_t expression. It is evaluated exactly once into a
 *            local, so passing a function call does not run the call again
 *            when the message is built.
 * A plain brace block is used instead of do/while so that `continue` targets
 * the caller's loop. A trailing `;` is optional. */
#define ErrorCheckAndContinue(msg, status)                                     \
  {                                                                            \
    const hsa_status_t atmi_checked_status_ = (status);                        \
    if (atmi_checked_status_ != HSA_STATUS_SUCCESS) {                          \
      DEBUG_PRINT("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, #msg,         \
                  get_error_string(atmi_checked_status_));                     \
      continue;                                                                \
    }                                                                          \
  }
265 
266 #endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
267