1 ////////////////////////////////////////////////////////////////////////////////
2 //
3 // The University of Illinois/NCSA
4 // Open Source License (NCSA)
5 //
6 // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
7 //
8 // Developed by:
9 //
10 //                 AMD Research and AMD HSA Software Development
11 //
12 //                 Advanced Micro Devices, Inc.
13 //
14 //                 www.amd.com
15 //
16 // Permission is hereby granted, free of charge, to any person obtaining a copy
17 // of this software and associated documentation files (the "Software"), to
18 // deal with the Software without restriction, including without limitation
19 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
20 // and/or sell copies of the Software, and to permit persons to whom the
21 // Software is furnished to do so, subject to the following conditions:
22 //
23 //  - Redistributions of source code must retain the above copyright notice,
24 //    this list of conditions and the following disclaimers.
25 //  - Redistributions in binary form must reproduce the above copyright
26 //    notice, this list of conditions and the following disclaimers in
27 //    the documentation and/or other materials provided with the distribution.
28 //  - Neither the names of Advanced Micro Devices, Inc,
29 //    nor the names of its contributors may be used to endorse or promote
30 //    products derived from this Software without specific prior written
31 //    permission.
32 //
33 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
34 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
35 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
36 // THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
37 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
38 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
39 // DEALINGS WITH THE SOFTWARE.
40 //
41 ////////////////////////////////////////////////////////////////////////////////
42 
43 #ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_
44 #define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_
45 
46 #include <map>
47 #include <mutex>
48 #include <stdint.h>
49 
50 #include "core/inc/blit.h"
51 
52 namespace amd {
53 class BlitKernel : public core::Blit {
54  public:
55   explicit BlitKernel(core::Queue* queue);
56   virtual ~BlitKernel() override;
57 
58   /// @brief Initialize a blit kernel object.
59   ///
60   /// @param agent Pointer to the agent that will execute the AQL packets.
61   ///
62   /// @return hsa_status_t
63   virtual hsa_status_t Initialize(const core::Agent& agent) override;
64 
65   /// @brief Marks the blit kernel object as invalid and uncouples its link with
66   /// the underlying AQL kernel queue. Use of the blit object
67   /// once it has been release is illegal and any behavior is indeterminate
68   ///
69   /// @note: The call will block until all AQL packets have been executed.
70   ///
71   /// @param agent Agent passed to Initialize.
72   ///
73   /// @return hsa_status_t
74   virtual hsa_status_t Destroy(const core::Agent& agent) override;
75 
76   /// @brief Submit an AQL packet to perform vector copy. The call is blocking
77   /// until the command execution is finished.
78   ///
79   /// @param dst Memory address of the copy destination.
80   /// @param src Memory address of the copy source.
81   /// @param size Size of the data to be copied.
82   virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src,
83                                                size_t size) override;
84 
85   /// @brief Submit a linear copy command to the the underlying compute device's
86   /// control block. The call is non blocking. The memory transfer will start
87   /// after all dependent signals are satisfied. After the transfer is
88   /// completed, the out signal will be decremented.
89   ///
90   /// @param dst Memory address of the copy destination.
91   /// @param src Memory address of the copy source.
92   /// @param size Size of the data to be copied.
93   /// @param dep_signals Arrays of dependent signal.
94   /// @param out_signal Output signal.
95   virtual hsa_status_t SubmitLinearCopyCommand(
96       void* dst, const void* src, size_t size,
97       std::vector<core::Signal*>& dep_signals,
98       core::Signal& out_signal) override;
99 
100   /// @brief Submit an AQL packet to perform memory fill. The call is blocking
101   /// until the command execution is finished.
102   ///
103   /// @param ptr Memory address of the fill destination.
104   /// @param value Value to be set.
105   /// @param count Number of uint32_t element to be set to the value.
106   virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value,
107                                                size_t count) override;
108 
109   virtual hsa_status_t EnableProfiling(bool enable) override;
110 
111  private:
112   union KernelArgs {
113     struct __ALIGNED__(16) {
114       uint64_t phase1_src_start;
115       uint64_t phase1_dst_start;
116       uint64_t phase2_src_start;
117       uint64_t phase2_dst_start;
118       uint64_t phase3_src_start;
119       uint64_t phase3_dst_start;
120       uint64_t phase4_src_start;
121       uint64_t phase4_dst_start;
122       uint64_t phase4_src_end;
123       uint64_t phase4_dst_end;
124       uint32_t num_workitems;
125     } copy_aligned;
126 
127     struct __ALIGNED__(16) {
128       uint64_t phase1_src_start;
129       uint64_t phase1_dst_start;
130       uint64_t phase2_src_start;
131       uint64_t phase2_dst_start;
132       uint64_t phase2_src_end;
133       uint64_t phase2_dst_end;
134       uint32_t num_workitems;
135     } copy_misaligned;
136 
137     struct __ALIGNED__(16) {
138       uint64_t phase1_dst_start;
139       uint64_t phase2_dst_start;
140       uint64_t phase2_dst_end;
141       uint32_t fill_value;
142       uint32_t num_workitems;
143     } fill;
144   };
145 
146   /// Reserve a slot in the queue buffer. The call will wait until the queue
147   /// buffer has a room.
148   uint64_t AcquireWriteIndex(uint32_t num_packet);
149 
150   /// Update the queue doorbell register with ::write_index. This
151   /// function also serializes concurrent doorbell update to ensure that the
152   /// packet processor doesn't get invalid packet.
153   void ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet);
154 
155   void PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
156                      uint32_t grid_size_x, hsa_signal_t completion_signal);
157 
158   KernelArgs* ObtainAsyncKernelCopyArg();
159 
160   /// AQL code object and size for each kernel.
161   enum class KernelType {
162     CopyAligned,
163     CopyMisaligned,
164     Fill,
165   };
166 
167   struct KernelCode {
168     void* code_buf_;
169     size_t code_buf_size_;
170   };
171 
172   std::map<KernelType, KernelCode> kernels_;
173 
174   /// AQL queue for submitting the vector copy kernel.
175   core::Queue* queue_;
176   uint32_t queue_bitmask_;
177 
178   /// Pointer to the kernel argument buffer.
179   KernelArgs* kernarg_async_;
180   uint32_t kernarg_async_mask_;
181   volatile uint32_t kernarg_async_counter_;
182 
183   /// Completion signal for every kernel dispatched.
184   hsa_signal_t completion_signal_;
185 
186   /// Lock to synchronize access to kernarg_ and completion_signal_
187   std::mutex lock_;
188 
189   /// Number of CUs on the underlying agent.
190   int num_cus_;
191 };
192 }  // namespace amd
193 
194 #endif  // header guard
195