////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimers in
// the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
// nor the names of its contributors may be used to endorse or promote
// products derived from this Software without specific prior written
// permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
40 // 41 //////////////////////////////////////////////////////////////////////////////// 42 43 #ifndef HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ 44 #define HSA_RUNTIME_CORE_INC_AMD_BLIT_KERNEL_H_ 45 46 #include <map> 47 #include <mutex> 48 #include <stdint.h> 49 50 #include "core/inc/blit.h" 51 52 namespace amd { 53 class BlitKernel : public core::Blit { 54 public: 55 explicit BlitKernel(core::Queue* queue); 56 virtual ~BlitKernel() override; 57 58 /// @brief Initialize a blit kernel object. 59 /// 60 /// @param agent Pointer to the agent that will execute the AQL packets. 61 /// 62 /// @return hsa_status_t 63 virtual hsa_status_t Initialize(const core::Agent& agent) override; 64 65 /// @brief Marks the blit kernel object as invalid and uncouples its link with 66 /// the underlying AQL kernel queue. Use of the blit object 67 /// once it has been release is illegal and any behavior is indeterminate 68 /// 69 /// @note: The call will block until all AQL packets have been executed. 70 /// 71 /// @param agent Agent passed to Initialize. 72 /// 73 /// @return hsa_status_t 74 virtual hsa_status_t Destroy(const core::Agent& agent) override; 75 76 /// @brief Submit an AQL packet to perform vector copy. The call is blocking 77 /// until the command execution is finished. 78 /// 79 /// @param dst Memory address of the copy destination. 80 /// @param src Memory address of the copy source. 81 /// @param size Size of the data to be copied. 82 virtual hsa_status_t SubmitLinearCopyCommand(void* dst, const void* src, 83 size_t size) override; 84 85 /// @brief Submit a linear copy command to the the underlying compute device's 86 /// control block. The call is non blocking. The memory transfer will start 87 /// after all dependent signals are satisfied. After the transfer is 88 /// completed, the out signal will be decremented. 89 /// 90 /// @param dst Memory address of the copy destination. 91 /// @param src Memory address of the copy source. 
92 /// @param size Size of the data to be copied. 93 /// @param dep_signals Arrays of dependent signal. 94 /// @param out_signal Output signal. 95 virtual hsa_status_t SubmitLinearCopyCommand( 96 void* dst, const void* src, size_t size, 97 std::vector<core::Signal*>& dep_signals, 98 core::Signal& out_signal) override; 99 100 /// @brief Submit an AQL packet to perform memory fill. The call is blocking 101 /// until the command execution is finished. 102 /// 103 /// @param ptr Memory address of the fill destination. 104 /// @param value Value to be set. 105 /// @param count Number of uint32_t element to be set to the value. 106 virtual hsa_status_t SubmitLinearFillCommand(void* ptr, uint32_t value, 107 size_t count) override; 108 109 virtual hsa_status_t EnableProfiling(bool enable) override; 110 111 private: 112 union KernelArgs { 113 struct __ALIGNED__(16) { 114 uint64_t phase1_src_start; 115 uint64_t phase1_dst_start; 116 uint64_t phase2_src_start; 117 uint64_t phase2_dst_start; 118 uint64_t phase3_src_start; 119 uint64_t phase3_dst_start; 120 uint64_t phase4_src_start; 121 uint64_t phase4_dst_start; 122 uint64_t phase4_src_end; 123 uint64_t phase4_dst_end; 124 uint32_t num_workitems; 125 } copy_aligned; 126 127 struct __ALIGNED__(16) { 128 uint64_t phase1_src_start; 129 uint64_t phase1_dst_start; 130 uint64_t phase2_src_start; 131 uint64_t phase2_dst_start; 132 uint64_t phase2_src_end; 133 uint64_t phase2_dst_end; 134 uint32_t num_workitems; 135 } copy_misaligned; 136 137 struct __ALIGNED__(16) { 138 uint64_t phase1_dst_start; 139 uint64_t phase2_dst_start; 140 uint64_t phase2_dst_end; 141 uint32_t fill_value; 142 uint32_t num_workitems; 143 } fill; 144 }; 145 146 /// Reserve a slot in the queue buffer. The call will wait until the queue 147 /// buffer has a room. 148 uint64_t AcquireWriteIndex(uint32_t num_packet); 149 150 /// Update the queue doorbell register with ::write_index. 
This 151 /// function also serializes concurrent doorbell update to ensure that the 152 /// packet processor doesn't get invalid packet. 153 void ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet); 154 155 void PopulateQueue(uint64_t index, uint64_t code_handle, void* args, 156 uint32_t grid_size_x, hsa_signal_t completion_signal); 157 158 KernelArgs* ObtainAsyncKernelCopyArg(); 159 160 /// AQL code object and size for each kernel. 161 enum class KernelType { 162 CopyAligned, 163 CopyMisaligned, 164 Fill, 165 }; 166 167 struct KernelCode { 168 void* code_buf_; 169 size_t code_buf_size_; 170 }; 171 172 std::map<KernelType, KernelCode> kernels_; 173 174 /// AQL queue for submitting the vector copy kernel. 175 core::Queue* queue_; 176 uint32_t queue_bitmask_; 177 178 /// Pointer to the kernel argument buffer. 179 KernelArgs* kernarg_async_; 180 uint32_t kernarg_async_mask_; 181 volatile uint32_t kernarg_async_counter_; 182 183 /// Completion signal for every kernel dispatched. 184 hsa_signal_t completion_signal_; 185 186 /// Lock to synchronize access to kernarg_ and completion_signal_ 187 std::mutex lock_; 188 189 /// Number of CUs on the underlying agent. 190 int num_cus_; 191 }; 192 } // namespace amd 193 194 #endif // header guard 195