////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
//                 AMD Research and AMD HSA Software Development
//
//                 Advanced Micro Devices, Inc.
//
//                 www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//  - Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimers.
//  - Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimers in
//    the documentation and/or other materials provided with the distribution.
//  - Neither the names of Advanced Micro Devices, Inc,
//    nor the names of its contributors may be used to endorse or promote
//    products derived from this Software without specific prior written
//    permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#include "core/inc/amd_blit_kernel.h"

#include <algorithm>
#include <sstream>
#include <string>

#include "core/inc/amd_gpu_agent.h"
#include "core/inc/hsa_internal.h"
#include "core/util/utils.h"

namespace amd {
static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID;

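// Shader source for the blit kernels, written in the assembler dialect
// consumed by GpuAgent::AssembleShader and specialized per GFXIP at runtime.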
static std::string kBlitKernelSource(R"(
  // Compatibility function for GFXIP 7.
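  // GFXIP 7 encodes s_load offsets in dwords, so byte offsets are divided by 4 there.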

  function s_load_dword_offset(byte_offset)
    if kGFXIPVersion == 7
      return byte_offset / 4
    else
      return byte_offset
    end
  end

  // Memory copy for all cases except:
  //  (src_addr & 0x3) != (dst_addr & 0x3)
  //
  // Kernel argument buffer:
  //   [DW  0, 1]  Phase 1 src start address
  //   [DW  2, 3]  Phase 1 dst start address
  //   [DW  4, 5]  Phase 2 src start address
  //   [DW  6, 7]  Phase 2 dst start address
  //   [DW  8, 9]  Phase 3 src start address
  //   [DW 10,11]  Phase 3 dst start address
  //   [DW 12,13]  Phase 4 src start address
  //   [DW 14,15]  Phase 4 dst start address
  //   [DW 16,17]  Phase 4 src end address
  //   [DW 18,19]  Phase 4 dst end address
  //   [DW 20   ]  Total number of workitems

  var kCopyAlignedVecWidth = 4
  var kCopyAlignedUnroll = 1

  shader CopyAligned
    type(CS)
    user_sgpr_count(2)
    sgpr_count(32)
    vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))

    // Retrieve kernel arguments.
    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
    s_load_dwordx4          s[12:15], s[0:1], s_load_dword_offset(0x20)
    s_load_dwordx4          s[16:19], s[0:1], s_load_dword_offset(0x30)
    s_load_dwordx4          s[20:23], s[0:1], s_load_dword_offset(0x40)
    s_load_dword            s24, s[0:1], s_load_dword_offset(0x50)
    s_waitcnt               lgkmcnt(0)

    // Compute workitem id.
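    // s2 holds the workgroup id; each workgroup is 64 lanes wide (see
    // workgroup_size_x in PopulateQueue), so v0 becomes the flat workitem id.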
    s_lshl_b32              s2, s2, 0x6
    v_add_u32               v0, vcc, s2, v0

    // =====================================================
    // Phase 1: Byte copy up to 0x100 destination alignment.
    // =====================================================

    // Compute phase source address.
    v_mov_b32               v3, s5
    v_add_u32               v2, vcc, v0, s4
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Compute phase destination address.
    v_mov_b32               v5, s7
    v_add_u32               v4, vcc, v0, s6
    v_addc_u32              v5, vcc, v5, 0x0, vcc

  L_COPY_ALIGNED_PHASE_1_LOOP:
    // Mask off lanes (or branch out) after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_1_DONE
    s_and_b64               exec, exec, vcc

    // Load from/advance the source address.
    flat_load_ubyte         v1, v[2:3]
    s_waitcnt               vmcnt(0)
    v_add_u32               v2, vcc, v2, s24
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Write to/advance the destination address.
    flat_store_byte         v[4:5], v1
    v_add_u32               v4, vcc, v4, s24
    v_addc_u32              v5, vcc, v5, 0x0, vcc

    // Repeat until branched out.
    s_branch                L_COPY_ALIGNED_PHASE_1_LOOP

  L_COPY_ALIGNED_PHASE_1_DONE:
    // Restore EXEC mask for all lanes.
    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF

    // ========================================================
    // Phase 2: Unrolled dword[x4] copy up to last whole block.
    // ========================================================

    // Compute unrolled dword[x4] stride across all threads.
    if kCopyAlignedVecWidth == 4
      s_lshl_b32            s25, s24, 0x4
    else
      s_lshl_b32            s25, s24, 0x2
    end
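    // s25 = workitems * bytes moved per lane per iteration (16 for dwordx4, 4 for dword).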

    // Compute phase source address.
    if kCopyAlignedVecWidth == 4
      v_lshlrev_b32         v1, 0x4, v0
    else
      v_lshlrev_b32         v1, 0x2, v0
    end

    v_mov_b32               v3, s9
    v_add_u32               v2, vcc, v1, s8
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Compute phase destination address.
    v_mov_b32               v5, s11
    v_add_u32               v4, vcc, v1, s10
    v_addc_u32              v5, vcc, v5, 0x0, vcc

  L_COPY_ALIGNED_PHASE_2_LOOP:
    // Branch out after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[12:13]
    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_2_DONE

    // Load from/advance the source address.
    for var i = 0; i < kCopyAlignedUnroll; i ++
      if kCopyAlignedVecWidth == 4
        flat_load_dwordx4   v[8 + (i * 4)], v[2:3]
      else
        flat_load_dword     v[8 + i], v[2:3]
      end

      v_add_u32             v2, vcc, v2, s25
      v_addc_u32            v3, vcc, v3, 0x0, vcc
    end

    // Write to/advance the destination address.
    s_waitcnt               vmcnt(0)

    for var i = 0; i < kCopyAlignedUnroll; i ++
      if kCopyAlignedVecWidth == 4
        flat_store_dwordx4  v[4:5], v[8 + (i * 4)]
      else
        flat_store_dword    v[4:5], v[8 + i]
      end

      v_add_u32             v4, vcc, v4, s25
      v_addc_u32            v5, vcc, v5, 0x0, vcc
    end

    // Repeat until branched out.
    s_branch                L_COPY_ALIGNED_PHASE_2_LOOP

  L_COPY_ALIGNED_PHASE_2_DONE:

    // ===========================================
    // Phase 3: Dword copy up to last whole dword.
    // ===========================================

    // Compute dword stride across all threads.
    s_lshl_b32              s25, s24, 0x2

    // Compute phase source address.
    v_lshlrev_b32           v1, 0x2, v0
    v_mov_b32               v3, s13
    v_add_u32               v2, vcc, v1, s12
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Compute phase destination address.
    v_mov_b32               v5, s15
    v_add_u32               v4, vcc, v1, s14
    v_addc_u32              v5, vcc, v5, 0x0, vcc

  L_COPY_ALIGNED_PHASE_3_LOOP:
    // Mask off lanes (or branch out) after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[16:17]
    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_3_DONE
    s_and_b64               exec, exec, vcc

    // Load from/advance the source address.
    flat_load_dword         v1, v[2:3]
    v_add_u32               v2, vcc, v2, s25
    v_addc_u32              v3, vcc, v3, 0x0, vcc
    s_waitcnt               vmcnt(0)

    // Write to/advance the destination address.
    flat_store_dword        v[4:5], v1
    v_add_u32               v4, vcc, v4, s25
    v_addc_u32              v5, vcc, v5, 0x0, vcc

    // Repeat until branched out.
    s_branch                L_COPY_ALIGNED_PHASE_3_LOOP

  L_COPY_ALIGNED_PHASE_3_DONE:
    // Restore EXEC mask for all lanes.
    s_mov_b64               exec, 0xFFFFFFFFFFFFFFFF

    // =============================
    // Phase 4: Byte copy up to end.
    // =============================

    // Compute phase source address.
    v_mov_b32               v3, s17
    v_add_u32               v2, vcc, v0, s16
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Compute phase destination address.
    v_mov_b32               v5, s19
    v_add_u32               v4, vcc, v0, s18
    v_addc_u32              v5, vcc, v5, 0x0, vcc

    // Mask off lanes (or branch out) after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[20:21]
    s_cbranch_vccz          L_COPY_ALIGNED_PHASE_4_DONE
    s_and_b64               exec, exec, vcc

    // Load from the source address.
    flat_load_ubyte         v1, v[2:3]
    s_waitcnt               vmcnt(0)

    // Write to the destination address.
    flat_store_byte         v[4:5], v1

  L_COPY_ALIGNED_PHASE_4_DONE:
    s_endpgm
  end

  // Memory copy for this case:
  //  (src_addr & 0x3) != (dst_addr & 0x3)
  //
  // Kernel argument buffer:
  //   [DW  0, 1]  Phase 1 src start address
  //   [DW  2, 3]  Phase 1 dst start address
  //   [DW  4, 5]  Phase 2 src start address
  //   [DW  6, 7]  Phase 2 dst start address
  //   [DW  8, 9]  Phase 2 src end address
  //   [DW 10,11]  Phase 2 dst end address
  //   [DW 12   ]  Total number of workitems

  var kCopyMisalignedUnroll = 4

  shader CopyMisaligned
    type(CS)
    user_sgpr_count(2)
    sgpr_count(23)
    vgpr_count(6 + kCopyMisalignedUnroll)

    // Retrieve kernel arguments.
    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
    s_load_dwordx4          s[12:15], s[0:1], s_load_dword_offset(0x20)
    s_load_dword            s16, s[0:1], s_load_dword_offset(0x30)
    s_waitcnt               lgkmcnt(0)

    // Compute workitem id.
    s_lshl_b32              s2, s2, 0x6
    v_add_u32               v0, vcc, s2, v0

    // ===================================================
    // Phase 1: Unrolled byte copy up to last whole block.
    // ===================================================

    // Compute phase source address.
    v_mov_b32               v3, s5
    v_add_u32               v2, vcc, v0, s4
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Compute phase destination address.
    v_mov_b32               v5, s7
    v_add_u32               v4, vcc, v0, s6
    v_addc_u32              v5, vcc, v5, 0x0, vcc

  L_COPY_MISALIGNED_PHASE_1_LOOP:
    // Branch out after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_1_DONE

    // Load from/advance the source address.
    for var i = 0; i < kCopyMisalignedUnroll; i ++
      flat_load_ubyte       v[6 + i], v[2:3]
      v_add_u32             v2, vcc, v2, s16
      v_addc_u32            v3, vcc, v3, 0x0, vcc
    end

    // Write to/advance the destination address.
    s_waitcnt               vmcnt(0)

    for var i = 0; i < kCopyMisalignedUnroll; i ++
      flat_store_byte       v[4:5], v[6 + i]
      v_add_u32             v4, vcc, v4, s16
      v_addc_u32            v5, vcc, v5, 0x0, vcc
    end

    // Repeat until branched out.
    s_branch                L_COPY_MISALIGNED_PHASE_1_LOOP

  L_COPY_MISALIGNED_PHASE_1_DONE:

    // =============================
    // Phase 2: Byte copy up to end.
    // =============================

    // Compute phase source address.
    v_mov_b32               v3, s9
    v_add_u32               v2, vcc, v0, s8
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Compute phase destination address.
    v_mov_b32               v5, s11
    v_add_u32               v4, vcc, v0, s10
    v_addc_u32              v5, vcc, v5, 0x0, vcc

  L_COPY_MISALIGNED_PHASE_2_LOOP:
    // Mask off lanes (or branch out) after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[12:13]
    s_cbranch_vccz          L_COPY_MISALIGNED_PHASE_2_DONE
    s_and_b64               exec, exec, vcc

    // Load from/advance the source address.
    flat_load_ubyte         v1, v[2:3]
    v_add_u32               v2, vcc, v2, s16
    v_addc_u32              v3, vcc, v3, 0x0, vcc
    s_waitcnt               vmcnt(0)

    // Write to/advance the destination address.
    flat_store_byte         v[4:5], v1
    v_add_u32               v4, vcc, v4, s16
    v_addc_u32              v5, vcc, v5, 0x0, vcc

    // Repeat until branched out.
    s_branch                L_COPY_MISALIGNED_PHASE_2_LOOP

  L_COPY_MISALIGNED_PHASE_2_DONE:
    s_endpgm
  end

  // Memory fill for dword-aligned region.
  //
  // Kernel argument buffer:
  //   [DW  0, 1]  Phase 1 dst start address
  //   [DW  2, 3]  Phase 2 dst start address
  //   [DW  4, 5]  Phase 2 dst end address
  //   [DW  6   ]  Value to fill memory with
  //   [DW  7   ]  Total number of workitems

  var kFillVecWidth = 4
  var kFillUnroll = 1

  shader Fill
    type(CS)
    user_sgpr_count(2)
    sgpr_count(19)
    vgpr_count(8)

    // Retrieve kernel arguments.
    s_load_dwordx4          s[4:7], s[0:1], s_load_dword_offset(0x0)
    s_load_dwordx4          s[8:11], s[0:1], s_load_dword_offset(0x10)
    s_waitcnt               lgkmcnt(0)

    // Compute workitem id.
    s_lshl_b32              s2, s2, 0x6
    v_add_u32               v0, vcc, s2, v0

    // Copy fill pattern into VGPRs.
    for var i = 0; i < kFillVecWidth; i ++
      v_mov_b32           v[4 + i], s10
    end

    // ========================================================
    // Phase 1: Unrolled dword[x4] fill up to last whole block.
    // ========================================================

    // Compute unrolled dword[x4] stride across all threads.
    if kFillVecWidth == 4
      s_lshl_b32            s12, s11, 0x4
    else
      s_lshl_b32            s12, s11, 0x2
    end

    // Compute phase destination address.
    if kFillVecWidth == 4
      v_lshlrev_b32         v1, 0x4, v0
    else
      v_lshlrev_b32         v1, 0x2, v0
    end

    v_mov_b32               v3, s5
    v_add_u32               v2, vcc, v1, s4
    v_addc_u32              v3, vcc, v3, 0x0, vcc

  L_FILL_PHASE_1_LOOP:
    // Branch out after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[6:7]
    s_cbranch_vccz          L_FILL_PHASE_1_DONE

    // Write to/advance the destination address.
    for var i = 0; i < kFillUnroll; i ++
      if kFillVecWidth == 4
        flat_store_dwordx4  v[2:3], v[4:7]
      else
        flat_store_dword    v[2:3], v4
      end

      v_add_u32             v2, vcc, v2, s12
      v_addc_u32            v3, vcc, v3, 0x0, vcc
    end

    // Repeat until branched out.
    s_branch                L_FILL_PHASE_1_LOOP

  L_FILL_PHASE_1_DONE:

    // ==============================
    // Phase 2: Dword fill up to end.
    // ==============================

    // Compute dword stride across all threads.
    s_lshl_b32              s12, s11, 0x2

    // Compute phase destination address.
    v_lshlrev_b32           v1, 0x2, v0
    v_mov_b32               v3, s7
    v_add_u32               v2, vcc, v1, s6
    v_addc_u32              v3, vcc, v3, 0x0, vcc

  L_FILL_PHASE_2_LOOP:
    // Mask off lanes (or branch out) after phase end.
    v_cmp_lt_u64            vcc, v[2:3], s[8:9]
    s_cbranch_vccz          L_FILL_PHASE_2_DONE
    s_and_b64               exec, exec, vcc

    // Write to/advance the destination address.
    flat_store_dword        v[2:3], v4
    v_add_u32               v2, vcc, v2, s12
    v_addc_u32              v3, vcc, v3, 0x0, vcc

    // Repeat until branched out.
    s_branch                L_FILL_PHASE_2_LOOP

  L_FILL_PHASE_2_DONE:
    s_endpgm
  end
)");

// Search kernel source for variable definition and return value.
int GetKernelSourceParam(const char* paramName) {
  std::stringstream paramDef;
  paramDef << "var " << paramName << " = ";

  std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str());
  assert(paramDefLoc != std::string::npos);
  std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size();
  std::string::size_type paramEndLoc =
      kBlitKernelSource.find('\n', paramDefLoc);
  assert(paramEndLoc != std::string::npos);

  std::string paramVal(&kBlitKernelSource[paramValLoc],
                       &kBlitKernelSource[paramEndLoc]);
  return std::stoi(paramVal);
}

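// Host-side copies of the tuning parameters declared in the shader source,
// keeping the phase-size calculations below in sync with the assembled kernels.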
static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth");
static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll");
static int kCopyMisalignedUnroll = GetKernelSourceParam("kCopyMisalignedUnroll");
static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth");
static int kFillUnroll = GetKernelSourceParam("kFillUnroll");

BlitKernel::BlitKernel(core::Queue* queue)
    : core::Blit(),
      queue_(queue),
      kernarg_async_(NULL),
      kernarg_async_mask_(0),
      kernarg_async_counter_(0),
      num_cus_(0) {
  completion_signal_.handle = 0;
}

BlitKernel::~BlitKernel() {}

hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
  queue_bitmask_ = queue_->public_handle()->size - 1;

  hsa_status_t status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_);
  if (HSA_STATUS_SUCCESS != status) {
    return status;
  }

  kernarg_async_ = reinterpret_cast<KernelArgs*>(
      core::Runtime::runtime_singleton_->system_allocator()(
          queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16,
          core::MemoryRegion::AllocateNoFlags));

  kernarg_async_mask_ = queue_->public_handle()->size - 1;

  // Obtain the number of compute units in the underlying agent.
  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
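  // NumFComputeCores counts SIMDs; GCN hardware has four SIMDs per CU.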
  num_cus_ = gpuAgent.properties().NumFComputeCores / 4;

  // Assemble shaders to AQL code objects.
  std::map<KernelType, const char*> kernel_names = {
      {KernelType::CopyAligned, "CopyAligned"},
      {KernelType::CopyMisaligned, "CopyMisaligned"},
      {KernelType::Fill, "Fill"}};

  for (auto kernel_name : kernel_names) {
    KernelCode& kernel = kernels_[kernel_name.first];
    gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second,
                            GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
                            kernel.code_buf_size_);
  }

  if (agent.profiling_enabled()) {
    return EnableProfiling(true);
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
  std::lock_guard<std::mutex> guard(lock_);

  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);

  for (auto kernel_pair : kernels_) {
    gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
                           kernel_pair.second.code_buf_size_);
  }

  if (kernarg_async_ != NULL) {
    core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_);
  }

  if (completion_signal_.handle != 0) {
    HSA::hsa_signal_destroy(completion_signal_);
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
                                                 size_t size) {
  // Protect completion_signal_.
  std::lock_guard<std::mutex> guard(lock_);

  HSA::hsa_signal_store_relaxed(completion_signal_, 1);

  std::vector<core::Signal*> dep_signals(0);

  hsa_status_t stat = SubmitLinearCopyCommand(
      dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_));

  if (stat != HSA_STATUS_SUCCESS) {
    return stat;
  }

  // Wait for the packet to finish.
  if (HSA::hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
                                     HSA_WAIT_STATE_ACTIVE) != 0) {
    // Signal wait returned unexpected value.
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::SubmitLinearCopyCommand(
    void* dst, const void* src, size_t size,
    std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
  // Reserve write index for barrier(s) + dispatch packet.
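  // A barrier-AND packet carries at most five dependency signals, so
  // (n + 4) / 5 rounds the barrier packet count up.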
  const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5);
  const uint32_t total_num_packet = num_barrier_packet + 1;

  uint64_t write_index = AcquireWriteIndex(total_num_packet);
  uint64_t write_index_temp = write_index;

  // Insert barrier packets to handle dependent signals.
  // Barrier bit keeps signal checking traffic from competing with a copy.
  const uint16_t kBarrierPacketHeader = (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

  hsa_barrier_and_packet_t barrier_packet = {0};
  barrier_packet.header = HSA_PACKET_TYPE_INVALID;

  hsa_barrier_and_packet_t* queue_buffer =
      reinterpret_cast<hsa_barrier_and_packet_t*>(
          queue_->public_handle()->base_address);

  const size_t dep_signal_count = dep_signals.size();
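  // Pack the dependency signals five per barrier packet, publishing each
  // packet by writing its valid header last.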
  for (size_t i = 0; i < dep_signal_count; ++i) {
    const size_t idx = i % 5;
    barrier_packet.dep_signal[idx] = core::Signal::Convert(dep_signals[i]);
    if (i == (dep_signal_count - 1) || idx == 4) {
      std::atomic_thread_fence(std::memory_order_acquire);
      queue_buffer[(write_index)&queue_bitmask_] = barrier_packet;
      std::atomic_thread_fence(std::memory_order_release);
      queue_buffer[(write_index)&queue_bitmask_].header = kBarrierPacketHeader;

      ++write_index;

      memset(&barrier_packet, 0, sizeof(hsa_barrier_and_packet_t));
      barrier_packet.header = HSA_PACKET_TYPE_INVALID;
    }
  }

  // Insert dispatch packet for copy kernel.
  KernelArgs* args = ObtainAsyncKernelCopyArg();
  KernelCode* kernel_code = nullptr;
  int num_workitems = 0;

  bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3));

  if (aligned) {
    // Use dword-based aligned kernel.
    kernel_code = &kernels_[KernelType::CopyAligned];

    // Compute the size of each copy phase.
    num_workitems = 64 * 4 * num_cus_;
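    // Launch four 64-lane wavefronts per CU.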

    // Phase 1 (byte copy) ends when destination is 0x100-aligned.
    uintptr_t src_start = uintptr_t(src);
    uintptr_t dst_start = uintptr_t(dst);
    uint64_t phase1_size =
        std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF);
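    // (0x100 - (dst & 0xFF)) & 0xFF evaluates to zero when the destination is
    // already 256-byte aligned, so phase 1 then copies nothing.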

    // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits.
    uint64_t phase2_block = num_workitems * sizeof(uint32_t) *
                            kCopyAlignedUnroll * kCopyAlignedVecWidth;
    uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block;

    // Phase 3 (dword copy) ends when last whole dword fits.
    uint64_t phase3_size =
        ((size - phase1_size - phase2_size) / sizeof(uint32_t)) *
        sizeof(uint32_t);

    args->copy_aligned.phase1_src_start = src_start;
    args->copy_aligned.phase1_dst_start = dst_start;
    args->copy_aligned.phase2_src_start = src_start + phase1_size;
    args->copy_aligned.phase2_dst_start = dst_start + phase1_size;
    args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size;
    args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size;
    args->copy_aligned.phase4_src_start =
        src_start + phase1_size + phase2_size + phase3_size;
    args->copy_aligned.phase4_dst_start =
        dst_start + phase1_size + phase2_size + phase3_size;
    args->copy_aligned.phase4_src_end = src_start + size;
    args->copy_aligned.phase4_dst_end = dst_start + size;
    args->copy_aligned.num_workitems = num_workitems;
  } else {
    // Use byte-based misaligned kernel.
    kernel_code = &kernels_[KernelType::CopyMisaligned];

    // Compute the size of each copy phase.
    num_workitems = 64 * 4 * num_cus_;

    // Phase 1 (unrolled byte copy) ends when last whole block fits.
    uintptr_t src_start = uintptr_t(src);
    uintptr_t dst_start = uintptr_t(dst);
    uint64_t phase1_block =
        num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll;
    uint64_t phase1_size = (size / phase1_block) * phase1_block;

    args->copy_misaligned.phase1_src_start = src_start;
    args->copy_misaligned.phase1_dst_start = dst_start;
    args->copy_misaligned.phase2_src_start = src_start + phase1_size;
    args->copy_misaligned.phase2_dst_start = dst_start + phase1_size;
    args->copy_misaligned.phase2_src_end = src_start + size;
    args->copy_misaligned.phase2_dst_end = dst_start + size;
    args->copy_misaligned.num_workitems = num_workitems;
  }

  hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle};
  PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args,
                num_workitems, signal);

  // Submit barrier(s) and dispatch packets.
  ReleaseWriteIndex(write_index_temp, total_num_packet);

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value,
                                                 size_t count) {
  std::lock_guard<std::mutex> guard(lock_);

  // Reject misaligned base address.
  if ((uintptr_t(ptr) & 0x3) != 0) {
    return HSA_STATUS_ERROR;
  }

  // Compute the size of each fill phase.
  int num_workitems = 64 * num_cus_;

  // Phase 1 (unrolled dwordx4 fill) ends when last whole block fits.
  uintptr_t dst_start = uintptr_t(ptr);
  uint64_t fill_size = count * sizeof(uint32_t);

  uint64_t phase1_block =
      num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth;
  uint64_t phase1_size = (fill_size / phase1_block) * phase1_block;

  KernelArgs* args = ObtainAsyncKernelCopyArg();
  args->fill.phase1_dst_start = dst_start;
  args->fill.phase2_dst_start = dst_start + phase1_size;
  args->fill.phase2_dst_end = dst_start + fill_size;
  args->fill.fill_value = value;
  args->fill.num_workitems = num_workitems;

  // Submit dispatch packet.
  HSA::hsa_signal_store_relaxed(completion_signal_, 1);

  uint64_t write_index = AcquireWriteIndex(1);
  PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_),
                args, num_workitems, completion_signal_);
  ReleaseWriteIndex(write_index, 1);

  // Wait for the packet to finish.
  if (HSA::hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
                                     HSA_WAIT_STATE_ACTIVE) != 0) {
    // Signal wait returned unexpected value.
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::EnableProfiling(bool enable) {
  queue_->SetProfiling(enable);
  return HSA_STATUS_SUCCESS;
}

uint64_t BlitKernel::AcquireWriteIndex(uint32_t num_packet) {
  assert(queue_->public_handle()->size >= num_packet);

  uint64_t write_index = queue_->AddWriteIndexAcqRel(num_packet);

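  // Spin until the packet processor has freed enough ring space for this
  // reservation.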
  while (write_index + num_packet - queue_->LoadReadIndexRelaxed() > queue_->public_handle()->size) {
    os::YieldThread();
  }

  return write_index;
}

void BlitKernel::ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet) {
  // Update doorbell register with last packet id.
  core::Signal* doorbell =
      core::Signal::Convert(queue_->public_handle()->doorbell_signal);
  doorbell->StoreRelease(write_index + num_packet - 1);
}

void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
                               uint32_t grid_size_x,
                               hsa_signal_t completion_signal) {
  assert(IsMultipleOf(args, 16));

  hsa_kernel_dispatch_packet_t packet = {0};

  // Not static: the barrier bit depends on the completion signal passed in.
  const uint16_t kDispatchPacketHeader =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

  packet.header = kInvalidPacketHeader;
  packet.kernel_object = code_handle;
  packet.kernarg_address = args;

  // Setup working size.
  const int kNumDimension = 1;
  packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), 64);
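  // The grid is rounded up to whole 64-lane workgroups; the kernels bound
  // their work by the end addresses in the kernarg block, so extra lanes are
  // harmless.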
  packet.grid_size_y = packet.grid_size_z = 1;
  packet.workgroup_size_x = 64;
  packet.workgroup_size_y = packet.workgroup_size_z = 1;

  packet.completion_signal = completion_signal;

  // Populate queue buffer with AQL packet.
  hsa_kernel_dispatch_packet_t* queue_buffer =
      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
          queue_->public_handle()->base_address);
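  // Copy the packet body with an invalid header first, then publish it by
  // storing the real header after the release fence so the packet processor
  // never observes a partially written packet.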
  std::atomic_thread_fence(std::memory_order_acquire);
  queue_buffer[index & queue_bitmask_] = packet;
  std::atomic_thread_fence(std::memory_order_release);
  queue_buffer[index & queue_bitmask_].header = kDispatchPacketHeader;
}

BlitKernel::KernelArgs* BlitKernel::ObtainAsyncKernelCopyArg() {
  const uint32_t index =
      atomic::Add(&kernarg_async_counter_, 1U, std::memory_order_acquire) & kernarg_async_mask_;
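  // Kernarg slots are handed out round-robin from a pool sized to the queue
  // depth, so a slot should not be recycled while its packet is still queued.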

  KernelArgs* arg = &kernarg_async_[index];
  assert(IsMultipleOf(arg, 16));
  return arg;
}

}  // namespace amd