////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
// AMD Research and AMD HSA Software Development
//
// Advanced Micro Devices, Inc.
//
// www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// - Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimers.
// - Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimers in
//   the documentation and/or other materials provided with the distribution.
// - Neither the names of Advanced Micro Devices, Inc,
//   nor the names of its contributors may be used to endorse or promote
//   products derived from this Software without specific prior written
//   permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#include "core/inc/amd_blit_kernel.h"

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstring>
#include <map>
#include <mutex>
#include <sstream>
#include <string>
#include <vector>

#include "core/inc/amd_gpu_agent.h"
#include "core/inc/hsa_internal.h"
#include "core/util/utils.h"

namespace amd {
static const uint16_t kInvalidPacketHeader = HSA_PACKET_TYPE_INVALID;

static std::string kBlitKernelSource(R"(
// Compatibility function for GFXIP 7.

function s_load_dword_offset(byte_offset)
  if kGFXIPVersion == 7
    return byte_offset / 4
  else
    return byte_offset
  end
end

// Memory copy for all cases except:
// (src_addr & 0x3) != (dst_addr & 0x3)
//
// Kernel argument buffer:
// [DW 0, 1] Phase 1 src start address
// [DW 2, 3] Phase 1 dst start address
// [DW 4, 5] Phase 2 src start address
// [DW 6, 7] Phase 2 dst start address
// [DW 8, 9] Phase 3 src start address
// [DW 10,11] Phase 3 dst start address
// [DW 12,13] Phase 4 src start address
// [DW 14,15] Phase 4 dst start address
// [DW 16,17] Phase 4 src end address
// [DW 18,19] Phase 4 dst end address
// [DW 20 ] Total number of workitems
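//
// Note: the host (BlitKernel::SubmitLinearCopyCommand) computes the phase
// boundaries so that phase 1 byte-copies until the destination reaches 0x100
// alignment, phase 2 covers whole unrolled dword[x4] blocks, phase 3 covers
// the remaining whole dwords, and phase 4 copies the tail bytes.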

var kCopyAlignedVecWidth = 4
var kCopyAlignedUnroll = 1

shader CopyAligned
  type(CS)
  user_sgpr_count(2)
  sgpr_count(32)
  vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth))

  // Retrieve kernel arguments.
  s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
  s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
  s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20)
  s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30)
  s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40)
  s_load_dword s24, s[0:1], s_load_dword_offset(0x50)
  s_waitcnt lgkmcnt(0)

  // Compute workitem id.
  s_lshl_b32 s2, s2, 0x6
  v_add_u32 v0, vcc, s2, v0

  // =====================================================
  // Phase 1: Byte copy up to 0x100 destination alignment.
  // =====================================================

  // Compute phase source address.
  v_mov_b32 v3, s5
  v_add_u32 v2, vcc, v0, s4
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Compute phase destination address.
  v_mov_b32 v5, s7
  v_add_u32 v4, vcc, v0, s6
  v_addc_u32 v5, vcc, v5, 0x0, vcc

L_COPY_ALIGNED_PHASE_1_LOOP:
  // Mask off lanes (or branch out) after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[8:9]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE
  s_and_b64 exec, exec, vcc

  // Load from/advance the source address.
  flat_load_ubyte v1, v[2:3]
  s_waitcnt vmcnt(0)
  v_add_u32 v2, vcc, v2, s24
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Write to/advance the destination address.
  flat_store_byte v[4:5], v1
  v_add_u32 v4, vcc, v4, s24
  v_addc_u32 v5, vcc, v5, 0x0, vcc

  // Repeat until branched out.
  s_branch L_COPY_ALIGNED_PHASE_1_LOOP

L_COPY_ALIGNED_PHASE_1_DONE:
  // Restore EXEC mask for all lanes.
  s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF

  // ========================================================
  // Phase 2: Unrolled dword[x4] copy up to last whole block.
  // ========================================================

  // Compute unrolled dword[x4] stride across all threads.
  if kCopyAlignedVecWidth == 4
    s_lshl_b32 s25, s24, 0x4
  else
    s_lshl_b32 s25, s24, 0x2
  end

  // Compute phase source address.
  if kCopyAlignedVecWidth == 4
    v_lshlrev_b32 v1, 0x4, v0
  else
    v_lshlrev_b32 v1, 0x2, v0
  end

  v_mov_b32 v3, s9
  v_add_u32 v2, vcc, v1, s8
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Compute phase destination address.
  v_mov_b32 v5, s11
  v_add_u32 v4, vcc, v1, s10
  v_addc_u32 v5, vcc, v5, 0x0, vcc

L_COPY_ALIGNED_PHASE_2_LOOP:
  // Branch out after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[12:13]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE

  // Load from/advance the source address.
  for var i = 0; i < kCopyAlignedUnroll; i ++
    if kCopyAlignedVecWidth == 4
      flat_load_dwordx4 v[8 + (i * 4)], v[2:3]
    else
      flat_load_dword v[8 + i], v[2:3]
    end

    v_add_u32 v2, vcc, v2, s25
    v_addc_u32 v3, vcc, v3, 0x0, vcc
  end

  // Write to/advance the destination address.
  s_waitcnt vmcnt(0)

  for var i = 0; i < kCopyAlignedUnroll; i ++
    if kCopyAlignedVecWidth == 4
      flat_store_dwordx4 v[4:5], v[8 + (i * 4)]
    else
      flat_store_dword v[4:5], v[8 + i]
    end

    v_add_u32 v4, vcc, v4, s25
    v_addc_u32 v5, vcc, v5, 0x0, vcc
  end

  // Repeat until branched out.
  s_branch L_COPY_ALIGNED_PHASE_2_LOOP

L_COPY_ALIGNED_PHASE_2_DONE:

  // ===========================================
  // Phase 3: Dword copy up to last whole dword.
  // ===========================================

  // Compute dword stride across all threads.
  s_lshl_b32 s25, s24, 0x2

  // Compute phase source address.
  v_lshlrev_b32 v1, 0x2, v0
  v_mov_b32 v3, s13
  v_add_u32 v2, vcc, v1, s12
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Compute phase destination address.
  v_mov_b32 v5, s15
  v_add_u32 v4, vcc, v1, s14
  v_addc_u32 v5, vcc, v5, 0x0, vcc

L_COPY_ALIGNED_PHASE_3_LOOP:
  // Mask off lanes (or branch out) after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[16:17]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE
  s_and_b64 exec, exec, vcc

  // Load from/advance the source address.
  flat_load_dword v1, v[2:3]
  v_add_u32 v2, vcc, v2, s25
  v_addc_u32 v3, vcc, v3, 0x0, vcc
  s_waitcnt vmcnt(0)

  // Write to/advance the destination address.
  flat_store_dword v[4:5], v1
  v_add_u32 v4, vcc, v4, s25
  v_addc_u32 v5, vcc, v5, 0x0, vcc

  // Repeat until branched out.
  s_branch L_COPY_ALIGNED_PHASE_3_LOOP

L_COPY_ALIGNED_PHASE_3_DONE:
  // Restore EXEC mask for all lanes.
  s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF

  // =============================
  // Phase 4: Byte copy up to end.
  // =============================

  // Compute phase source address.
  v_mov_b32 v3, s17
  v_add_u32 v2, vcc, v0, s16
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Compute phase destination address.
  v_mov_b32 v5, s19
  v_add_u32 v4, vcc, v0, s18
  v_addc_u32 v5, vcc, v5, 0x0, vcc

  // Mask off lanes (or branch out) after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[20:21]
  s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE
  s_and_b64 exec, exec, vcc

  // Load from the source address.
  flat_load_ubyte v1, v[2:3]
  s_waitcnt vmcnt(0)

  // Write to the destination address.
  flat_store_byte v[4:5], v1

L_COPY_ALIGNED_PHASE_4_DONE:
  s_endpgm
end

// Memory copy for this case:
// (src_addr & 0x3) != (dst_addr & 0x3)
//
// Kernel argument buffer:
// [DW 0, 1] Phase 1 src start address
// [DW 2, 3] Phase 1 dst start address
// [DW 4, 5] Phase 2 src start address
// [DW 6, 7] Phase 2 dst start address
// [DW 8, 9] Phase 2 src end address
// [DW 10,11] Phase 2 dst end address
// [DW 12 ] Total number of workitems
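//
// Note: the host sizes phase 1 to cover whole unrolled byte blocks; phase 2
// copies the remaining tail bytes one at a time.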

var kCopyMisalignedUnroll = 4

shader CopyMisaligned
  type(CS)
  user_sgpr_count(2)
  sgpr_count(23)
  vgpr_count(6 + kCopyMisalignedUnroll)

  // Retrieve kernel arguments.
  s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
  s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
  s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20)
  s_load_dword s16, s[0:1], s_load_dword_offset(0x30)
  s_waitcnt lgkmcnt(0)

  // Compute workitem id.
  s_lshl_b32 s2, s2, 0x6
  v_add_u32 v0, vcc, s2, v0

  // ===================================================
  // Phase 1: Unrolled byte copy up to last whole block.
  // ===================================================

  // Compute phase source address.
  v_mov_b32 v3, s5
  v_add_u32 v2, vcc, v0, s4
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Compute phase destination address.
  v_mov_b32 v5, s7
  v_add_u32 v4, vcc, v0, s6
  v_addc_u32 v5, vcc, v5, 0x0, vcc

L_COPY_MISALIGNED_PHASE_1_LOOP:
  // Branch out after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[8:9]
  s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE

  // Load from/advance the source address.
  for var i = 0; i < kCopyMisalignedUnroll; i ++
    flat_load_ubyte v[6 + i], v[2:3]
    v_add_u32 v2, vcc, v2, s16
    v_addc_u32 v3, vcc, v3, 0x0, vcc
  end

  // Write to/advance the destination address.
  s_waitcnt vmcnt(0)

  for var i = 0; i < kCopyMisalignedUnroll; i ++
    flat_store_byte v[4:5], v[6 + i]
    v_add_u32 v4, vcc, v4, s16
    v_addc_u32 v5, vcc, v5, 0x0, vcc
  end

  // Repeat until branched out.
  s_branch L_COPY_MISALIGNED_PHASE_1_LOOP

L_COPY_MISALIGNED_PHASE_1_DONE:

  // =============================
  // Phase 2: Byte copy up to end.
  // =============================

  // Compute phase source address.
  v_mov_b32 v3, s9
  v_add_u32 v2, vcc, v0, s8
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Compute phase destination address.
  v_mov_b32 v5, s11
  v_add_u32 v4, vcc, v0, s10
  v_addc_u32 v5, vcc, v5, 0x0, vcc

L_COPY_MISALIGNED_PHASE_2_LOOP:
  // Mask off lanes (or branch out) after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[12:13]
  s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE
  s_and_b64 exec, exec, vcc

  // Load from/advance the source address.
  flat_load_ubyte v1, v[2:3]
  v_add_u32 v2, vcc, v2, s16
  v_addc_u32 v3, vcc, v3, 0x0, vcc
  s_waitcnt vmcnt(0)

  // Write to/advance the destination address.
  flat_store_byte v[4:5], v1
  v_add_u32 v4, vcc, v4, s16
  v_addc_u32 v5, vcc, v5, 0x0, vcc

  // Repeat until branched out.
  s_branch L_COPY_MISALIGNED_PHASE_2_LOOP

L_COPY_MISALIGNED_PHASE_2_DONE:
  s_endpgm
end

// Memory fill for dword-aligned region.
//
// Kernel argument buffer:
// [DW 0, 1] Phase 1 dst start address
// [DW 2, 3] Phase 2 dst start address
// [DW 4, 5] Phase 2 dst end address
// [DW 6 ] Value to fill memory with
// [DW 7 ] Total number of workitems
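//
// Note: the base address must be dword-aligned (enforced by
// BlitKernel::SubmitLinearFillCommand); phase 1 covers whole unrolled
// dword[x4] blocks and phase 2 fills the remaining dwords.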

var kFillVecWidth = 4
var kFillUnroll = 1

shader Fill
  type(CS)
  user_sgpr_count(2)
  sgpr_count(19)
  vgpr_count(8)

  // Retrieve kernel arguments.
  s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0)
  s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10)
  s_waitcnt lgkmcnt(0)

  // Compute workitem id.
  s_lshl_b32 s2, s2, 0x6
  v_add_u32 v0, vcc, s2, v0

  // Copy fill pattern into VGPRs.
  for var i = 0; i < kFillVecWidth; i ++
    v_mov_b32 v[4 + i], s10
  end

  // ========================================================
  // Phase 1: Unrolled dword[x4] fill up to last whole block.
  // ========================================================

  // Compute unrolled dword[x4] stride across all threads.
  if kFillVecWidth == 4
    s_lshl_b32 s12, s11, 0x4
  else
    s_lshl_b32 s12, s11, 0x2
  end

  // Compute phase destination address.
  if kFillVecWidth == 4
    v_lshlrev_b32 v1, 0x4, v0
  else
    v_lshlrev_b32 v1, 0x2, v0
  end

  v_mov_b32 v3, s5
  v_add_u32 v2, vcc, v1, s4
  v_addc_u32 v3, vcc, v3, 0x0, vcc

L_FILL_PHASE_1_LOOP:
  // Branch out after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[6:7]
  s_cbranch_vccz L_FILL_PHASE_1_DONE

  // Write to/advance the destination address.
  for var i = 0; i < kFillUnroll; i ++
    if kFillVecWidth == 4
      flat_store_dwordx4 v[2:3], v[4:7]
    else
      flat_store_dword v[2:3], v4
    end

    v_add_u32 v2, vcc, v2, s12
    v_addc_u32 v3, vcc, v3, 0x0, vcc
  end

  // Repeat until branched out.
  s_branch L_FILL_PHASE_1_LOOP

L_FILL_PHASE_1_DONE:

  // ==============================
  // Phase 2: Dword fill up to end.
  // ==============================

  // Compute dword stride across all threads.
  s_lshl_b32 s12, s11, 0x2

  // Compute phase destination address.
  v_lshlrev_b32 v1, 0x2, v0
  v_mov_b32 v3, s7
  v_add_u32 v2, vcc, v1, s6
  v_addc_u32 v3, vcc, v3, 0x0, vcc

L_FILL_PHASE_2_LOOP:
  // Mask off lanes (or branch out) after phase end.
  v_cmp_lt_u64 vcc, v[2:3], s[8:9]
  s_cbranch_vccz L_FILL_PHASE_2_DONE
  s_and_b64 exec, exec, vcc

  // Write to/advance the destination address.
  flat_store_dword v[2:3], v4
  v_add_u32 v2, vcc, v2, s12
  v_addc_u32 v3, vcc, v3, 0x0, vcc

  // Repeat until branched out.
  s_branch L_FILL_PHASE_2_LOOP

L_FILL_PHASE_2_DONE:
  s_endpgm
end
)");

// Search the kernel source for a variable definition and return its value.
int GetKernelSourceParam(const char* paramName) {
  std::stringstream paramDef;
  paramDef << "var " << paramName << " = ";

  std::string::size_type paramDefLoc = kBlitKernelSource.find(paramDef.str());
  assert(paramDefLoc != std::string::npos);
  std::string::size_type paramValLoc = paramDefLoc + paramDef.str().size();
  std::string::size_type paramEndLoc =
      kBlitKernelSource.find('\n', paramDefLoc);
  assert(paramEndLoc != std::string::npos);

  std::string paramVal(&kBlitKernelSource[paramValLoc],
                       &kBlitKernelSource[paramEndLoc]);
  return std::stoi(paramVal);
}

static int kCopyAlignedVecWidth = GetKernelSourceParam("kCopyAlignedVecWidth");
static int kCopyAlignedUnroll = GetKernelSourceParam("kCopyAlignedUnroll");
static int kCopyMisalignedUnroll =
    GetKernelSourceParam("kCopyMisalignedUnroll");
static int kFillVecWidth = GetKernelSourceParam("kFillVecWidth");
static int kFillUnroll = GetKernelSourceParam("kFillUnroll");

BlitKernel::BlitKernel(core::Queue* queue)
    : core::Blit(),
      queue_(queue),
      kernarg_async_(NULL),
      kernarg_async_mask_(0),
      kernarg_async_counter_(0),
      num_cus_(0) {
  completion_signal_.handle = 0;
}

BlitKernel::~BlitKernel() {}

hsa_status_t BlitKernel::Initialize(const core::Agent& agent) {
  queue_bitmask_ = queue_->public_handle()->size - 1;

  hsa_status_t status = HSA::hsa_signal_create(1, 0, NULL, &completion_signal_);
  if (HSA_STATUS_SUCCESS != status) {
    return status;
  }

  kernarg_async_ = reinterpret_cast<KernelArgs*>(
      core::Runtime::runtime_singleton_->system_allocator()(
          queue_->public_handle()->size * AlignUp(sizeof(KernelArgs), 16), 16,
          core::MemoryRegion::AllocateNoFlags));

  kernarg_async_mask_ = queue_->public_handle()->size - 1;

  // Obtain the number of compute units in the underlying agent.
  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);
  num_cus_ = gpuAgent.properties().NumFComputeCores / 4;
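  // NumFComputeCores counts SIMDs; assuming four SIMDs per CU on these parts,
  // dividing by 4 yields the CU count.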

  // Assemble shaders to AQL code objects.
  std::map<KernelType, const char*> kernel_names = {
      {KernelType::CopyAligned, "CopyAligned"},
      {KernelType::CopyMisaligned, "CopyMisaligned"},
      {KernelType::Fill, "Fill"}};

  for (auto kernel_name : kernel_names) {
    KernelCode& kernel = kernels_[kernel_name.first];
    gpuAgent.AssembleShader(kBlitKernelSource.c_str(), kernel_name.second,
                            GpuAgent::AssembleTarget::AQL, kernel.code_buf_,
                            kernel.code_buf_size_);
  }

  if (agent.profiling_enabled()) {
    return EnableProfiling(true);
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::Destroy(const core::Agent& agent) {
  std::lock_guard<std::mutex> guard(lock_);

  const GpuAgent& gpuAgent = static_cast<const GpuAgent&>(agent);

  for (auto kernel_pair : kernels_) {
    gpuAgent.ReleaseShader(kernel_pair.second.code_buf_,
                           kernel_pair.second.code_buf_size_);
  }

  if (kernarg_async_ != NULL) {
    core::Runtime::runtime_singleton_->system_deallocator()(kernarg_async_);
  }

  if (completion_signal_.handle != 0) {
    HSA::hsa_signal_destroy(completion_signal_);
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::SubmitLinearCopyCommand(void* dst, const void* src,
                                                 size_t size) {
  // Protect completion_signal_.
  std::lock_guard<std::mutex> guard(lock_);

  HSA::hsa_signal_store_relaxed(completion_signal_, 1);

  std::vector<core::Signal*> dep_signals(0);

  hsa_status_t stat = SubmitLinearCopyCommand(
      dst, src, size, dep_signals, *core::Signal::Convert(completion_signal_));

  if (stat != HSA_STATUS_SUCCESS) {
    return stat;
  }

  // Wait for the packet to finish.
  if (HSA::hsa_signal_wait_scacquire(completion_signal_,
                                     HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
                                     HSA_WAIT_STATE_ACTIVE) != 0) {
    // Signal wait returned an unexpected value.
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::SubmitLinearCopyCommand(
    void* dst, const void* src, size_t size,
    std::vector<core::Signal*>& dep_signals, core::Signal& out_signal) {
  // Reserve write index for barrier(s) + dispatch packet.
  const uint32_t num_barrier_packet = uint32_t((dep_signals.size() + 4) / 5);
  const uint32_t total_num_packet = num_barrier_packet + 1;
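  // (dep_signals.size() + 4) / 5 is a ceiling divide: each barrier-AND packet
  // carries at most five dependency signals. The +1 reserves a slot for the
  // copy dispatch packet itself.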

  uint64_t write_index = AcquireWriteIndex(total_num_packet);
  uint64_t write_index_temp = write_index;

  // Insert barrier packets to handle dependent signals.
  // Barrier bit keeps signal checking traffic from competing with a copy.
  const uint16_t kBarrierPacketHeader =
      (HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

  hsa_barrier_and_packet_t barrier_packet = {0};
  barrier_packet.header = HSA_PACKET_TYPE_INVALID;

  hsa_barrier_and_packet_t* queue_buffer =
      reinterpret_cast<hsa_barrier_and_packet_t*>(
          queue_->public_handle()->base_address);

  const size_t dep_signal_count = dep_signals.size();
  for (size_t i = 0; i < dep_signal_count; ++i) {
    const size_t idx = i % 5;
    barrier_packet.dep_signal[idx] = core::Signal::Convert(dep_signals[i]);
    if (i == (dep_signal_count - 1) || idx == 4) {
      std::atomic_thread_fence(std::memory_order_acquire);
      queue_buffer[(write_index)&queue_bitmask_] = barrier_packet;
      std::atomic_thread_fence(std::memory_order_release);
      queue_buffer[(write_index)&queue_bitmask_].header = kBarrierPacketHeader;

      ++write_index;

      memset(&barrier_packet, 0, sizeof(hsa_barrier_and_packet_t));
      barrier_packet.header = HSA_PACKET_TYPE_INVALID;
    }
  }

  // Insert dispatch packet for copy kernel.
  KernelArgs* args = ObtainAsyncKernelCopyArg();
  KernelCode* kernel_code = nullptr;
  int num_workitems = 0;

  bool aligned = ((uintptr_t(src) & 0x3) == (uintptr_t(dst) & 0x3));

  if (aligned) {
    // Use dword-based aligned kernel.
    kernel_code = &kernels_[KernelType::CopyAligned];

    // Compute the size of each copy phase.
    num_workitems = 64 * 4 * num_cus_;
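    // That is, four 64-lane wavefronts' worth of workitems per compute unit.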

    // Phase 1 (byte copy) ends when destination is 0x100-aligned.
    uintptr_t src_start = uintptr_t(src);
    uintptr_t dst_start = uintptr_t(dst);
    uint64_t phase1_size =
        std::min(size, uint64_t(0x100 - (dst_start & 0xFF)) & 0xFF);
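    // Note: 0x100 - (dst_start & 0xFF) evaluates to 0x100 when the destination
    // is already 0x100-aligned; masking with 0xFF folds that case to 0 so
    // phase 1 is skipped entirely.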

    // Phase 2 (unrolled dwordx4 copy) ends when last whole block fits.
    uint64_t phase2_block = num_workitems * sizeof(uint32_t) *
                            kCopyAlignedUnroll * kCopyAlignedVecWidth;
    uint64_t phase2_size = ((size - phase1_size) / phase2_block) * phase2_block;

    // Phase 3 (dword copy) ends when last whole dword fits.
    uint64_t phase3_size =
        ((size - phase1_size - phase2_size) / sizeof(uint32_t)) *
        sizeof(uint32_t);

    args->copy_aligned.phase1_src_start = src_start;
    args->copy_aligned.phase1_dst_start = dst_start;
    args->copy_aligned.phase2_src_start = src_start + phase1_size;
    args->copy_aligned.phase2_dst_start = dst_start + phase1_size;
    args->copy_aligned.phase3_src_start = src_start + phase1_size + phase2_size;
    args->copy_aligned.phase3_dst_start = dst_start + phase1_size + phase2_size;
    args->copy_aligned.phase4_src_start =
        src_start + phase1_size + phase2_size + phase3_size;
    args->copy_aligned.phase4_dst_start =
        dst_start + phase1_size + phase2_size + phase3_size;
    args->copy_aligned.phase4_src_end = src_start + size;
    args->copy_aligned.phase4_dst_end = dst_start + size;
    args->copy_aligned.num_workitems = num_workitems;
  } else {
    // Use byte-based misaligned kernel.
    kernel_code = &kernels_[KernelType::CopyMisaligned];

    // Compute the size of each copy phase.
    num_workitems = 64 * 4 * num_cus_;

    // Phase 1 (unrolled byte copy) ends when last whole block fits.
    uintptr_t src_start = uintptr_t(src);
    uintptr_t dst_start = uintptr_t(dst);
    uint64_t phase1_block =
        num_workitems * sizeof(uint8_t) * kCopyMisalignedUnroll;
    uint64_t phase1_size = (size / phase1_block) * phase1_block;

    args->copy_misaligned.phase1_src_start = src_start;
    args->copy_misaligned.phase1_dst_start = dst_start;
    args->copy_misaligned.phase2_src_start = src_start + phase1_size;
    args->copy_misaligned.phase2_dst_start = dst_start + phase1_size;
    args->copy_misaligned.phase2_src_end = src_start + size;
    args->copy_misaligned.phase2_dst_end = dst_start + size;
    args->copy_misaligned.num_workitems = num_workitems;
  }

  hsa_signal_t signal = {(core::Signal::Convert(&out_signal)).handle};
  PopulateQueue(write_index, uintptr_t(kernel_code->code_buf_), args,
                num_workitems, signal);

  // Submit barrier(s) and dispatch packets.
  ReleaseWriteIndex(write_index_temp, total_num_packet);

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::SubmitLinearFillCommand(void* ptr, uint32_t value,
                                                 size_t count) {
  std::lock_guard<std::mutex> guard(lock_);

  // Reject misaligned base address.
  if ((uintptr_t(ptr) & 0x3) != 0) {
    return HSA_STATUS_ERROR;
  }

  // Compute the size of each fill phase.
  int num_workitems = 64 * num_cus_;

  // Phase 1 (unrolled dwordx4 fill) ends when last whole block fits.
  uintptr_t dst_start = uintptr_t(ptr);
  uint64_t fill_size = count * sizeof(uint32_t);

  uint64_t phase1_block =
      num_workitems * sizeof(uint32_t) * kFillUnroll * kFillVecWidth;
  uint64_t phase1_size = (fill_size / phase1_block) * phase1_block;

  KernelArgs* args = ObtainAsyncKernelCopyArg();
  args->fill.phase1_dst_start = dst_start;
  args->fill.phase2_dst_start = dst_start + phase1_size;
  args->fill.phase2_dst_end = dst_start + fill_size;
  args->fill.fill_value = value;
  args->fill.num_workitems = num_workitems;

  // Submit dispatch packet.
  HSA::hsa_signal_store_relaxed(completion_signal_, 1);

  uint64_t write_index = AcquireWriteIndex(1);
  PopulateQueue(write_index, uintptr_t(kernels_[KernelType::Fill].code_buf_),
                args, num_workitems, completion_signal_);
  ReleaseWriteIndex(write_index, 1);

  // Wait for the packet to finish.
  if (HSA::hsa_signal_wait_scacquire(completion_signal_,
                                     HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
                                     HSA_WAIT_STATE_ACTIVE) != 0) {
    // Signal wait returned an unexpected value.
    return HSA_STATUS_ERROR;
  }

  return HSA_STATUS_SUCCESS;
}

hsa_status_t BlitKernel::EnableProfiling(bool enable) {
  queue_->SetProfiling(enable);
  return HSA_STATUS_SUCCESS;
}

uint64_t BlitKernel::AcquireWriteIndex(uint32_t num_packet) {
  assert(queue_->public_handle()->size >= num_packet);

  uint64_t write_index = queue_->AddWriteIndexAcqRel(num_packet);

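  // Spin until the packet processor has consumed enough packets that the
  // newly reserved slots fit inside the ring buffer.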
  while (write_index + num_packet - queue_->LoadReadIndexRelaxed() >
         queue_->public_handle()->size) {
    os::YieldThread();
  }

  return write_index;
}

void BlitKernel::ReleaseWriteIndex(uint64_t write_index, uint32_t num_packet) {
  // Update the doorbell register with the last packet id.
  core::Signal* doorbell =
      core::Signal::Convert(queue_->public_handle()->doorbell_signal);
  doorbell->StoreRelease(write_index + num_packet - 1);
}

void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args,
                               uint32_t grid_size_x,
                               hsa_signal_t completion_signal) {
  assert(IsMultipleOf(args, 16));

  hsa_kernel_dispatch_packet_t packet = {0};

  static const uint16_t kDispatchPacketHeader =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (((completion_signal.handle != 0) ? 1 : 0) << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);

  packet.header = kInvalidPacketHeader;
  packet.kernel_object = code_handle;
  packet.kernarg_address = args;

  // Setup working size.
  const int kNumDimension = 1;
  packet.setup = kNumDimension << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet.grid_size_x = AlignUp(static_cast<uint32_t>(grid_size_x), 64);
  packet.grid_size_y = packet.grid_size_z = 1;
  packet.workgroup_size_x = 64;
  packet.workgroup_size_y = packet.workgroup_size_z = 1;

  packet.completion_signal = completion_signal;

  // Populate queue buffer with AQL packet.
  hsa_kernel_dispatch_packet_t* queue_buffer =
      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
          queue_->public_handle()->base_address);
  std::atomic_thread_fence(std::memory_order_acquire);
  queue_buffer[index & queue_bitmask_] = packet;
  std::atomic_thread_fence(std::memory_order_release);
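  // Publish the packet by writing a valid header last; the release fence above
  // ensures the packet body is visible before the header flips from invalid.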
  queue_buffer[index & queue_bitmask_].header = kDispatchPacketHeader;
}

BlitKernel::KernelArgs* BlitKernel::ObtainAsyncKernelCopyArg() {
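  // Hand out the next slot from a ring of kernarg buffers sized to the queue,
  // so arguments for in-flight dispatches are not overwritten.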
  const uint32_t index =
      atomic::Add(&kernarg_async_counter_, 1U, std::memory_order_acquire) &
      kernarg_async_mask_;

  KernelArgs* arg = &kernarg_async_[index];
  assert(IsMultipleOf(arg, 16));
  return arg;
}

}  // namespace amd