1 /*============================================================================
2 * Definitions, global variables, and base functions for accelerators.
3 *============================================================================*/
4
5 /*
6 This file is part of Code_Saturne, a general-purpose CFD tool.
7
8 Copyright (C) 1998-2021 EDF S.A.
9
10 This program is free software; you can redistribute it and/or modify it under
11 the terms of the GNU General Public License as published by the Free Software
12 Foundation; either version 2 of the License, or (at your option) any later
13 version.
14
15 This program is distributed in the hope that it will be useful, but WITHOUT
16 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
18 details.
19
20 You should have received a copy of the GNU General Public License along with
21 this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22 Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24
25 /*----------------------------------------------------------------------------*/
26
27 #include "cs_defs.h"
28
29 /*----------------------------------------------------------------------------
30 * Standard C library headers
31 *----------------------------------------------------------------------------*/
32
33 #include <assert.h>
34 #include <string.h>
35
36 /*----------------------------------------------------------------------------
37 * Standard C++ library headers
38 *----------------------------------------------------------------------------*/
39
40 #include <map>
41
42 /*----------------------------------------------------------------------------
43 * Local headers
44 *----------------------------------------------------------------------------*/
45
46 #include "bft_error.h"
47 #include "bft_mem.h"
48
49 #if defined(HAVE_CUDA)
50 #include "cs_base_cuda.h"
51 #endif
52
53 /*----------------------------------------------------------------------------
54 * Header for the current file
55 *----------------------------------------------------------------------------*/
56
57 #include "cs_base_accel.h"
58
59 /*----------------------------------------------------------------------------*/
60
61 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
62
63 /*============================================================================
64 * Local Macro Definitions
65 *============================================================================*/
66
67 /*============================================================================
68 * Local Type Definitions
69 *============================================================================*/
70
/*! Tracking record for one host/device allocation pair. */

typedef struct
{
  void  *host_ptr;          //!< host pointer
  void  *device_ptr;        //!< device pointer

  size_t           size;    //!< allocation size
  cs_alloc_mode_t  mode;    //!< allocation mode
} _cs_base_accel_mem_map;
79
80 /*============================================================================
81 * Global variables
82 *============================================================================*/
83
84 static std::map<const void *, _cs_base_accel_mem_map> _hd_alloc_map;
85
86 static bool _initialized = false;
87
88 /*! Default "host+device" allocation mode */
89
90 cs_alloc_mode_t cs_alloc_mode = CS_ALLOC_HOST_DEVICE_SHARED;
91
92 /* Keep track of active device id using OpenMP; usually queried dynamically,
93 but saving the value in this variable can be useful when debugging */
94
95 int cs_glob_omp_target_device_id = -1;
96
97 /*============================================================================
98 * Private function definitions
99 *============================================================================*/
100
101 /*----------------------------------------------------------------------------*/
102 /*!
103 * \brief Reallocate memory on host and device for ni elements of size bytes.
104 *
105 * This function calls the appropriate reallocation function based on
106 * the requested mode, and allows introspection of the allocated memory.
107 *
108 * \param [in] host_ptr host pointer
109 * \param [in] ni number of elements
110 * \param [in] size element size
111 * \param [in] var_name allocated variable name string
112 * \param [in] file_name name of calling source file
113 * \param [in] line_num line number in calling source file
114 *
115 * \returns pointer to allocated memory.
116 */
117 /*----------------------------------------------------------------------------*/
118
119 static void *
_realloc_host(void * host_ptr,size_t ni,size_t size,const char * var_name,const char * file_name,int line_num)120 _realloc_host(void *host_ptr,
121 size_t ni,
122 size_t size,
123 const char *var_name,
124 const char *file_name,
125 int line_num)
126 {
127 return cs_realloc_hd(host_ptr,
128 CS_ALLOC_HOST,
129 ni, size,
130 var_name, file_name, line_num);
131 }
132
133 /*----------------------------------------------------------------------------*/
134 /*!
135 * \brief Initialize memory mapping on device.
136 */
137 /*----------------------------------------------------------------------------*/
138
139 static void
_initialize(void)140 _initialize(void)
141 {
142 bft_mem_alternative_set(cs_get_allocation_hd_size,
143 _realloc_host,
144 cs_free_hd);
145
146 _initialized = true;
147
148 if (cs_get_device_id() < 0)
149 cs_alloc_mode = CS_ALLOC_HOST;
150 }
151
152 #if defined(HAVE_OPENMP_TARGET)
153
154 /*----------------------------------------------------------------------------*/
155 /*!
156 * \brief Allocate n bytes of OpenMP device memory.
157 *
158 * A safety check is added.
159 *
160 * \param [in] n element size
161 * \param [in] var_name allocated variable name string
162 * \param [in] file_name name of calling source file
163 * \param [in] line_num line number in calling source file
164 *
165 * \returns pointer to allocated memory.
166 */
167 /*----------------------------------------------------------------------------*/
168
169 static void *
_omp_target_mem_malloc_device(size_t n,const char * var_name,const char * file_name,int line_num)170 _omp_target_mem_malloc_device(size_t n,
171 const char *var_name,
172 const char *file_name,
173 int line_num)
174 {
175 void *ptr = omp_target_alloc_device(n, cs_glob_omp_target_device_id);
176
177 if (ptr == NULL)
178 bft_error(file_name, line_num, 0,
179 "[OpenMP offload error]: unable to allocate %llu bytes on device\n"
180 " running: %s",
181 (unsigned long long)n, __func__);
182
183 return ptr;
184 }
185
186 /*----------------------------------------------------------------------------*/
187 /*!
188 * \brief Allocate n bytes of host memory using OpenMP Offload.
189 *
 * There is no standard OpenMP way today to mimic cudaMallocHost, i.e. host
 * pinned memory allocation + GPU driver acceleration (DMA/zero copy).
192 *
193 * Closest is Intel proprietary omp_target_alloc_host (accepted in OMP 6.0) or
194 * new omp allocator (pinned) + explicit data transfer
 * Note: omp_target_alloc_host supports implicit data transfer.
196 *
197 * A safety check is added.
198 *
199 * \param [in] n element size
200 * \param [in] var_name allocated variable name string
201 * \param [in] file_name name of calling source file
202 * \param [in] line_num line number in calling source file
203 *
204 * \returns pointer to allocated memory.
205 */
206 /*----------------------------------------------------------------------------*/
207
208 static void *
_omp_target_mem_malloc_host(size_t n,const char * var_name,const char * file_name,int line_num)209 _omp_target_mem_malloc_host(size_t n,
210 const char *var_name,
211 const char *file_name,
212 int line_num)
213 {
214 void *ptr = NULL;
215
216 #if defined(__INTEL_LLVM_COMPILER)
217 ptr = omp_target_alloc_host(n, cs_glob_omp_target_device_id);
218 #else
219 assert(0 && "Not implemented yet");
220 #endif
221
222 if (ptr == NULL)
223 bft_error(file_name, line_num, 0,
224 "[OpenMP offload error]: unable to allocate %llu bytes on host\n"
225 " running: %s",
226 (unsigned long long)n, __func__);
227
228 return ptr;
229 }
230
231 /*----------------------------------------------------------------------------*/
232 /*!
233 * \brief Allocate n bytes of OpenMP Offload managed memory.
234 *
235 * Standards define pragma unified_shared_memory to drive
236 * omp_target_alloc to allocate USM
237 *
238 * Intel proprietary omp_target_alloc_shared (accepted in OMP 6.0) is
239 * another convenient way to do so.
240 *
241 * A safety check is added.
242 *
243 * \param [in] n element size
244 * \param [in] var_name allocated variable name string
245 * \param [in] file_name name of calling source file
246 * \param [in] line_num line number in calling source file
247 *
248 * \returns pointer to allocated memory.
249 */
250 /*----------------------------------------------------------------------------*/
251
static void *
_omp_target_mem_malloc_managed(size_t        n,
                               const char   *var_name,
                               const char   *file_name,
                               int           line_num)
{
#if defined(__INTEL_LLVM_COMPILER)

  /* Intel extension: shared (unified) allocation, migrated on demand. */
  void *ptr = omp_target_alloc_shared(n, cs_glob_omp_target_device_id);

#else

  /* Rely on unified shared memory semantics for a plain target allocation.
     NOTE(review): "requires" directives normally appear at file scope;
     placement inside a function body may be compiler-dependent — confirm. */
#pragma omp requires unified_shared_memory
  void *ptr = omp_target_alloc(n, cs_glob_omp_target_device_id);

#endif

  /* Allocation failure is fatal; report through the standard handler. */

  if (ptr == NULL)
    bft_error(file_name, line_num, 0,
              "[OpenMP offload error]: unable to allocate %llu bytes\n"
              "  running: %s",
              (unsigned long long)n, __func__);

  return ptr;
}
277
278 #endif /* defined(HAVE_OPENMP_TARGET) */
279
280 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
281
282 BEGIN_C_DECLS
283
284 /*============================================================================
285 * Public function definitions
286 *============================================================================*/
287
288 /*----------------------------------------------------------------------------*/
289 /*!
290 * \brief Return currently associated device id.
291 *
292 * \returns currently available device id, or -1 if none is available.
293 */
294 /*----------------------------------------------------------------------------*/
295
int
cs_get_device_id(void)
{
  /* Query the active accelerator runtime; -1 means "no device". */

#if defined(HAVE_CUDA)

  return cs_base_cuda_get_device();

#elif defined(HAVE_OPENMP_TARGET)

  return omp_get_default_device();

#else

  return -1;  /* no accelerator support compiled in */

#endif
}
313
314 /*----------------------------------------------------------------------------*/
315 /*!
316 * \brief Allocate memory on host and device for ni elements of size bytes.
317 *
318 * This function calls the appropriate allocation function based on
319 * the requested mode, and allows introspection of the allocated memory.
320 *
321 * If separate pointers are used on the host and device,
322 * the host pointer is returned.
323 *
324 * \param [in] mode allocation mode
325 * \param [in] ni number of elements
326 * \param [in] size element size
327 * \param [in] var_name allocated variable name string
328 * \param [in] file_name name of calling source file
329 * \param [in] line_num line number in calling source file
330 *
331 * \returns pointer to allocated memory.
332 */
333 /*----------------------------------------------------------------------------*/
334
void *
cs_malloc_hd(cs_alloc_mode_t   mode,
             size_t            ni,
             size_t            size,
             const char       *var_name,
             const char       *file_name,
             int               line_num)
{
  /* Lazy one-time setup; on the very first call the requested mode is
     overridden by the (possibly downgraded) global default mode. */
  if (_initialized == false) {
    _initialize();
    mode = cs_alloc_mode;
  }

  /* Empty allocations map to NULL, consistent with the BFT allocator. */
  if (ni == 0)
    return NULL;

  /* Tracking entry for this allocation. */
  _cs_base_accel_mem_map  me = {
    .host_ptr = NULL,
    .device_ptr = NULL,
    .size = ni * size,
    .mode = mode};

  /* CS_ALLOC_HOST and CS_ALLOC_HOST_DEVICE: host side goes through
     the standard BFT allocator. */
  if (mode < CS_ALLOC_HOST_DEVICE_PINNED)
    me.host_ptr = bft_mem_malloc(ni, size, var_name, file_name, line_num);

  // Device allocation will be postponed later thru call to
  // cs_get_device_ptr. This applies for CS_ALLOC_HOST_DEVICE
  // and CS_ALLOC_HOST_DEVICE_PINNED modes

#if defined(HAVE_CUDA)

  else if (mode == CS_ALLOC_HOST_DEVICE_PINNED)
    me.host_ptr = cs_cuda_mem_malloc_host(me.size,
                                          var_name,
                                          file_name,
                                          line_num);

  else if (mode == CS_ALLOC_HOST_DEVICE_SHARED) {
    /* Managed memory: one allocation addressable from both sides. */
    me.host_ptr = cs_cuda_mem_malloc_managed(me.size,
                                             var_name,
                                             file_name,
                                             line_num);
    me.device_ptr = me.host_ptr;
  }

  else if (mode == CS_ALLOC_DEVICE)
    me.device_ptr = cs_cuda_mem_malloc_device(me.size,
                                              var_name,
                                              file_name,
                                              line_num);

#elif defined(HAVE_OPENMP_TARGET)

  else if (mode == CS_ALLOC_HOST_DEVICE_PINNED)
    me.host_ptr = _omp_target_mem_malloc_host(me.size,
                                              var_name,
                                              file_name,
                                              line_num);

  else if (mode == CS_ALLOC_HOST_DEVICE_SHARED) {
    /* Shared memory: one allocation addressable from both sides. */
    me.host_ptr = _omp_target_mem_malloc_managed(me.size,
                                                 var_name,
                                                 file_name,
                                                 line_num);
    me.device_ptr = me.host_ptr;
  }

  else if (mode == CS_ALLOC_DEVICE)
    me.device_ptr = _omp_target_mem_malloc_device(me.size,
                                                  var_name,
                                                  file_name,
                                                  line_num);

#endif

  /* Register under the host pointer when present, else under the
     device pointer (CS_ALLOC_DEVICE). */
  if (me.host_ptr != NULL)
    _hd_alloc_map[me.host_ptr] = me;
  else if (me.device_ptr != NULL)
    _hd_alloc_map[me.device_ptr] = me;

  /* Return pointer to allocated memory */

  if (me.host_ptr != NULL)
    return me.host_ptr;
  else
    return me.device_ptr;
}
422
423 /*----------------------------------------------------------------------------*/
424 /*!
425 * \brief Reallocate memory on host and device for ni elements of size bytes.
426 *
427 * This function calls the appropriate reallocation function based on
428 * the requested mode, and allows introspection of the allocated memory.
429 *
430 * If separate pointers are used on the host and device,
431 * the host pointer should be used with this function.
432 *
433 * If the allocation parameters are unchanged, no actual reallocation
434 * occurs.
435 *
436 * \param [in] ptr pointer to previously allocated memory
437 * \param [in] mode allocation mode
438 * \param [in] ni number of elements
439 * \param [in] size element size
440 * \param [in] var_name allocated variable name string
441 * \param [in] file_name name of calling source file
442 * \param [in] line_num line number in calling source file
443 *
444 * \returns pointer to allocated memory.
445 */
446 /*----------------------------------------------------------------------------*/
447
448 void *
cs_realloc_hd(void * ptr,cs_alloc_mode_t mode,size_t ni,size_t size,const char * var_name,const char * file_name,int line_num)449 cs_realloc_hd(void *ptr,
450 cs_alloc_mode_t mode,
451 size_t ni,
452 size_t size,
453 const char *var_name,
454 const char *file_name,
455 int line_num)
456 {
457 void *ret_ptr = ptr;
458 size_t new_size = ni*size;
459
460 if (ptr == NULL) {
461 return cs_malloc_hd(mode, ni, size, var_name, file_name, line_num);
462 }
463 else if (new_size == 0) {
464 cs_free_hd(ptr, var_name, file_name, line_num);
465 return NULL;
466 }
467
468 _cs_base_accel_mem_map me;
469
470 if (_hd_alloc_map.count(ptr) == 0) { /* Case where memory was allocated
471 on host only (through BFT_MALLOC) */
472 me = {.host_ptr = ptr,
473 .device_ptr = NULL,
474 .size = bft_mem_get_block_size(ptr),
475 .mode = CS_ALLOC_HOST};
476 _hd_alloc_map[me.host_ptr] = me;
477 }
478 else {
479 me = _hd_alloc_map[ptr];
480 }
481
482 if (new_size == me.size && mode == me.mode) {
483 if (me.host_ptr != NULL)
484 return me.host_ptr;
485 else
486 return me.device_ptr;
487 }
488
489 if ( me.mode <= CS_ALLOC_HOST_DEVICE
490 && me.mode == mode) {
491 me.host_ptr = bft_mem_realloc(me.host_ptr, ni, size,
492 var_name, file_name, line_num);
493 me.size = new_size;
494 _hd_alloc_map.erase(ptr);
495 _hd_alloc_map[me.host_ptr] = me;
496
497 ret_ptr = me.host_ptr;
498 }
499 else {
500 ret_ptr = cs_malloc_hd(mode, 1, me.size,
501 var_name, file_name, line_num);
502
503 memcpy(ret_ptr, ptr, me.size);
504
505 cs_free_hd(ptr, var_name, file_name, line_num);
506 }
507
508 return ret_ptr;
509 }
510
511 /*----------------------------------------------------------------------------*/
512 /*!
513 * \brief Free memory on host and device for a given host pointer.
514 *
515 * If separate pointers are used on the host and device,
516 * the host pointer should be used with this function.
517 *
518 * \param [in] ptr pointer to free
519 * \param [in] var_name allocated variable name string
520 * \param [in] file_name name of calling source file
521 * \param [in] line_num line number in calling source file
522 */
523 /*----------------------------------------------------------------------------*/
524
void
cs_free_hd(void        *ptr,
           const char  *var_name,
           const char  *file_name,
           int          line_num)
{
  if (ptr == NULL)
    return;

  /* Only pointers registered by cs_malloc_hd/cs_realloc_hd (or associated
     through this module) may be freed here; anything else is an error. */
  if (_hd_alloc_map.count(ptr) == 0)
    bft_error(__FILE__, __LINE__, 0,
              _("%s: No host or device pointer matching %p."),
              __func__, ptr);

  _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];

  /* CS_ALLOC_HOST and CS_ALLOC_HOST_DEVICE: host side was allocated
     through the BFT allocator. */
  if (me.mode < CS_ALLOC_HOST_DEVICE_PINNED) {
    bft_mem_free(me.host_ptr, var_name, file_name, line_num);
    me.host_ptr = NULL;
  }

  /* A remaining host pointer implies a pinned or shared allocation,
     which must be released through the device runtime. */
  if (me.host_ptr != NULL) {

#if defined(HAVE_CUDA)

    if (me.mode == CS_ALLOC_HOST_DEVICE_SHARED) {
      /* Shared (managed) memory: single allocation for both sides;
         clear device_ptr so it is not freed a second time below. */
      cs_cuda_mem_free(me.host_ptr, var_name, file_name, line_num);
      me.device_ptr = NULL;
    }

    else
      cs_cuda_mem_free_host(me.host_ptr, var_name, file_name, line_num);

    me.host_ptr = NULL;

#elif defined(HAVE_OPENMP_TARGET)

    omp_target_free(me.host_ptr, cs_glob_omp_target_device_id);
    if (me.mode == CS_ALLOC_HOST_DEVICE_SHARED)
      me.device_ptr = NULL;

    me.host_ptr = NULL;

#endif

  }

  /* Separate device-side allocation, if present. */
  if (me.device_ptr != NULL) {

#if defined(HAVE_CUDA)

    cs_cuda_mem_free(me.device_ptr, var_name, file_name, line_num);
    me.device_ptr = NULL;

#elif defined(HAVE_OPENMP_TARGET)

    omp_target_free(me.device_ptr, cs_glob_omp_target_device_id);
    me.device_ptr = NULL;

#endif

  }

  /* Drop the tracking entry. */
  _hd_alloc_map.erase(ptr);
}
590
591 /*----------------------------------------------------------------------------*/
592 /*!
593 * \brief Free memory on host and device for a given pointer.
594 *
 * Compared to \ref cs_free_hd, this function also allows freeing memory
596 * allocated through BFT_MEM_MALLOC / bft_mem_malloc.
597 *
598 * \param [in] ptr pointer to free
599 * \param [in] var_name allocated variable name string
600 * \param [in] file_name name of calling source file
601 * \param [in] line_num line number in calling source file
602 */
603 /*----------------------------------------------------------------------------*/
604
605 void
cs_free(void * ptr,const char * var_name,const char * file_name,int line_num)606 cs_free(void *ptr,
607 const char *var_name,
608 const char *file_name,
609 int line_num)
610 {
611 if (ptr == NULL)
612 return;
613
614 else if (_hd_alloc_map.count(ptr) == 0) {
615 bft_mem_free(ptr, var_name, file_name, line_num);
616 return;
617 }
618 else
619 cs_free_hd(ptr, var_name, file_name, line_num);
620 }
621
622 /*----------------------------------------------------------------------------*/
623 /*!
624 * \brief Return matching device pointer for a given pointer.
625 *
626 * If separate pointers are used on the host and device,
627 * the host pointer should be used with this function.
628 *
629 * If memory is not allocated on device yet at the call site, it will
630 * be allocated automatically by this function.
631 *
632 * \param [in] ptr pointer
633 *
634 * \returns pointer to device memory.
635 */
636 /*----------------------------------------------------------------------------*/
637
void *
cs_get_device_ptr(void  *ptr)
{
  if (ptr == NULL)
    return NULL;

  /* Unknown pointers are an error here (contrast with cs_check_device_ptr,
     which treats them as host-only). */
  if (_hd_alloc_map.count(ptr) == 0) {
    bft_error(__FILE__, __LINE__, 0,
              _("%s: No host or device pointer matching %p."), __func__, ptr);
    return NULL;
  }

  _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];

  /* Allocate on device if not done yet */

  if (me.device_ptr == NULL) {
    if (   me.mode == CS_ALLOC_HOST_DEVICE
        || me.mode == CS_ALLOC_HOST_DEVICE_PINNED) {
#if defined(HAVE_CUDA)

      me.device_ptr = cs_cuda_mem_malloc_device(me.size,
                                                "me.device_ptr",
                                                __FILE__,
                                                __LINE__);

#elif defined(HAVE_OPENMP_TARGET)

      me.device_ptr = _omp_target_mem_malloc_device(me.size,
                                                    "me.device_ptr",
                                                    __FILE__,
                                                    __LINE__);

      /* Map the host block to the new device block so later target
         regions can reference the host address directly. */
      if (omp_target_associate_ptr(me.host_ptr, me.device_ptr, me.size, 0,
                                   cs_glob_omp_target_device_id))
        bft_error(__FILE__, __LINE__, 0,
                  _("%s: Can't associate host pointer %p to device pointer %p."),
                  "omp_target_associate_ptr", me.host_ptr, me.device_ptr);

#endif

      /* Record the new device pointer in the tracking map. */
      _hd_alloc_map[ptr] = me;

    }
  }

  return me.device_ptr;
}
686
687 /*----------------------------------------------------------------------------*/
688 /*!
689 * \brief Return matching device pointer for a given constant pointer.
690 *
691 * If separate pointers are used on the host and device,
692 * the host pointer should be used with this function.
693 *
694 * If memory is not allocated on device yet at the call site, it will
695 * be allocated automatically by this function.
696 *
697 * \param [in] ptr pointer
698 *
699 * \returns pointer to device memory.
700 */
701 /*----------------------------------------------------------------------------*/
702
const void *
cs_get_device_ptr_const(const void  *ptr)
{
  if (ptr == NULL)
    return NULL;

  /* Unknown pointers are an error here. */
  if (_hd_alloc_map.count(ptr) == 0) {
    bft_error(__FILE__, __LINE__, 0,
              _("%s: No host or device pointer matching %p."), __func__, ptr);
    return NULL;
  }

  _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];

  /* Allocate and sync on device if not done yet */

  if (me.device_ptr == NULL) {
    if (   me.mode == CS_ALLOC_HOST_DEVICE
        || me.mode == CS_ALLOC_HOST_DEVICE_PINNED) {
#if defined(HAVE_CUDA)

      me.device_ptr = cs_cuda_mem_malloc_device(me.size,
                                                "me.device_ptr",
                                                __FILE__,
                                                __LINE__);

#elif defined(HAVE_OPENMP_TARGET)

      me.device_ptr = _omp_target_mem_malloc_device(me.size,
                                                    "me.device_ptr",
                                                    __FILE__,
                                                    __LINE__);

#endif

      /* Register the device block, then push current host values to it
         (contrary to the non-const variant, a sync is done here). */
      _hd_alloc_map[ptr] = me;
      cs_sync_h2d(ptr);

    }
  }

  return me.device_ptr;
}
746
747 /*----------------------------------------------------------------------------*/
748 /*!
749 * \brief Return matching device pointer for a given constant pointer,
750 * prefetching if applicable.
751 *
752 * If separate pointers are used on the host and device, the host pointer
753 * should be used with this function. In this case, it is assumed that
754 * the host and device values have already been synchronized, unless
755 * memory is not allocated on device yet at the call site, in which case
756 * it will be allocated automatically by this function.
757 *
758 * \param [in] ptr pointer
759 *
760 * \returns pointer to device memory.
761 */
762 /*----------------------------------------------------------------------------*/
763
const void *
cs_get_device_ptr_const_pf(const void  *ptr)
{
  if (ptr == NULL)
    return NULL;

  /* Unknown pointers are an error here. */
  if (_hd_alloc_map.count(ptr) == 0) {
    bft_error(__FILE__, __LINE__, 0,
              _("%s: No host or device pointer matching %p."), __func__, ptr);
    return NULL;
  }

  _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];

  /* Allocate and sync on device if not done yet */

  if (me.device_ptr == NULL) {
    if (   me.mode == CS_ALLOC_HOST_DEVICE
        || me.mode == CS_ALLOC_HOST_DEVICE_PINNED) {
#if defined(HAVE_CUDA)

      me.device_ptr = cs_cuda_mem_malloc_device(me.size,
                                                "me.device_ptr",
                                                __FILE__,
                                                __LINE__);

#elif defined(HAVE_OPENMP_TARGET)

      me.device_ptr = _omp_target_mem_malloc_device(me.size,
                                                    "me.device_ptr",
                                                    __FILE__,
                                                    __LINE__);

#endif

      /* Register the device block, then push current host values to it. */
      _hd_alloc_map[ptr] = me;
      cs_sync_h2d(ptr);

    }
  }

  /* Prefetch if shared */

  else if (me.mode == CS_ALLOC_HOST_DEVICE_SHARED)
    cs_prefetch_h2d(me.host_ptr, me.size);

  return me.device_ptr;
}
812
813 /*----------------------------------------------------------------------------*/
814 /*!
815 * \brief Check if a pointer is associated with a device.
816 *
817 * If separate pointers are used on the host and device,
818 * the host pointer should be used with this function.
819 *
820 * \returns allocation mode associated with pointer
821 */
822 /*----------------------------------------------------------------------------*/
823
824 cs_alloc_mode_t
cs_check_device_ptr(const void * ptr)825 cs_check_device_ptr(const void *ptr)
826 {
827 if (_hd_alloc_map.count(ptr) == 0)
828 return CS_ALLOC_HOST;
829
830 _cs_base_accel_mem_map me = _hd_alloc_map[ptr];
831 return me.mode;
832 }
833
834 /*----------------------------------------------------------------------------*/
835 /*!
836 * \brief Associate device memory with a given host memory pointer.
837 *
838 * If the host memory is already associated with the device, the existing
839 * device pointer is returned. Otherwise, a new device allocation is
840 * called and returned.
841 *
842 * \param [in] host_ptr host pointer
843 * \param [in] ni number of elements
844 * \param [in] size element size
845 *
846 * \returns pointer to allocated memory.
847 */
848 /*----------------------------------------------------------------------------*/
849
850 void *
cs_associate_device_ptr(void * host_ptr,size_t ni,size_t size)851 cs_associate_device_ptr(void *host_ptr,
852 size_t ni,
853 size_t size)
854 {
855 if (_hd_alloc_map.count(host_ptr) == 0) {
856
857 _cs_base_accel_mem_map me = {
858 .host_ptr = host_ptr,
859 .device_ptr = NULL,
860 .size = ni * size,
861 .mode = CS_ALLOC_HOST_DEVICE};
862
863 _hd_alloc_map[me.host_ptr] = me;
864
865 }
866
867 return cs_get_device_ptr(host_ptr);
868 }
869
870 /*----------------------------------------------------------------------------*/
871 /*!
872 * \brief Detach device memory from a given host memory pointer.
873 *
874 * If the host memory is shared with the device (i.e. using CS_ALLOC_SHARED),
875 * device memory stays shared.
876 *
877 * \param [in] host_ptr host pointer
878 */
879 /*----------------------------------------------------------------------------*/
880
881 void
cs_dissassociate_device_ptr(void * host_ptr)882 cs_dissassociate_device_ptr(void *host_ptr)
883 {
884 if (_hd_alloc_map.count(host_ptr) == 0)
885 return;
886
887 _cs_base_accel_mem_map me = _hd_alloc_map[host_ptr];
888
889 if (me.device_ptr != NULL) {
890
891 #if defined(HAVE_CUDA)
892
893 if (me.mode == CS_ALLOC_HOST_DEVICE)
894 cs_cuda_mem_free(me.device_ptr, "me.device_ptr", __FILE__, __LINE__);
895
896 #elif defined(HAVE_OPENMP_TARGET)
897
898 if (me.mode == CS_ALLOC_HOST_DEVICE)
899 omp_target_free(me.device_ptr, cs_glob_omp_target_device_id);
900
901 #endif
902
903 }
904 }
905
906 /*----------------------------------------------------------------------------*/
907 /*!
908 * \brief Set allocation mode for an already allocated pointer.
909 *
910 * If the allocation mode is different from the previous one,
911 * the associated memory will be reallocated with the desired mode,
912 * and the previous allocation freed.
913 *
914 * \param [in, out] host_ptr pointer to host pointer to modify
915 * \param [in] mode desired allocation mode
916 */
917 /*----------------------------------------------------------------------------*/
918
void
cs_set_alloc_mode(void             **host_ptr,
                  cs_alloc_mode_t    mode)
{
  if (host_ptr == NULL)
    return;

  void *ret_ptr = *host_ptr;

  void *_host_ptr = *host_ptr;

  if (_host_ptr == NULL)
    return;

  /* Pointers allocated outside this module (plain BFT_MALLOC) are first
     registered as host-only allocations. */
  if (_hd_alloc_map.count(_host_ptr) == 0) {

    _cs_base_accel_mem_map  me = {
      .host_ptr = _host_ptr,
      .device_ptr = NULL,
      .size = bft_mem_get_block_size(_host_ptr),
      .mode = CS_ALLOC_HOST};

    _hd_alloc_map[me.host_ptr] = me;

  }

  cs_alloc_mode_t old_mode = cs_check_device_ptr(_host_ptr);

  if (mode != old_mode) {

    _cs_base_accel_mem_map  me = _hd_alloc_map[_host_ptr];

    /* Drop any separate device-side block before switching modes. */
    if (old_mode == CS_ALLOC_HOST_DEVICE)
      cs_dissassociate_device_ptr(_host_ptr);

    /* Only transitions to or from shared (managed) memory trigger an
       actual reallocation and copy here.
       NOTE(review): for other transitions the map entry's mode field
       does not appear to be updated — TODO confirm this is intended. */
    if (   mode == CS_ALLOC_HOST_DEVICE_SHARED
        || old_mode == CS_ALLOC_HOST_DEVICE_SHARED) {

      ret_ptr = cs_malloc_hd(mode, 1, me.size,
                             "me.host_ptr", __FILE__, __LINE__);

      /* TODO: check if we have multiple OpenMP threads, in which
         case applying a "first-touch" policy might be useful here */

      memcpy(ret_ptr, _host_ptr, me.size);

      cs_free_hd(_host_ptr, "me.host_ptr", __FILE__, __LINE__);

    }

  }

  /* Hand the (possibly reallocated) pointer back to the caller. */
  *host_ptr = ret_ptr;
}
973
974 /*----------------------------------------------------------------------------*/
975 /*!
976 * \brief Synchronize data from host to device.
977 *
978 * If separate pointers are used on the host and device,
979 * the host pointer should be used with this function.
980 *
981 * Depending on the allocation type, this can imply a copy, data prefetch,
982 * or a no-op.
983 *
984 * This function assumes the provided pointer was allocated using
985 * CS_MALLOC_HD or CS_REALLOC_HD, as it uses the associated mapping to
986 * determine associated metadata.
987 *
988 * \param [in, out] ptr host pointer to values to copy or prefetch
989 */
990 /*----------------------------------------------------------------------------*/
991
void
cs_sync_h2d(const void  *ptr)
{
  /* Untracked (host-only, BFT-allocated) pointers: nothing to do. */
  if (_hd_alloc_map.count(ptr) == 0)
    return;

  _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];

  /* Lazily allocate device memory if needed; cs_get_device_ptr_const
     updates the map entry itself, only the local copy is refreshed here. */
  if (me.device_ptr == NULL)
    me.device_ptr = const_cast<void *>(cs_get_device_ptr_const(ptr));

#if defined(HAVE_CUDA)

  switch (me.mode) {
  case CS_ALLOC_HOST:
    bft_error(__FILE__, __LINE__, 0,
              _("%s: %p allocated on host only."),
              __func__, ptr);
    break;
  case CS_ALLOC_HOST_DEVICE:
    /* Synchronous copy to the separate device block. */
    cs_cuda_copy_h2d(me.device_ptr, me.host_ptr, me.size);
    break;
  case CS_ALLOC_HOST_DEVICE_PINNED:
    /* Pinned host memory allows an asynchronous copy. */
    cs_cuda_copy_h2d_async(me.device_ptr, me.host_ptr, me.size);
    break;
  case CS_ALLOC_HOST_DEVICE_SHARED:
    /* Managed memory: a prefetch hint is sufficient. */
    cs_cuda_prefetch_h2d(me.device_ptr, me.size);
    break;
  case CS_ALLOC_DEVICE:
    bft_error(__FILE__, __LINE__, 0,
              _("%s: %p allocated on device only."),
              __func__, ptr);
    break;
  }

#elif defined(HAVE_OPENMP_TARGET)

  switch (me.mode) {

  case CS_ALLOC_HOST:
    bft_error(__FILE__, __LINE__, 0,
              _("%s: %p allocated on host only."),
              __func__, ptr);
    break;
  case CS_ALLOC_HOST_DEVICE:
    omp_target_memcpy(me.device_ptr, me.host_ptr, me.size, 0, 0,
                      cs_glob_omp_target_device_id, omp_get_initial_device());
    break;
  case CS_ALLOC_HOST_DEVICE_PINNED:
    {
      /* NOTE(review): the local is named host_ptr but is initialized from
         me.device_ptr — confirm this is the intended mapped address. */
      char *host_ptr = (char *)me.device_ptr;
#pragma omp target enter data map(to:host_ptr[:me.size]) \
  nowait device(cs_glob_omp_target_device_id)
    }
    break;
  case CS_ALLOC_HOST_DEVICE_SHARED:
    {
      char *host_ptr = (char *)me.host_ptr;
#pragma omp target enter data map(to:host_ptr[:me.size]) \
  nowait device(cs_glob_omp_target_device_id)
    }
    break;
  case CS_ALLOC_DEVICE:
    bft_error(__FILE__, __LINE__, 0,
              _("%s: %p allocated on device only."),
              __func__, ptr);
    break;
  }

#endif
}
1063
1064 /*----------------------------------------------------------------------------*/
1065 /*!
1066 * \brief Initiate synchronization of data from host to device for
1067 * future access.
1068 *
1069 * If separate pointers are used on the host and device,
1070 * the host pointer should be used with this function.
 * In this case, synchronization is started (asynchronously
 * if the allocation mode supports it).
 *
 * In other cases, synchronization will be delayed until actual use.
1076 *
1077 * Depending on the allocation type, this can imply a copy, data prefetch,
1078 * or a no-op.
1079 *
1080 * This function assumes the provided pointer was allocated using
1081 * CS_MALLOC_HD or CS_REALLOC_HD, as it uses the associated mapping to
1082 * determine associated metadata.
1083 *
1084 * \param [in, out] ptr host pointer to values to copy or prefetch
1085 */
1086 /*----------------------------------------------------------------------------*/
1087
1088 void
cs_sync_h2d_future(const void * ptr)1089 cs_sync_h2d_future(const void *ptr)
1090 {
1091 if (_hd_alloc_map.count(ptr) == 0)
1092 return;
1093
1094 _cs_base_accel_mem_map me = _hd_alloc_map[ptr];
1095
1096 #if defined(HAVE_CUDA)
1097
1098 switch (me.mode) {
1099 case CS_ALLOC_HOST_DEVICE:
1100 cs_cuda_copy_h2d(me.device_ptr, me.host_ptr, me.size);
1101 break;
1102 case CS_ALLOC_HOST_DEVICE_PINNED:
1103 cs_cuda_copy_h2d_async(me.device_ptr, me.host_ptr, me.size);
1104 break;
1105 default:
1106 break;
1107 }
1108
1109 #elif defined(HAVE_OPENMP_TARGET)
1110
1111 switch (me.mode) {
1112 case CS_ALLOC_HOST_DEVICE:
1113 omp_target_memcpy(me.device_ptr, me.host_ptr, me.size, 0, 0,
1114 cs_glob_omp_target_device_id, omp_get_initial_device());
1115 break;
1116 case CS_ALLOC_HOST_DEVICE_PINNED:
1117 {
1118 char *host_ptr = (char *)me.device_ptr;
1119 #pragma omp target enter data map(to:host_ptr[:me.size]) \
1120 nowait device(cs_glob_omp_target_device_id)
1121 }
1122 break;
1123 default:
1124 break;
1125 }
1126
1127 #endif
1128 }
1129
1130 /*----------------------------------------------------------------------------*/
1131 /*!
1132 * \brief Synchronize data from device to host.
1133 *
1134 * If separate allocations are used on the host and device
1135 * (mode == CS_ALLOC_HOST_DEVICE), the host pointer should be passed to this
1136 * function.
1137 *
 * Depending on the allocation type, this can imply a copy, data prefetch,
1139 * or a no-op.
1140 *
1141 * This function assumes the provided pointer was allocated using
1142 * CS_MALLOC_HD or CS_REALLOC_HD, as it uses the associated mapping to
1143 * determine associated metadata.
1144 *
1145 * \param [in, out] ptr pointer to values to copy or prefetch
1146 */
1147 /*----------------------------------------------------------------------------*/
1148
1149 void
cs_sync_d2h(void * ptr)1150 cs_sync_d2h(void *ptr)
1151 {
1152 if (_hd_alloc_map.count(ptr) == 0)
1153 return;
1154
1155 _cs_base_accel_mem_map me = _hd_alloc_map[ptr];
1156
1157 #if defined(HAVE_CUDA)
1158
1159 switch (me.mode) {
1160 case CS_ALLOC_HOST:
1161 bft_error(__FILE__, __LINE__, 0,
1162 _("%s: %p allocated on host only."),
1163 __func__, ptr);
1164 break;
1165 case CS_ALLOC_HOST_DEVICE:
1166 cs_cuda_copy_d2h(me.host_ptr, me.device_ptr, me.size);
1167 break;
1168 case CS_ALLOC_HOST_DEVICE_PINNED:
1169 cs_cuda_copy_d2h_async(me.host_ptr, me.device_ptr, me.size);
1170 break;
1171 case CS_ALLOC_HOST_DEVICE_SHARED:
1172 cs_cuda_prefetch_d2h(me.host_ptr, me.size);
1173 break;
1174 case CS_ALLOC_DEVICE:
1175 bft_error(__FILE__, __LINE__, 0,
1176 _("%s: %p allocated on device only."),
1177 __func__, ptr);
1178 break;
1179 }
1180
1181 #elif defined(HAVE_OPENMP_TARGET)
1182
1183 switch (me.mode) {
1184 case CS_ALLOC_HOST:
1185 bft_error(__FILE__, __LINE__, 0,
1186 _("%s: %p allocated on host only."),
1187 __func__, ptr);
1188 break;
1189 case CS_ALLOC_HOST_DEVICE:
1190 omp_target_memcpy(me.host_ptr, me.device_ptr, me.size, 0, 0,
1191 omp_get_initial_device(), cs_glob_omp_target_device_id);
1192
1193 break;
1194 case CS_ALLOC_HOST_DEVICE_PINNED:
1195 {
1196 char *host_ptr = (char *)me.host_ptr;
1197 #pragma omp target exit data map(from:host_ptr[:me.size]) \
1198 nowait device(cs_glob_omp_target_device_id)
1199 }
1200 break;
1201 case CS_ALLOC_HOST_DEVICE_SHARED:
1202 {
1203 char *host_ptr = (char *)me.host_ptr;
1204 #pragma omp target exit data map(from:host_ptr[:me.size]) \
1205 nowait device(cs_glob_omp_target_device_id)
1206 }
1207 break;
1208 case CS_ALLOC_DEVICE:
1209 bft_error(__FILE__, __LINE__, 0,
1210 _("%s: %p allocated on device only."),
1211 __func__, ptr);
1212 break;
1213 }
1214
1215 #endif
1216 }
1217
1218 /*----------------------------------------------------------------------------*/
1219 /*!
1220 * \brief Prefetch data from host to device.
1221 *
1222 * This function should only be used on arrays using shared host and device
 * memory, such as those allocated using CS_ALLOC_HOST_DEVICE_SHARED.
1224 * It should be usable on a subset of such an array.
1225 *
1226 * \param [in, out] ptr pointer to data to prefetch
1227 * \param [in] size number of bytes to prefetch
1228 */
1229 /*----------------------------------------------------------------------------*/
1230
void
cs_prefetch_h2d(void    *ptr,
                size_t   size)
{
#if defined(HAVE_CUDA)

  /* CUDA backend: delegate to the unified-memory prefetch helper. */
  cs_cuda_prefetch_h2d(ptr, size);

#elif defined(HAVE_OPENMP_TARGET)

  /* OpenMP offload backend: start an asynchronous mapping of the
     given byte range to the current target device. */
  char *h_ptr = (char *)ptr;
  #pragma omp target enter data map(to:h_ptr[:size]) \
    nowait device(cs_glob_omp_target_device_id)

#endif
}
1247
1248 /*----------------------------------------------------------------------------*/
1249 /*!
1250 * \brief Prefetch data from device to host.
1251 *
1252 * This function should only be used on arrays using shared host and device
 * memory, such as those allocated using CS_ALLOC_HOST_DEVICE_SHARED.
1254 * It should be usable on a subset of such an array.
1255 *
1256 * \param [in, out] ptr pointer to data to prefetch
1257 * \param [in] size number of bytes to prefetch
1258 */
1259 /*----------------------------------------------------------------------------*/
1260
void
cs_prefetch_d2h(void    *ptr,
                size_t   size)
{
#if defined(HAVE_CUDA)

  /* CUDA backend: delegate to the unified-memory prefetch helper. */
  cs_cuda_prefetch_d2h(ptr, size);

#elif defined(HAVE_OPENMP_TARGET)

  /* OpenMP offload backend: start an asynchronous unmapping of the
     given byte range from the current target device. */
  char *h_ptr = (char *)ptr;
  #pragma omp target exit data map(from:h_ptr[:size]) \
    nowait device(cs_glob_omp_target_device_id)

#endif
}
1277
1278 /*----------------------------------------------------------------------------*/
1279 /*!
1280 * \brief Copy data from host to device.
1281 *
1282 * This function should be usable on subsets of arrays allocated on the host
1283 * and device.
1284 *
 * \param [out]  dest  pointer to destination data on device
 * \param [in]   src   pointer to source data on host
 * \param [in]   size  number of bytes to copy
1288 */
1289 /*----------------------------------------------------------------------------*/
1290
void
cs_copy_h2d(void        *dest,
            const void  *src,
            size_t       size)
{
#if defined(HAVE_CUDA)

  /* CUDA backend: synchronous host-to-device copy. */
  cs_cuda_copy_h2d(dest, src, size);

#elif defined(HAVE_OPENMP_TARGET)

  /* OpenMP offload backend: copy from the initial (host) device
     to the current target device. */
  omp_target_memcpy(dest, src, size, 0, 0,
                    cs_glob_omp_target_device_id,
                    omp_get_initial_device());

#endif
}
1307
1308 /*----------------------------------------------------------------------------*/
1309 /*!
1310 * \brief Copy data from device to host.
1311 *
1312 * This function should be usable on subsets of arrays allocated on the host
1313 * and device.
1314 *
 * \param [out]  dest  pointer to destination data on host
 * \param [in]   src   pointer to source data on device
 * \param [in]   size  number of bytes to copy
1318 */
1319 /*----------------------------------------------------------------------------*/
1320
void
cs_copy_d2h(void        *dest,
            const void  *src,
            size_t       size)
{
#if defined(HAVE_CUDA)

  /* CUDA backend: synchronous device-to-host copy. */
  cs_cuda_copy_d2h(dest, src, size);

#elif defined(HAVE_OPENMP_TARGET)

  /* OpenMP offload backend: copy from the current target device
     back to the initial (host) device. */
  omp_target_memcpy(dest, src, size, 0, 0,
                    omp_get_initial_device(),
                    cs_glob_omp_target_device_id);

#endif
}
1337
1338 /*----------------------------------------------------------------------------*/
1339 /*!
1340 * \brief Copy data from device to device.
1341 *
1342 * This function should be usable on subsets of arrays allocated on the host
1343 * and device.
1344 *
 * \param [out]  dest  pointer to destination data on device
 * \param [in]   src   pointer to source data on device
 * \param [in]   size  number of bytes to copy
1348 */
1349 /*----------------------------------------------------------------------------*/
1350
void
cs_copy_d2d(void        *dest,
            const void  *src,
            size_t       size)
{
#if defined(HAVE_CUDA)

  /* CUDA backend: device-to-device copy. */
  cs_cuda_copy_d2d(dest, src, size);

#elif defined(HAVE_OPENMP_TARGET)

  /* OpenMP offload backend: both source and destination reside on
     the current target device. */
  omp_target_memcpy(dest, src, size, 0, 0,
                    cs_glob_omp_target_device_id,
                    cs_glob_omp_target_device_id);

#endif
}
1367
1368 /*----------------------------------------------------------------------------*/
1369 /*!
1370 * \brief Return number of host-device allocations
1371 *
1372 * \returns current number of host-device allocations.
1373 */
1374 /*----------------------------------------------------------------------------*/
1375
1376 int
cs_get_n_allocations_hd(void)1377 cs_get_n_allocations_hd(void)
1378 {
1379 return _hd_alloc_map.size();
1380 }
1381
1382 /*----------------------------------------------------------------------------*/
1383 /*!
 * \brief Check if a given host pointer is associated with an allocation
 *        made using cs_alloc_hd or cs_realloc_hd.
1386 *
1387 * \returns allocated memory size, or zero if not allocated with this
1388 * mechanism.
1389 */
1390 /*----------------------------------------------------------------------------*/
1391
1392 size_t
cs_get_allocation_hd_size(void * host_ptr)1393 cs_get_allocation_hd_size(void *host_ptr)
1394 {
1395 if (_hd_alloc_map.count(host_ptr) == 0)
1396 return 0;
1397
1398 _cs_base_accel_mem_map me = _hd_alloc_map[host_ptr];
1399 return me.size;
1400 }
1401
1402 #if defined(HAVE_OPENMP_TARGET)
1403
1404 /*----------------------------------------------------------------------------*/
1405 /*!
 * \brief Set OpenMP Offload device based on MPI rank and number of devices.
 *
 * The device is chosen from the caller's node-local MPI rank and the
 * number of devices visible on the node; no arguments are required.
 *
 * \return  selected device id, or -1 if no usable device is available
1412 */
1413 /*----------------------------------------------------------------------------*/
1414
1415 int
cs_omp_target_select_default_device(void)1416 cs_omp_target_select_default_device(void)
1417 {
1418 int device_id = omp_get_initial_device();
1419
1420 int n_devices = omp_get_num_devices();
1421
1422 if (cs_glob_rank_id > -1 && n_devices > 1) {
1423
1424 device_id = cs_glob_node_rank_id*n_devices / cs_glob_node_n_ranks;
1425
1426 assert(device_id > -1 && device_id < n_devices);
1427
1428 }
1429
1430 omp_set_default_device(device_id);
1431
1432 cs_glob_omp_target_device_id = device_id;
1433
1434 return device_id;
1435 }
1436
1437 #endif /* defined(HAVE_OPENMP_TARGET) */
1438
1439 /*----------------------------------------------------------------------------*/
1440
1441 END_C_DECLS
1442