1 /*============================================================================
2  * Definitions, global variables, and base functions for accelerators.
3  *============================================================================*/
4 
5 /*
6   This file is part of Code_Saturne, a general-purpose CFD tool.
7 
8   Copyright (C) 1998-2021 EDF S.A.
9 
10   This program is free software; you can redistribute it and/or modify it under
11   the terms of the GNU General Public License as published by the Free Software
12   Foundation; either version 2 of the License, or (at your option) any later
13   version.
14 
15   This program is distributed in the hope that it will be useful, but WITHOUT
16   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
18   details.
19 
20   You should have received a copy of the GNU General Public License along with
21   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
22   Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 */
24 
25 /*----------------------------------------------------------------------------*/
26 
27 #include "cs_defs.h"
28 
29 /*----------------------------------------------------------------------------
30  * Standard C library headers
31  *----------------------------------------------------------------------------*/
32 
33 #include <assert.h>
34 #include <string.h>
35 
36 /*----------------------------------------------------------------------------
37  * Standard C++ library headers
38  *----------------------------------------------------------------------------*/
39 
40 #include <map>
41 
42 /*----------------------------------------------------------------------------
43  * Local headers
44  *----------------------------------------------------------------------------*/
45 
46 #include "bft_error.h"
47 #include "bft_mem.h"
48 
49 #if defined(HAVE_CUDA)
50 #include "cs_base_cuda.h"
51 #endif
52 
53 /*----------------------------------------------------------------------------
54  *  Header for the current file
55  *----------------------------------------------------------------------------*/
56 
57 #include "cs_base_accel.h"
58 
59 /*----------------------------------------------------------------------------*/
60 
61 /*! \cond DOXYGEN_SHOULD_SKIP_THIS */
62 
63 /*============================================================================
64  * Local Macro Definitions
65  *============================================================================*/
66 
67 /*============================================================================
68  * Local Type Definitions
69  *============================================================================*/
70 
71 typedef struct
72 {
73   void  *host_ptr;          //!< host pointer
74   void  *device_ptr;        //!< device pointer
75 
76   size_t           size;    //!< allocation size
77   cs_alloc_mode_t  mode;    //!< allocation mode
78 } _cs_base_accel_mem_map;
79 
80 /*============================================================================
81  *  Global variables
82  *============================================================================*/
83 
84 static std::map<const void *, _cs_base_accel_mem_map> _hd_alloc_map;
85 
86 static bool _initialized = false;
87 
88 /*! Default "host+device" allocation mode */
89 
90 cs_alloc_mode_t  cs_alloc_mode = CS_ALLOC_HOST_DEVICE_SHARED;
91 
92 /* Active OpenMP target device id; it is usually queried dynamically, but
93    saving the value in this variable can be useful when debugging */
94 
95 int  cs_glob_omp_target_device_id = -1;
96 
97 /*============================================================================
98  * Private function definitions
99  *============================================================================*/
100 
101 /*----------------------------------------------------------------------------*/
102 /*!
103  * \brief Reallocate memory on host only, for ni elements of size bytes.
104  *
105  * This wrapper calls cs_realloc_hd with the CS_ALLOC_HOST mode, so it can
106  * be registered with bft_mem as its reallocation function.
107  *
108  * \param [in]  host_ptr   host pointer
109  * \param [in]  ni         number of elements
110  * \param [in]  size       element size
111  * \param [in]  var_name   allocated variable name string
112  * \param [in]  file_name  name of calling source file
113  * \param [in]  line_num   line number in calling source file
114  *
115  * \returns pointer to allocated memory.
116  */
117 /*----------------------------------------------------------------------------*/
118 
119 static void *
120 _realloc_host(void            *host_ptr,
121               size_t           ni,
122               size_t           size,
123               const char      *var_name,
124               const char      *file_name,
125               int              line_num)
126 {
127   return cs_realloc_hd(host_ptr,
128                        CS_ALLOC_HOST,
129                        ni, size,
130                        var_name, file_name, line_num);
131 }
132 
133 /*----------------------------------------------------------------------------*/
134 /*!
135  * \brief Initialize device memory handling (allocation hooks and default mode).
136  */
137 /*----------------------------------------------------------------------------*/
138 
139 static void
140 _initialize(void)
141 {
142   bft_mem_alternative_set(cs_get_allocation_hd_size,
143                           _realloc_host,
144                           cs_free_hd);
145 
146   _initialized = true;
147 
148   if (cs_get_device_id() < 0)
149     cs_alloc_mode = CS_ALLOC_HOST;
150 }
151 
152 #if defined(HAVE_OPENMP_TARGET)
153 
154 /*----------------------------------------------------------------------------*/
155 /*!
156  * \brief Allocate n bytes of OpenMP device memory.
157  *
158  * A safety check is added.
159  *
160  * \param [in]  n          allocation size, in bytes
161  * \param [in]  var_name   allocated variable name string
162  * \param [in]  file_name  name of calling source file
163  * \param [in]  line_num   line number in calling source file
164  *
165  * \returns pointer to allocated memory.
166  */
167 /*----------------------------------------------------------------------------*/
168 
169 static void *
170 _omp_target_mem_malloc_device(size_t        n,
171                               const char   *var_name,
172                               const char   *file_name,
173                               int           line_num)
174 {
175   void *ptr = omp_target_alloc_device(n, cs_glob_omp_target_device_id);
176 
177   if (ptr == NULL)
178     bft_error(file_name, line_num, 0,
179               "[OpenMP offload error]: unable to allocate %llu bytes on device\n"
180               "  running: %s",
181               (unsigned long long)n, __func__);
182 
183   return ptr;
184 }
185 
186 /*----------------------------------------------------------------------------*/
187 /*!
188  * \brief Allocate n bytes of host memory using OpenMP Offload.
189  *
190  * There is currently no standard OpenMP way to mimic cudaMallocHost, i.e.
191  * host-pinned memory allocation with GPU driver acceleration (DMA/zero copy).
192  *
193  * The closest options are the Intel-specific omp_target_alloc_host (accepted
194  * for OpenMP 6.0) or a new OpenMP allocator (pinned) with explicit data
195  * transfers. Note: omp_target_alloc_host supports implicit data transfer.
196  *
197  * A safety check is added.
198  *
199  * \param [in]  n          allocation size, in bytes
200  * \param [in]  var_name   allocated variable name string
201  * \param [in]  file_name  name of calling source file
202  * \param [in]  line_num   line number in calling source file
203  *
204  * \returns pointer to allocated memory.
205  */
206 /*----------------------------------------------------------------------------*/
207 
208 static void *
209 _omp_target_mem_malloc_host(size_t        n,
210                             const char   *var_name,
211                             const char   *file_name,
212                             int           line_num)
213 {
214   void *ptr = NULL;
215 
216 #if defined(__INTEL_LLVM_COMPILER)
217   ptr = omp_target_alloc_host(n, cs_glob_omp_target_device_id);
218 #else
219   assert(0 && "Not implemented yet");
220 #endif
221 
222   if (ptr == NULL)
223     bft_error(file_name, line_num, 0,
224               "[OpenMP offload error]: unable to allocate %llu bytes on host\n"
225               "  running: %s",
226               (unsigned long long)n, __func__);
227 
228   return ptr;
229 }
230 
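/*
  Sketch (not used by this file) of the "OpenMP allocator (pinned) + explicit
  data transfer" alternative mentioned above, using only standard OpenMP 5.x
  allocator routines; the variable names and size are illustrative.

    size_t nbytes = 1024;                        // hypothetical size
    omp_alloctrait_t traits[] = {{omp_atk_pinned, omp_atv_true}};
    omp_allocator_handle_t pinned_alloc
      = omp_init_allocator(omp_default_mem_space, 1, traits);

    void *p = omp_alloc(nbytes, pinned_alloc);   // pinned host allocation
    // ... explicit transfers, e.g. omp_target_memcpy, go here ...
    omp_free(p, pinned_alloc);
    omp_destroy_allocator(pinned_alloc);
*/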
231 /*----------------------------------------------------------------------------*/
232 /*!
233  * \brief Allocate n bytes of OpenMP Offload managed memory.
234  *
235  * The standard defines the unified_shared_memory requirement to make
236  * omp_target_alloc return unified shared memory (USM).
237  *
238  * The Intel-specific omp_target_alloc_shared (accepted for OpenMP 6.0) is
239  * another convenient way to do so.
240  *
241  * A safety check is added.
242  *
243  * \param [in]  n          allocation size, in bytes
244  * \param [in]  var_name   allocated variable name string
245  * \param [in]  file_name  name of calling source file
246  * \param [in]  line_num   line number in calling source file
247  *
248  * \returns pointer to allocated memory.
249  */
250 /*----------------------------------------------------------------------------*/
251 
252 static void *
253 _omp_target_mem_malloc_managed(size_t        n,
254                                const char   *var_name,
255                                const char   *file_name,
256                                int           line_num)
257 {
258 #if defined(__INTEL_LLVM_COMPILER)
259 
260   void *ptr = omp_target_alloc_shared(n, cs_glob_omp_target_device_id);
261 
262 #else
263 
264 #pragma omp requires unified_shared_memory
265   void *ptr = omp_target_alloc(n, cs_glob_omp_target_device_id);
266 
267 #endif
268 
269   if (ptr == NULL)
270     bft_error(file_name, line_num, 0,
271               "[OpenMP offload error]: unable to allocate %llu bytes\n"
272               "  running: %s",
273               (unsigned long long)n, __func__);
274 
275   return ptr;
276 }
277 
278 #endif /* defined(HAVE_OPENMP_TARGET) */
279 
280 /*! (DOXYGEN_SHOULD_SKIP_THIS) \endcond */
281 
282 BEGIN_C_DECLS
283 
284 /*============================================================================
285  * Public function definitions
286  *============================================================================*/
287 
288 /*----------------------------------------------------------------------------*/
289 /*!
290  * \brief Return currently associated device id.
291  *
292  * \returns currently available device id, or -1 if none is available.
293  */
294 /*----------------------------------------------------------------------------*/
295 
296 int
297 cs_get_device_id(void)
298 {
299   int retval = -1;
300 
301 #if defined(HAVE_CUDA)
302 
303   retval = cs_base_cuda_get_device();
304 
305 #elif defined (HAVE_OPENMP_TARGET)
306 
307   retval = omp_get_default_device();
308 
309 #endif
310 
311   return retval;
312 }
313 
314 /*----------------------------------------------------------------------------*/
315 /*!
316  * \brief Allocate memory on host and device for ni elements of size bytes.
317  *
318  * This function calls the appropriate allocation function based on
319  * the requested mode, and allows introspection of the allocated memory.
320  *
321  * If separate pointers are used on the host and device,
322  * the host pointer is returned.
323  *
324  * \param [in]  mode       allocation mode
325  * \param [in]  ni         number of elements
326  * \param [in]  size       element size
327  * \param [in]  var_name   allocated variable name string
328  * \param [in]  file_name  name of calling source file
329  * \param [in]  line_num   line number in calling source file
330  *
331  * \returns pointer to allocated memory.
332  */
333 /*----------------------------------------------------------------------------*/
334 
335 void *
336 cs_malloc_hd(cs_alloc_mode_t   mode,
337              size_t            ni,
338              size_t            size,
339              const char       *var_name,
340              const char       *file_name,
341              int               line_num)
342 {
343   if (_initialized == false) {
344     _initialize();
345     mode = cs_alloc_mode;
346   }
347 
348   if (ni == 0)
349     return NULL;
350 
351   _cs_base_accel_mem_map  me = {
352     .host_ptr = NULL,
353     .device_ptr = NULL,
354     .size = ni * size,
355     .mode = mode};
356 
357   if (mode < CS_ALLOC_HOST_DEVICE_PINNED)
358     me.host_ptr = bft_mem_malloc(ni, size, var_name, file_name, line_num);
359 
360   // Device allocation is postponed until a later call to
361   // cs_get_device_ptr. This applies to the CS_ALLOC_HOST_DEVICE
362   // and CS_ALLOC_HOST_DEVICE_PINNED modes.
363 
364 #if defined(HAVE_CUDA)
365 
366   else if (mode == CS_ALLOC_HOST_DEVICE_PINNED)
367     me.host_ptr = cs_cuda_mem_malloc_host(me.size,
368                                           var_name,
369                                           file_name,
370                                           line_num);
371 
372   else if (mode == CS_ALLOC_HOST_DEVICE_SHARED) {
373     me.host_ptr = cs_cuda_mem_malloc_managed(me.size,
374                                              var_name,
375                                              file_name,
376                                              line_num);
377     me.device_ptr = me.host_ptr;
378   }
379 
380   else if (mode == CS_ALLOC_DEVICE)
381     me.device_ptr = cs_cuda_mem_malloc_device(me.size,
382                                               var_name,
383                                               file_name,
384                                               line_num);
385 
386 #elif defined(HAVE_OPENMP_TARGET)
387 
388   else if (mode == CS_ALLOC_HOST_DEVICE_PINNED)
389     me.host_ptr = _omp_target_mem_malloc_host(me.size,
390                                               var_name,
391                                               file_name,
392                                               line_num);
393 
394   else if (mode == CS_ALLOC_HOST_DEVICE_SHARED) {
395     me.host_ptr = _omp_target_mem_malloc_managed(me.size,
396                                                  var_name,
397                                                  file_name,
398                                                  line_num);
399     me.device_ptr = me.host_ptr;
400   }
401 
402   else if (mode == CS_ALLOC_DEVICE)
403     me.device_ptr = _omp_target_mem_malloc_device(me.size,
404                                                   var_name,
405                                                   file_name,
406                                                   line_num);
407 
408 #endif
409 
410   if (me.host_ptr != NULL)
411     _hd_alloc_map[me.host_ptr] = me;
412   else if (me.device_ptr != NULL)
413     _hd_alloc_map[me.device_ptr] = me;
414 
415   /* Return pointer to allocated memory */
416 
417   if (me.host_ptr != NULL)
418     return me.host_ptr;
419   else
420     return me.device_ptr;
421 }
422 
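/*
  Illustrative usage sketch (not part of the library): allocate a shared
  host/device work array for a hypothetical cell count "n_cells", fill it
  on the host, then release it. cs_real_t and cs_lnum_t come from cs_defs.h.

    cs_lnum_t  n_cells = 1000;
    cs_real_t *w = (cs_real_t *)cs_malloc_hd(CS_ALLOC_HOST_DEVICE_SHARED,
                                             n_cells, sizeof(cs_real_t),
                                             "w", __FILE__, __LINE__);

    for (cs_lnum_t i = 0; i < n_cells; i++)      // initialize on the host
      w[i] = 0.;

    cs_free_hd(w, "w", __FILE__, __LINE__);
*/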
423 /*----------------------------------------------------------------------------*/
424 /*!
425  * \brief Reallocate memory on host and device for ni elements of size bytes.
426  *
427  * This function calls the appropriate reallocation function based on
428  * the requested mode, and allows introspection of the allocated memory.
429  *
430  * If separate pointers are used on the host and device,
431  * the host pointer should be used with this function.
432  *
433  * If the allocation parameters are unchanged, no actual reallocation
434  * occurs.
435  *
436  * \param [in]  ptr        pointer to previously allocated memory
437  * \param [in]  mode       allocation mode
438  * \param [in]  ni         number of elements
439  * \param [in]  size       element size
440  * \param [in]  var_name   allocated variable name string
441  * \param [in]  file_name  name of calling source file
442  * \param [in]  line_num   line number in calling source file
443  *
444  * \returns pointer to allocated memory.
445  */
446 /*----------------------------------------------------------------------------*/
447 
448 void *
449 cs_realloc_hd(void            *ptr,
450               cs_alloc_mode_t  mode,
451               size_t           ni,
452               size_t           size,
453               const char      *var_name,
454               const char      *file_name,
455               int              line_num)
456 {
457   void *ret_ptr = ptr;
458   size_t new_size = ni*size;
459 
460   if (ptr == NULL) {
461     return cs_malloc_hd(mode, ni, size, var_name, file_name, line_num);
462   }
463   else if (new_size == 0) {
464     cs_free_hd(ptr, var_name, file_name, line_num);
465     return NULL;
466   }
467 
468   _cs_base_accel_mem_map  me;
469 
470   if (_hd_alloc_map.count(ptr) == 0) {  /* Case where memory was allocated
471                                            on host only (through BFT_MALLOC) */
472     me = {.host_ptr = ptr,
473           .device_ptr = NULL,
474           .size = bft_mem_get_block_size(ptr),
475           .mode = CS_ALLOC_HOST};
476     _hd_alloc_map[me.host_ptr] = me;
477   }
478   else {
479     me = _hd_alloc_map[ptr];
480   }
481 
482   if (new_size == me.size && mode == me.mode) {
483     if (me.host_ptr != NULL)
484       return me.host_ptr;
485     else
486       return me.device_ptr;
487   }
488 
489   if (   me.mode <= CS_ALLOC_HOST_DEVICE
490       && me.mode == mode) {
491     me.host_ptr = bft_mem_realloc(me.host_ptr, ni, size,
492                                   var_name, file_name, line_num);
493     me.size = new_size;
494     _hd_alloc_map.erase(ptr);
495     _hd_alloc_map[me.host_ptr] = me;
496 
497     ret_ptr = me.host_ptr;
498   }
499   else {
500     size_t copy_size = (new_size < me.size) ? new_size : me.size;
501     ret_ptr = cs_malloc_hd(mode, ni, size,
502                            var_name, file_name, line_num);
503     memcpy(ret_ptr, ptr, copy_size);
504 
505     cs_free_hd(ptr, var_name, file_name, line_num);
506   }
507 
508   return ret_ptr;
509 }
510 
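/*
  Illustrative sketch (not part of the library): grow a previously allocated
  array while keeping the default allocation mode; the array name and sizes
  are hypothetical.

    cs_real_t *v = (cs_real_t *)cs_malloc_hd(cs_alloc_mode,
                                             100, sizeof(cs_real_t),
                                             "v", __FILE__, __LINE__);

    v = (cs_real_t *)cs_realloc_hd(v, cs_alloc_mode,
                                   200, sizeof(cs_real_t),
                                   "v", __FILE__, __LINE__);

    cs_free_hd(v, "v", __FILE__, __LINE__);
*/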
511 /*----------------------------------------------------------------------------*/
512 /*!
513  * \brief Free memory on host and device for a given host pointer.
514  *
515  * If separate pointers are used on the host and device,
516  * the host pointer should be used with this function.
517  *
518  * \param [in]  ptr        pointer to free
519  * \param [in]  var_name   allocated variable name string
520  * \param [in]  file_name  name of calling source file
521  * \param [in]  line_num   line number in calling source file
522  */
523 /*----------------------------------------------------------------------------*/
524 
525 void
526 cs_free_hd(void        *ptr,
527            const char  *var_name,
528            const char  *file_name,
529            int          line_num)
530 {
531   if (ptr == NULL)
532     return;
533 
534   if (_hd_alloc_map.count(ptr) == 0)
535     bft_error(__FILE__, __LINE__, 0,
536               _("%s: No host or device pointer matching %p."),
537               __func__, ptr);
538 
539   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
540 
541   if (me.mode < CS_ALLOC_HOST_DEVICE_PINNED) {
542     bft_mem_free(me.host_ptr, var_name, file_name, line_num);
543     me.host_ptr = NULL;
544   }
545 
546   if (me.host_ptr != NULL) {
547 
548 #if defined(HAVE_CUDA)
549 
550     if (me.mode == CS_ALLOC_HOST_DEVICE_SHARED) {
551       cs_cuda_mem_free(me.host_ptr, var_name, file_name, line_num);
552       me.device_ptr = NULL;
553     }
554 
555     else
556       cs_cuda_mem_free_host(me.host_ptr, var_name, file_name, line_num);
557 
558     me.host_ptr = NULL;
559 
560 #elif defined(HAVE_OPENMP_TARGET)
561 
562     omp_target_free(me.host_ptr, cs_glob_omp_target_device_id);
563     if (me.mode == CS_ALLOC_HOST_DEVICE_SHARED)
564       me.device_ptr = NULL;
565 
566     me.host_ptr = NULL;
567 
568 #endif
569 
570   }
571 
572   if (me.device_ptr != NULL) {
573 
574 #if defined(HAVE_CUDA)
575 
576     cs_cuda_mem_free(me.device_ptr, var_name, file_name, line_num);
577     me.device_ptr = NULL;
578 
579 #elif defined(HAVE_OPENMP_TARGET)
580 
581     omp_target_free(me.device_ptr, cs_glob_omp_target_device_id);
582     me.device_ptr = NULL;
583 
584 #endif
585 
586   }
587 
588   _hd_alloc_map.erase(ptr);
589 }
590 
591 /*----------------------------------------------------------------------------*/
592 /*!
593  * \brief Free memory on host and device for a given pointer.
594  *
595  * Compared to \ref cs_free_hd, this function also allows freeing memory
596  * allocated through BFT_MALLOC / bft_mem_malloc.
597  *
598  * \param [in]  ptr        pointer to free
599  * \param [in]  var_name   allocated variable name string
600  * \param [in]  file_name  name of calling source file
601  * \param [in]  line_num   line number in calling source file
602  */
603 /*----------------------------------------------------------------------------*/
604 
605 void
606 cs_free(void        *ptr,
607         const char  *var_name,
608         const char  *file_name,
609         int          line_num)
610 {
611   if (ptr == NULL)
612     return;
613 
614   else if (_hd_alloc_map.count(ptr) == 0) {
615     bft_mem_free(ptr, var_name, file_name, line_num);
616     return;
617   }
618   else
619     cs_free_hd(ptr, var_name, file_name, line_num);
620 }
621 
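/*
  Illustrative sketch (not part of the library): cs_free accepts pointers from
  either allocation path; the array names are hypothetical.

    cs_real_t *a, *b;
    BFT_MALLOC(a, 100, cs_real_t);                       // host-only (bft_mem)
    b = (cs_real_t *)cs_malloc_hd(CS_ALLOC_HOST_DEVICE,
                                  100, sizeof(cs_real_t),
                                  "b", __FILE__, __LINE__);

    cs_free(a, "a", __FILE__, __LINE__);   // falls back to bft_mem_free
    cs_free(b, "b", __FILE__, __LINE__);   // dispatched to cs_free_hd
*/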
622 /*----------------------------------------------------------------------------*/
623 /*!
624  * \brief Return matching device pointer for a given pointer.
625  *
626  * If separate pointers are used on the host and device,
627  * the host pointer should be used with this function.
628  *
629  * If memory is not allocated on device yet at the call site, it will
630  * be allocated automatically by this function.
631  *
632  * \param [in]  ptr  pointer
633  *
634  * \returns pointer to device memory.
635  */
636 /*----------------------------------------------------------------------------*/
637 
638 void *
639 cs_get_device_ptr(void  *ptr)
640 {
641   if (ptr == NULL)
642     return NULL;
643 
644   if (_hd_alloc_map.count(ptr) == 0) {
645     bft_error(__FILE__, __LINE__, 0,
646               _("%s: No host or device pointer matching %p."), __func__, ptr);
647     return NULL;
648   }
649 
650   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
651 
652   /* Allocate on device if not done yet */
653 
654   if (me.device_ptr == NULL) {
655     if (   me.mode == CS_ALLOC_HOST_DEVICE
656         || me.mode == CS_ALLOC_HOST_DEVICE_PINNED) {
657 #if defined(HAVE_CUDA)
658 
659       me.device_ptr = cs_cuda_mem_malloc_device(me.size,
660                                                 "me.device_ptr",
661                                                 __FILE__,
662                                                 __LINE__);
663 
664 #elif defined(HAVE_OPENMP_TARGET)
665 
666       me.device_ptr = _omp_target_mem_malloc_device(me.size,
667                                                     "me.device_ptr",
668                                                     __FILE__,
669                                                     __LINE__);
670 
671       if (omp_target_associate_ptr(me.host_ptr, me.device_ptr, me.size, 0,
672                                    cs_glob_omp_target_device_id))
673         bft_error(__FILE__, __LINE__, 0,
674                   _("%s: Can't associate host pointer %p to device pointer %p."),
675                   "omp_target_associate_ptr", me.host_ptr, me.device_ptr);
676 
677 #endif
678 
679       _hd_alloc_map[ptr] = me;
680 
681     }
682   }
683 
684   return me.device_ptr;
685 }
686 
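/*
  Illustrative sketch (not part of the library): with CS_ALLOC_HOST_DEVICE,
  the device buffer is only allocated on the first cs_get_device_ptr call;
  the array name is hypothetical.

    cs_real_t *x = (cs_real_t *)cs_malloc_hd(CS_ALLOC_HOST_DEVICE,
                                             100, sizeof(cs_real_t),
                                             "x", __FILE__, __LINE__);

    cs_real_t *x_d = (cs_real_t *)cs_get_device_ptr(x);  // allocates on device
    cs_sync_h2d(x);                                      // copy host values

    // ... x_d may now be passed to a device kernel ...
*/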
687 /*----------------------------------------------------------------------------*/
688 /*!
689  * \brief Return matching device pointer for a given constant pointer.
690  *
691  * If separate pointers are used on the host and device,
692  * the host pointer should be used with this function.
693  *
694  * If memory is not allocated on device yet at the call site, it will
695  * be allocated automatically by this function.
696  *
697  * \param [in]  ptr  pointer
698  *
699  * \returns pointer to device memory.
700  */
701 /*----------------------------------------------------------------------------*/
702 
703 const void *
704 cs_get_device_ptr_const(const void  *ptr)
705 {
706   if (ptr == NULL)
707     return NULL;
708 
709   if (_hd_alloc_map.count(ptr) == 0) {
710     bft_error(__FILE__, __LINE__, 0,
711               _("%s: No host or device pointer matching %p."), __func__, ptr);
712     return NULL;
713   }
714 
715   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
716 
717   /* Allocate and sync on device if not done yet */
718 
719   if (me.device_ptr == NULL) {
720     if (   me.mode == CS_ALLOC_HOST_DEVICE
721         || me.mode == CS_ALLOC_HOST_DEVICE_PINNED) {
722 #if defined(HAVE_CUDA)
723 
724       me.device_ptr = cs_cuda_mem_malloc_device(me.size,
725                                                 "me.device_ptr",
726                                                 __FILE__,
727                                                 __LINE__);
728 
729 #elif defined(HAVE_OPENMP_TARGET)
730 
731       me.device_ptr = _omp_target_mem_malloc_device(me.size,
732                                                     "me.device_ptr",
733                                                     __FILE__,
734                                                     __LINE__);
735 
736 #endif
737 
738       _hd_alloc_map[ptr] = me;
739       cs_sync_h2d(ptr);
740 
741     }
742   }
743 
744   return me.device_ptr;
745 }
746 
747 /*----------------------------------------------------------------------------*/
748 /*!
749  * \brief Return matching device pointer for a given constant pointer,
750  *        prefetching if applicable.
751  *
752  * If separate pointers are used on the host and device, the host pointer
753  * should be used with this function. In this case, it is assumed that
754  * the host and device values have already been synchronized, unless
755  * memory is not allocated on device yet at the call site, in which case
756  * it will be allocated automatically by this function.
757  *
758  * \param [in]  ptr  pointer
759  *
760  * \returns pointer to device memory.
761  */
762 /*----------------------------------------------------------------------------*/
763 
764 const void *
765 cs_get_device_ptr_const_pf(const void  *ptr)
766 {
767   if (ptr == NULL)
768     return NULL;
769 
770   if (_hd_alloc_map.count(ptr) == 0) {
771     bft_error(__FILE__, __LINE__, 0,
772               _("%s: No host or device pointer matching %p."), __func__, ptr);
773     return NULL;
774   }
775 
776   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
777 
778   /* Allocate and sync on device if not done yet */
779 
780   if (me.device_ptr == NULL) {
781     if (   me.mode == CS_ALLOC_HOST_DEVICE
782         || me.mode == CS_ALLOC_HOST_DEVICE_PINNED) {
783 #if defined(HAVE_CUDA)
784 
785       me.device_ptr = cs_cuda_mem_malloc_device(me.size,
786                                                 "me.device_ptr",
787                                                 __FILE__,
788                                                 __LINE__);
789 
790 #elif defined(HAVE_OPENMP_TARGET)
791 
792       me.device_ptr = _omp_target_mem_malloc_device(me.size,
793                                                     "me.device_ptr",
794                                                     __FILE__,
795                                                     __LINE__);
796 
797 #endif
798 
799       _hd_alloc_map[ptr] = me;
800       cs_sync_h2d(ptr);
801 
802     }
803   }
804 
805   /* Prefetch if shared */
806 
807   else if (me.mode == CS_ALLOC_HOST_DEVICE_SHARED)
808     cs_prefetch_h2d(me.host_ptr, me.size);
809 
810   return me.device_ptr;
811 }
812 
813 /*----------------------------------------------------------------------------*/
814 /*!
815  * \brief Check if a pointer is associated with a device.
816  *
817  * If separate pointers are used on the host and device,
818  * the host pointer should be used with this function.
819  *
820  * \returns allocation mode associated with pointer
821  */
822 /*----------------------------------------------------------------------------*/
823 
824 cs_alloc_mode_t
825 cs_check_device_ptr(const void  *ptr)
826 {
827   if (_hd_alloc_map.count(ptr) == 0)
828     return CS_ALLOC_HOST;
829 
830   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
831   return me.mode;
832 }
833 
834 /*----------------------------------------------------------------------------*/
835 /*!
836  * \brief Associate device memory with a given host memory pointer.
837  *
838  * If the host memory is already associated with the device, the existing
839  * device pointer is returned. Otherwise, a new device allocation is
840  * performed and the resulting pointer is returned.
841  *
842  * \param [in]  host_ptr  host pointer
843  * \param [in]  ni        number of elements
844  * \param [in]  size      element size
845  *
846  * \returns pointer to allocated memory.
847  */
848 /*----------------------------------------------------------------------------*/
849 
850 void *
851 cs_associate_device_ptr(void    *host_ptr,
852                         size_t   ni,
853                         size_t   size)
854 {
855   if (_hd_alloc_map.count(host_ptr) == 0) {
856 
857     _cs_base_accel_mem_map  me = {
858       .host_ptr = host_ptr,
859       .device_ptr = NULL,
860       .size = ni * size,
861       .mode = CS_ALLOC_HOST_DEVICE};
862 
863     _hd_alloc_map[me.host_ptr] = me;
864 
865   }
866 
867   return cs_get_device_ptr(host_ptr);
868 }
869 
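/*
  Illustrative sketch (not part of the library): attach a device buffer to
  memory that was allocated with BFT_MALLOC; the array name is hypothetical.

    cs_real_t *y;
    BFT_MALLOC(y, 100, cs_real_t);

    void *y_d = cs_associate_device_ptr(y, 100, sizeof(cs_real_t));
    cs_sync_h2d(y);   // explicit host-to-device copy is still required

    cs_dissassociate_device_ptr(y);
    BFT_FREE(y);
*/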
870 /*----------------------------------------------------------------------------*/
871 /*!
872  * \brief Detach device memory from a given host memory pointer.
873  *
874  * If the host memory is shared with the device (i.e. allocated with
875  * CS_ALLOC_HOST_DEVICE_SHARED), the memory remains shared.
876  *
877  * \param [in]  host_ptr  host pointer
878  */
879 /*----------------------------------------------------------------------------*/
880 
881 void
882 cs_dissassociate_device_ptr(void  *host_ptr)
883 {
884   if (_hd_alloc_map.count(host_ptr) == 0)
885     return;
886 
887   _cs_base_accel_mem_map  me = _hd_alloc_map[host_ptr];
888 
889   if (me.device_ptr != NULL) {
890 
891 #if defined(HAVE_CUDA)
892 
893     if (me.mode == CS_ALLOC_HOST_DEVICE)
894       cs_cuda_mem_free(me.device_ptr, "me.device_ptr", __FILE__, __LINE__);
895 
896 #elif defined(HAVE_OPENMP_TARGET)
897 
898     if (me.mode == CS_ALLOC_HOST_DEVICE)
899       omp_target_free(me.device_ptr, cs_glob_omp_target_device_id);
900 
901 #endif
902 
903   }
904 }
905 
906 /*----------------------------------------------------------------------------*/
907 /*!
908  * \brief Set allocation mode for an already allocated pointer.
909  *
910  * If the allocation mode is different from the previous one,
911  * the associated memory will be reallocated with the desired mode,
912  * and the previous allocation freed.
913  *
914  * \param [in, out]  host_ptr   pointer to host pointer to modify
915  * \param [in]       mode       desired allocation mode
916  */
917 /*----------------------------------------------------------------------------*/
918 
919 void
920 cs_set_alloc_mode(void             **host_ptr,
921                   cs_alloc_mode_t    mode)
922 {
923   if (host_ptr == NULL)
924     return;
925 
926   void *ret_ptr = *host_ptr;
927 
928   void *_host_ptr = *host_ptr;
929 
930   if (_host_ptr == NULL)
931     return;
932 
933   if (_hd_alloc_map.count(_host_ptr) == 0) {
934 
935     _cs_base_accel_mem_map  me = {
936       .host_ptr = _host_ptr,
937       .device_ptr = NULL,
938       .size = bft_mem_get_block_size(_host_ptr),
939       .mode = CS_ALLOC_HOST};
940 
941     _hd_alloc_map[me.host_ptr] = me;
942 
943   }
944 
945   cs_alloc_mode_t old_mode = cs_check_device_ptr(_host_ptr);
946 
947   if (mode != old_mode) {
948 
949     _cs_base_accel_mem_map  me = _hd_alloc_map[_host_ptr];
950 
951     if (old_mode == CS_ALLOC_HOST_DEVICE)
952       cs_dissassociate_device_ptr(_host_ptr);
953 
954     if (   mode == CS_ALLOC_HOST_DEVICE_SHARED
955         || old_mode == CS_ALLOC_HOST_DEVICE_SHARED) {
956 
957       ret_ptr = cs_malloc_hd(mode, 1, me.size,
958                              "me.host_ptr", __FILE__, __LINE__);
959 
960       /* TODO: check if we have multiple OpenMP threads, in which
961          case applying a "first-touch" policy might be useful here */
962 
963       memcpy(ret_ptr, _host_ptr, me.size);
964 
965       cs_free_hd(_host_ptr, "me.host_ptr", __FILE__, __LINE__);
966 
967     }
968 
969   }
970 
971   *host_ptr = ret_ptr;
972 }
973 
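/*
  Illustrative sketch (not part of the library): switch an existing host-only
  array to shared (managed) memory so it becomes device-accessible; the array
  name is hypothetical.

    cs_real_t *z;
    BFT_MALLOC(z, 100, cs_real_t);

    cs_set_alloc_mode((void **)&z, CS_ALLOC_HOST_DEVICE_SHARED);

    // z now refers to a shared allocation tracked by this file
    cs_free_hd(z, "z", __FILE__, __LINE__);
*/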
974 /*----------------------------------------------------------------------------*/
975 /*!
976  * \brief Synchronize data from host to device.
977  *
978  * If separate pointers are used on the host and device,
979  * the host pointer should be used with this function.
980  *
981  * Depending on the allocation type, this can imply a copy, data prefetch,
982  * or a no-op.
983  *
984  * This function assumes the provided pointer was allocated using
985  * CS_MALLOC_HD or CS_REALLOC_HD, as it uses the associated mapping to
986  * determine associated metadata.
987  *
988  * \param [in, out]  ptr  host pointer to values to copy or prefetch
989  */
990 /*----------------------------------------------------------------------------*/
991 
992 void
993 cs_sync_h2d(const void  *ptr)
994 {
995   if (_hd_alloc_map.count(ptr) == 0)
996     return;
997 
998   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
999 
1000   if (me.device_ptr == NULL)
1001     me.device_ptr = const_cast<void *>(cs_get_device_ptr_const(ptr));
1002 
1003 #if defined(HAVE_CUDA)
1004 
1005   switch (me.mode) {
1006   case CS_ALLOC_HOST:
1007     bft_error(__FILE__, __LINE__, 0,
1008               _("%s: %p allocated on host only."),
1009               __func__, ptr);
1010     break;
1011   case CS_ALLOC_HOST_DEVICE:
1012     cs_cuda_copy_h2d(me.device_ptr, me.host_ptr, me.size);
1013     break;
1014   case CS_ALLOC_HOST_DEVICE_PINNED:
1015     cs_cuda_copy_h2d_async(me.device_ptr, me.host_ptr, me.size);
1016     break;
1017   case CS_ALLOC_HOST_DEVICE_SHARED:
1018     cs_cuda_prefetch_h2d(me.device_ptr, me.size);
1019     break;
1020   case CS_ALLOC_DEVICE:
1021     bft_error(__FILE__, __LINE__, 0,
1022               _("%s: %p allocated on device only."),
1023               __func__, ptr);
1024     break;
1025   }
1026 
1027 #elif defined(HAVE_OPENMP_TARGET)
1028 
1029   switch (me.mode) {
1030 
1031   case CS_ALLOC_HOST:
1032     bft_error(__FILE__, __LINE__, 0,
1033               _("%s: %p allocated on host only."),
1034               __func__, ptr);
1035     break;
1036   case CS_ALLOC_HOST_DEVICE:
1037     omp_target_memcpy(me.device_ptr, me.host_ptr, me.size, 0, 0,
1038                       cs_glob_omp_target_device_id, omp_get_initial_device());
1039     break;
1040   case CS_ALLOC_HOST_DEVICE_PINNED:
1041     {
1042       char *host_ptr = (char *)me.device_ptr;
1043       #pragma omp target enter data map(to:host_ptr[:me.size]) \
1044         nowait device(cs_glob_omp_target_device_id)
1045     }
1046     break;
1047   case CS_ALLOC_HOST_DEVICE_SHARED:
1048     {
1049       char *host_ptr = (char *)me.host_ptr;
1050       #pragma omp target enter data map(to:host_ptr[:me.size]) \
1051         nowait device(cs_glob_omp_target_device_id)
1052     }
1053     break;
1054   case CS_ALLOC_DEVICE:
1055     bft_error(__FILE__, __LINE__, 0,
1056               _("%s: %p allocated on device only."),
1057               __func__, ptr);
1058     break;
1059   }
1060 
1061 #endif
1062 }
1063 
1064 /*----------------------------------------------------------------------------*/
1065 /*!
1066  * \brief Initiate synchronization of data from host to device for
1067  *        future access.
1068  *
1069  * If separate pointers are used on the host and device,
1070  * the host pointer should be used with this function.
1071  * In this case, synchronization is started immediately (asynchronously
1072  * if the allocation mode supports it).
1073  *
1074  * In other cases (such as shared memory), synchronization will be
1075  * delayed until actual use.
1076  *
1077  * Depending on the allocation type, this can imply a copy, data prefetch,
1078  * or a no-op.
1079  *
1080  * This function assumes the provided pointer was allocated using
1081  * CS_MALLOC_HD or CS_REALLOC_HD, as it uses the associated mapping to
1082  * determine associated metadata.
1083  *
1084  * \param [in, out]  ptr  host pointer to values to copy or prefetch
1085  */
1086 /*----------------------------------------------------------------------------*/
1087 
1088 void
1089 cs_sync_h2d_future(const void  *ptr)
1090 {
1091   if (_hd_alloc_map.count(ptr) == 0)
1092     return;
1093 
1094   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
1095 
1096 #if defined(HAVE_CUDA)
1097 
1098   switch (me.mode) {
1099   case CS_ALLOC_HOST_DEVICE:
1100     cs_cuda_copy_h2d(me.device_ptr, me.host_ptr, me.size);
1101     break;
1102   case CS_ALLOC_HOST_DEVICE_PINNED:
1103     cs_cuda_copy_h2d_async(me.device_ptr, me.host_ptr, me.size);
1104     break;
1105   default:
1106     break;
1107   }
1108 
1109 #elif defined(HAVE_OPENMP_TARGET)
1110 
1111   switch (me.mode) {
1112   case CS_ALLOC_HOST_DEVICE:
1113     omp_target_memcpy(me.device_ptr, me.host_ptr, me.size, 0, 0,
1114                       cs_glob_omp_target_device_id, omp_get_initial_device());
1115     break;
1116   case CS_ALLOC_HOST_DEVICE_PINNED:
1117     {
1118       char *host_ptr = (char *)me.device_ptr;
1119       #pragma omp target enter data map(to:host_ptr[:me.size]) \
1120         nowait device(cs_glob_omp_target_device_id)
1121     }
1122     break;
1123   default:
1124     break;
1125   }
1126 
1127 #endif
1128 }
1129 
1130 /*----------------------------------------------------------------------------*/
1131 /*!
1132  * \brief Synchronize data from device to host.
1133  *
1134  * If separate allocations are used on the host and device
1135  * (mode == CS_ALLOC_HOST_DEVICE), the host pointer should be passed to this
1136  * function.
1137  *
1138  * Depending on the allocation type, this can imply a copy, data prefetch,
1139  * or a no-op.
1140  *
1141  * This function assumes the provided pointer was allocated using
1142  * CS_MALLOC_HD or CS_REALLOC_HD, as it uses the associated mapping to
1143  * determine associated metadata.
1144  *
1145  * \param [in, out]  ptr  pointer to values to copy or prefetch
1146  */
1147 /*----------------------------------------------------------------------------*/
1148 
1149 void
1150 cs_sync_d2h(void  *ptr)
1151 {
1152   if (_hd_alloc_map.count(ptr) == 0)
1153     return;
1154 
1155   _cs_base_accel_mem_map  me = _hd_alloc_map[ptr];
1156 
1157 #if defined(HAVE_CUDA)
1158 
1159   switch (me.mode) {
1160   case CS_ALLOC_HOST:
1161     bft_error(__FILE__, __LINE__, 0,
1162               _("%s: %p allocated on host only."),
1163               __func__, ptr);
1164     break;
1165   case CS_ALLOC_HOST_DEVICE:
1166     cs_cuda_copy_d2h(me.host_ptr, me.device_ptr, me.size);
1167     break;
1168   case CS_ALLOC_HOST_DEVICE_PINNED:
1169     cs_cuda_copy_d2h_async(me.host_ptr, me.device_ptr, me.size);
1170     break;
1171   case CS_ALLOC_HOST_DEVICE_SHARED:
1172     cs_cuda_prefetch_d2h(me.host_ptr, me.size);
1173     break;
1174   case CS_ALLOC_DEVICE:
1175     bft_error(__FILE__, __LINE__, 0,
1176               _("%s: %p allocated on device only."),
1177               __func__, ptr);
1178     break;
1179   }
1180 
1181 #elif defined(HAVE_OPENMP_TARGET)
1182 
1183   switch (me.mode) {
1184   case CS_ALLOC_HOST:
1185     bft_error(__FILE__, __LINE__, 0,
1186               _("%s: %p allocated on host only."),
1187               __func__, ptr);
1188     break;
1189   case CS_ALLOC_HOST_DEVICE:
1190     omp_target_memcpy(me.host_ptr, me.device_ptr, me.size, 0, 0,
1191                       omp_get_initial_device(), cs_glob_omp_target_device_id);
1192 
1193     break;
1194   case CS_ALLOC_HOST_DEVICE_PINNED:
1195     {
1196       char *host_ptr = (char *)me.host_ptr;
1197       #pragma omp target exit data map(from:host_ptr[:me.size]) \
1198         nowait device(cs_glob_omp_target_device_id)
1199     }
1200     break;
1201   case CS_ALLOC_HOST_DEVICE_SHARED:
1202     {
1203       char *host_ptr = (char *)me.host_ptr;
1204       #pragma omp target exit data map(from:host_ptr[:me.size]) \
1205         nowait device(cs_glob_omp_target_device_id)
1206     }
1207     break;
1208   case CS_ALLOC_DEVICE:
1209     bft_error(__FILE__, __LINE__, 0,
1210               _("%s: %p allocated on device only."),
1211               __func__, ptr);
1212     break;
1213   }
1214 
1215 #endif
1216 }
1217 
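/*
  Illustrative sketch (not part of the library): typical round trip for an
  array "f" previously allocated with CS_ALLOC_HOST_DEVICE (the name and the
  kernel are hypothetical).

    cs_sync_h2d(f);          // push host values to the device copy
    // ... a device kernel updates the device copy of f ...
    cs_sync_d2h(f);          // bring the results back to the host
*/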
1218 /*----------------------------------------------------------------------------*/
1219 /*!
1220  * \brief Prefetch data from host to device.
1221  *
1222  * This function should only be used on arrays using shared host and device
1223  * memory, such as those allocated using CS_ALLOC_HOST_DEVICE_SHARED.
1224  * It should be usable on a subset of such an array.
1225  *
1226  * \param [in, out]  ptr   pointer to data to prefetch
1227  * \param [in]       size  number of bytes to prefetch
1228  */
1229 /*----------------------------------------------------------------------------*/
1230 
1231 void
1232 cs_prefetch_h2d(void    *ptr,
1233                 size_t   size)
1234 {
1235 #if defined(HAVE_CUDA)
1236 
1237   cs_cuda_prefetch_h2d(ptr, size);
1238 
1239 #elif defined(HAVE_OPENMP_TARGET)
1240 
1241   char *host_ptr = (char *)ptr;
1242   #pragma omp target enter data map(to:host_ptr[:size]) \
1243     nowait device(cs_glob_omp_target_device_id)
1244 
1245 #endif
1246 }
1247 
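/*
  Illustrative sketch (not part of the library): prefetch only the second half
  of a shared array before a kernel touches it; the name and size are
  hypothetical.

    cs_lnum_t  n = 1000;
    cs_real_t *s = (cs_real_t *)cs_malloc_hd(CS_ALLOC_HOST_DEVICE_SHARED,
                                             n, sizeof(cs_real_t),
                                             "s", __FILE__, __LINE__);

    cs_prefetch_h2d(s + n/2, (n - n/2)*sizeof(cs_real_t));
*/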
1248 /*----------------------------------------------------------------------------*/
1249 /*!
1250  * \brief Prefetch data from device to host.
1251  *
1252  * This function should only be used on arrays using shared host and device
1253  * memory, such as those allocated using CS_ALLOC_HOST_DEVICE_SHARED.
1254  * It should be usable on a subset of such an array.
1255  *
1256  * \param [in, out]  ptr   pointer to data to prefetch
1257  * \param [in]       size  number of bytes to prefetch
1258  */
1259 /*----------------------------------------------------------------------------*/
1260 
1261 void
1262 cs_prefetch_d2h(void    *ptr,
1263                 size_t   size)
1264 {
1265 #if defined(HAVE_CUDA)
1266 
1267   cs_cuda_prefetch_d2h(ptr, size);
1268 
1269 #elif defined(HAVE_OPENMP_TARGET)
1270 
1271   char *host_ptr = (char *)ptr;
1272   #pragma omp target exit data map(from:host_ptr[:size]) \
1273     nowait device(cs_glob_omp_target_device_id)
1274 
1275 #endif
1276 }
1277 
1278 /*----------------------------------------------------------------------------*/
1279 /*!
1280  * \brief Copy data from host to device.
1281  *
1282  * This function should be usable on subsets of arrays allocated on the host
1283  * and device.
1284  *
1285  * \param [out]      dest  pointer to destination data on device
1286  * \param [in]       src   pointer to source data on host
1287  * \param [in]       size  number of bytes to copy
1288  */
1289 /*----------------------------------------------------------------------------*/
1290 
1291 void
1292 cs_copy_h2d(void        *dest,
1293             const void  *src,
1294             size_t       size)
1295 {
1296 #if defined(HAVE_CUDA)
1297 
1298   cs_cuda_copy_h2d(dest, src, size);
1299 
1300 #elif defined(HAVE_OPENMP_TARGET)
1301 
1302   omp_target_memcpy(dest, src, size, 0, 0,
1303                     cs_glob_omp_target_device_id, omp_get_initial_device());
1304 
1305 #endif
1306 }
1307 
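/*
  Illustrative sketch (not part of the library): copy a contiguous slice of a
  host array to the matching offset of its device counterpart; the name and
  bounds are hypothetical.

    cs_real_t *h = (cs_real_t *)cs_malloc_hd(CS_ALLOC_HOST_DEVICE,
                                             100, sizeof(cs_real_t),
                                             "h", __FILE__, __LINE__);
    cs_real_t *d = (cs_real_t *)cs_get_device_ptr(h);

    cs_copy_h2d(d + 10, h + 10, 20*sizeof(cs_real_t));   // elements 10..29
*/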
1308 /*----------------------------------------------------------------------------*/
1309 /*!
1310  * \brief Copy data from device to host.
1311  *
1312  * This function should be usable on subsets of arrays allocated on the host
1313  * and device.
1314  *
1315  * \param [out]      dest  pointer to destination data on host
1316  * \param [in]       src   pointer to source data on device
1317  * \param [in]       size  number of bytes to copy
1318  */
1319 /*----------------------------------------------------------------------------*/
1320 
1321 void
1322 cs_copy_d2h(void        *dest,
1323             const void  *src,
1324             size_t       size)
1325 {
1326 #if defined(HAVE_CUDA)
1327 
1328   cs_cuda_copy_d2h(dest, src, size);
1329 
1330 #elif defined(HAVE_OPENMP_TARGET)
1331 
1332   omp_target_memcpy(dest, src, size, 0, 0,
1333                     omp_get_initial_device(), cs_glob_omp_target_device_id);
1334 
1335 #endif
1336 }
1337 
1338 /*----------------------------------------------------------------------------*/
1339 /*!
1340  * \brief Copy data from device to device.
1341  *
1342  * This function should be usable on subsets of arrays allocated on the host
1343  * and device.
1344  *
1345  * \param [out]      dest  pointer to destination data on device
1346  * \param [in]       src   pointer to source data on device
1347  * \param [in]       size  number of bytes to copy
1348  */
1349 /*----------------------------------------------------------------------------*/
1350 
1351 void
1352 cs_copy_d2d(void        *dest,
1353             const void  *src,
1354             size_t       size)
1355 {
1356 #if defined(HAVE_CUDA)
1357 
1358   cs_cuda_copy_d2d(dest, src, size);
1359 
1360 #elif defined(HAVE_OPENMP_TARGET)
1361 
1362   omp_target_memcpy(dest, src, size, 0, 0,
1363                     cs_glob_omp_target_device_id, cs_glob_omp_target_device_id);
1364 
1365 #endif
1366 }
1367 
1368 /*----------------------------------------------------------------------------*/
1369 /*!
1370  * \brief Return number of host-device allocations
1371  *
1372  * \returns current number of host-device allocations.
1373  */
1374 /*----------------------------------------------------------------------------*/
1375 
1376 int
1377 cs_get_n_allocations_hd(void)
1378 {
1379   return _hd_alloc_map.size();
1380 }
1381 
1382 /*----------------------------------------------------------------------------*/
1383 /*!
1384  * \brief Check if a given host pointer was allocated with
1385  *        cs_malloc_hd or cs_realloc_hd.
1386  *
1387  * \returns allocated memory size, or zero if not allocated with this
1388  *          mechanism.
1389  */
1390 /*----------------------------------------------------------------------------*/
1391 
1392 size_t
1393 cs_get_allocation_hd_size(void  *host_ptr)
1394 {
1395   if (_hd_alloc_map.count(host_ptr) == 0)
1396     return 0;
1397 
1398   _cs_base_accel_mem_map  me = _hd_alloc_map[host_ptr];
1399   return me.size;
1400 }
1401 
1402 #if defined(HAVE_OPENMP_TARGET)
1403 
1404 /*----------------------------------------------------------------------------*/
1405 /*!
1406  * \brief Set OpenMP Offload device based on MPI rank and number of devices.
1407  *
1408  * The device is chosen using the node-local rank and the number of
1409  * visible devices, so ranks on a node are spread across those devices.
1410  *
1411  * \return  selected default device id
1412  */
1413 /*----------------------------------------------------------------------------*/
1414 
1415 int
1416 cs_omp_target_select_default_device(void)
1417 {
1418   int device_id = omp_get_initial_device();
1419 
1420   int n_devices = omp_get_num_devices();
1421 
1422   if (cs_glob_rank_id > -1 && n_devices > 1) {
1423 
1424     device_id = cs_glob_node_rank_id*n_devices / cs_glob_node_n_ranks;
1425 
1426     assert(device_id > -1 && device_id < n_devices);
1427 
1428   }
1429 
1430   omp_set_default_device(device_id);
1431 
1432   cs_glob_omp_target_device_id = device_id;
1433 
1434   return device_id;
1435 }
1436 
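/*
  Worked example (illustrative): with 4 MPI ranks per node and 2 visible
  devices, cs_glob_node_rank_id*n_devices / cs_glob_node_n_ranks maps ranks
  0-1 to device 0 and ranks 2-3 to device 1.
*/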
1437 #endif /* defined(HAVE_OPENMP_TARGET) */
1438 
1439 /*----------------------------------------------------------------------------*/
1440 
1441 END_C_DECLS
1442