1 subroutine util_setup_gpu_affinity 2#ifdef USE_CUDA_AFFINITY 3 use iso_fortran_env 4 use cudafor 5 use openacc 6#include "errquit.fh" 7#include "global.fh" 8 integer(INT32) :: num_devices, use_ngpus, my_gpu 9 integer :: ppn, node_rank, me 10 character*255 :: char_use_ngpus 11 integer(acc_device_kind) :: devicetype 12 me = ga_nodeid() 13 devicetype = NVIDIA 14 ! 15 ! CUDA stuff 16 ! 17 ! how many GPUs are detected 18 err = cudaGetDeviceCount(num_devices) 19 if (err.ne.0) call errquit('cudaGetDeviceCount',err,UNKNOWN_ERR) 20 if (num_devices.lt.1) call errquit('No GPU found!',0,UNKNOWN_ERR) 21 ! user prescribes how many to use 22 call util_getenv('NWCHEM_OPENACC_USE_NGPUS',char_use_ngpus) 23 if (me.eq.0) then 24 print*,'CU NWCHEM_OPENACC_USE_NGPUS=',trim(char_use_ngpus) 25 endif 26 if (len(trim(char_use_ngpus)).gt.0) then 27 read(char_use_ngpus,'(i255)') use_ngpus 28 if (use_ngpus.gt.num_devices) then 29 write(6,600) use_ngpus,num_devices 30 use_ngpus = num_devices 31 endif 32 600 format('CU you asked for ', 33 & i2,' GPUs but only ', 34 & i2,' are present') 35 else 36 use_ngpus=num_devices 37 endif 38 ! GA PPN undercounts with ARMCI-MPI 39 call util_getppn(ppn) ! THIS IS COLLECTIVE 40 ! this assumes PPN is constant across nodes (reasonable) 41 node_rank = modulo(me,ppn) 42 ! assign GPUs to GA process ranks within a node (round-robin) 43 my_gpu = modulo(node_rank,use_ngpus) 44 if (me.lt.ppn) then 45 write(6,700) 'node_rank',node_rank 46 write(6,700) 'use_ngpus',use_ngpus 47 write(6,700) 'ppn ',ppn 48 write(6,700) 'ga_nodeid',me 49 write(6,700) 'my_gpu ',my_gpu 50 700 format('CU ',a12,'=',i5) 51 endif 52 err = cudaSetDevice(my_gpu) 53 if (err.ne.0) call errquit('cudaSetDevice',my_gpu,UNKNOWN_ERR) 54 call acc_set_device_num(my_gpu,devicetype) 55#endif 56 end subroutine util_setup_gpu_affinity 57