1!! Copyright (C) 2010-2016 X. Andrade
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA)
22#error "Cannot compile with OpenCL and Cuda support at the same time"
23#endif
24
25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA)
26#define HAVE_ACCEL 1
27#endif
28
29module accel_oct_m
30  use alloc_cache_oct_m
31#ifdef HAVE_OPENCL
32  use cl
33#endif
34#ifdef HAVE_CLBLAS
35  use clblas
36#endif
37  use cuda_oct_m
38#ifdef HAVE_CLFFT
39  use clfft
40#endif
41  use global_oct_m
42  use iso_c_binding
43  use loct_oct_m
44  use messages_oct_m
45  use mpi_oct_m
46  use namespace_oct_m
47  use types_oct_m
48  use parser_oct_m
49  use profiling_oct_m
50  use unit_system_oct_m
51
52  implicit none
53
54  private
55
56  public ::                       &
57    accel_context_t,              &
58    accel_device_t,               &
59    accel_mem_t,                  &
60    accel_kernel_t,               &
61    accel_t,                      &
62    accel_is_enabled,             &
63    accel_init,                   &
64    accel_end,                    &
65    accel_padded_size,            &
66    accel_mem_nullify,            &
67    accel_kernel_start_call,      &
68    accel_kernel_build,           &
69    accel_create_buffer,          &
70    accel_write_buffer,           &
71    accel_read_buffer,            &
72    accel_release_buffer,         &
73    accel_buffer_is_allocated,    &
74    accel_finish,                 &
75    accel_set_kernel_arg,         &
76    accel_max_workgroup_size,     &
77    accel_kernel_workgroup_size,  &
78    accel_kernel_run,             &
79    accel_set_buffer_to_zero,     &
80    accel_use_shared_mem,         &
81    clblas_print_error,           &
82    clfft_print_error,            &
83    accel_local_memory_size,      &
84    accel_global_memory_size,     &
85    accel_max_size_per_dim,       &
86    accel_get_device_pointer,     &
87    accel_set_stream,             &
88    accel_synchronize_all_streams
89
90#ifdef HAVE_OPENCL
91  integer, public, parameter ::                 &
92    ACCEL_MEM_READ_ONLY  = CL_MEM_READ_ONLY,    &
93    ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE,   &
94    ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY
95#else
96  integer, public, parameter ::                 &
97    ACCEL_MEM_READ_ONLY  = 0,                   &
98    ACCEL_MEM_READ_WRITE = 1,                   &
99    ACCEL_MEM_WRITE_ONLY = 2
100#endif
101
102  type accel_context_t
103    ! Components are public by default
104#ifdef HAVE_OPENCL
105    type(cl_context) :: cl_context
106#elif defined(HAVE_CUDA)
107    type(c_ptr)      :: cuda_context
108#else
109    integer          :: dummy
110#endif
111  end type accel_context_t
112
113  type accel_device_t
114    ! Components are public by default
115#ifdef HAVE_OPENCL
116    type(cl_device_id) :: cl_device
117#elif defined(HAVE_CUDA)
118    type(c_ptr)      :: cuda_device
119#else
120    integer         :: dummy
121#endif
122  end type accel_device_t
123
124  type accel_t
125    ! Components are public by default
126    type(accel_context_t)  :: context
127    type(accel_device_t)   :: device
128#ifdef HAVE_OPENCL
129    type(cl_command_queue) :: command_queue
130#endif
131    type(c_ptr)            :: cublas_handle
132    type(c_ptr)            :: cuda_stream
133    type(c_ptr)            :: module_map
134    integer                :: max_workgroup_size
135    integer(8)             :: local_memory_size
136    integer(8)             :: global_memory_size
137    logical                :: enabled
138    logical                :: shared_mem
139    logical                :: cuda_mpi
140    integer                :: warp_size
141  end type accel_t
142
143  type accel_mem_t
144    ! Components are public by default
145#ifdef HAVE_OPENCL
146    type(cl_mem)           :: mem
147#else
148    type(c_ptr)            :: mem
149#endif
150    integer(SIZEOF_SIZE_T) :: size
151    type(type_t)           :: type
152    integer                :: flags
153    logical                :: allocated
154  end type accel_mem_t
155
156  type accel_kernel_t
157    ! Components are public by default
158#ifdef HAVE_OPENCL
159    type(cl_kernel)               :: kernel
160#endif
161#ifdef HAVE_CUDA
162    type(c_ptr)                   :: cuda_kernel
163    type(c_ptr)                   :: cuda_module
164    type(c_ptr)                   :: arguments
165#endif
166    integer(8)                    :: cuda_shared_mem
167    logical                       :: initialized = .false.
168    type(accel_kernel_t), pointer :: next
169    integer                       :: arg_count
170  end type accel_kernel_t
171
172  type(accel_t), public :: accel
173
174  ! the kernels
175  type(accel_kernel_t), public, target, save :: kernel_vpsi
176  type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors
177  type(accel_kernel_t), public, target, save :: kernel_daxpy
178  type(accel_kernel_t), public, target, save :: kernel_zaxpy
179  type(accel_kernel_t), public, target, save :: kernel_copy
180  type(accel_kernel_t), public, target, save :: dpack
181  type(accel_kernel_t), public, target, save :: zpack
182  type(accel_kernel_t), public, target, save :: dunpack
183  type(accel_kernel_t), public, target, save :: zunpack
184  type(accel_kernel_t), public, target, save :: kernel_subarray_gather
185  type(accel_kernel_t), public, target, save :: kernel_density_real
186  type(accel_kernel_t), public, target, save :: kernel_density_complex
187  type(accel_kernel_t), public, target, save :: kernel_density_spinors
188  type(accel_kernel_t), public, target, save :: kernel_phase
189  type(accel_kernel_t), public, target, save :: kernel_phase_spiral
190  type(accel_kernel_t), public, target, save :: dkernel_dot_matrix
191  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix
192  type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors
193  type(accel_kernel_t), public, target, save :: dkernel_batch_axpy
194  type(accel_kernel_t), public, target, save :: zkernel_batch_axpy
195  type(accel_kernel_t), public, target, save :: dkernel_batch_dotp
196  type(accel_kernel_t), public, target, save :: zkernel_batch_dotp
197  type(accel_kernel_t), public, target, save :: dzmul
198  type(accel_kernel_t), public, target, save :: zzmul
199  type(accel_kernel_t), public, target, save :: set_one
200
201  ! kernels used locally
202  type(accel_kernel_t), save :: set_zero
203
204  interface accel_create_buffer
205    module procedure accel_create_buffer_4, accel_create_buffer_8
206  end interface accel_create_buffer
207
208  interface accel_write_buffer
209    module procedure iaccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0
210    module procedure iaccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1
211    module procedure iaccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2
212    module procedure iaccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3
213  end interface accel_write_buffer
214
215  interface accel_read_buffer
216    module procedure iaccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1
217    module procedure iaccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2
218    module procedure iaccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3
219  end interface accel_read_buffer
220
221  interface accel_set_kernel_arg
222    module procedure                       &
223      accel_set_kernel_arg_buffer,  &
224      iaccel_set_kernel_arg_data,   &
225      daccel_set_kernel_arg_data,   &
226      zaccel_set_kernel_arg_data,   &
227      accel_set_kernel_arg_local
228  end interface accel_set_kernel_arg
229
230  interface accel_get_device_pointer
231    module procedure iaccel_get_device_pointer_1
232    module procedure iaccel_get_device_pointer_2
233    module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1
234    module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2
235  end interface accel_get_device_pointer
236
237  type(profile_t), save :: prof_read, prof_write
238
239  integer, parameter  ::      &
240    OPENCL_GPU         = -1,  &
241    OPENCL_CPU         = -2,  &
242    OPENCL_ACCELERATOR = -3,  &
243    OPENCL_DEFAULT     = -4
244
245
246  integer, parameter  ::      &
247    CL_PLAT_INVALID   = -1,   &
248    CL_PLAT_AMD       = -2,   &
249    CL_PLAT_NVIDIA    = -3,   &
250    CL_PLAT_ATI       = -4,   &
251    CL_PLAT_INTEL     = -5
252
253  ! a "convenience" public variable
254  integer, public :: cl_status
255
256  integer :: buffer_alloc_count
257  integer(8) :: allocated_mem
258  type(accel_kernel_t), pointer :: head
259  type(alloc_cache_t) :: memcache
260
261contains
262
263  pure logical function accel_is_enabled() result(enabled)
264#ifdef HAVE_ACCEL
265    enabled = accel%enabled
266#else
267    enabled = .false.
268#endif
269  end function accel_is_enabled
270
271  ! ------------------------------------------
272
273  subroutine accel_init(base_grp, namespace)
274    type(mpi_grp_t),     intent(inout) :: base_grp
275    type(namespace_t),   intent(in)    :: namespace
276
277    logical  :: disable, default, run_benchmark
278    integer  :: idevice, iplatform
279#ifdef HAVE_OPENCL
280    integer  :: device_type
281    integer :: cl_status, idev
282    integer  :: ndevices, ret_devices, nplatforms, iplat
283    character(len=256) :: device_name
284    type(cl_platform_id) :: platform_id
285    type(cl_program) :: prog
286    type(cl_platform_id), allocatable :: allplatforms(:)
287    type(cl_device_id), allocatable :: alldevices(:)
288    type(profile_t), save :: prof_init
289#endif
290
291    PUSH_SUB(accel_init)
292
293    buffer_alloc_count = 0
294
295    !%Variable DisableAccel
296    !%Type logical
297    !%Default yes
298    !%Section Execution::Accel
299    !%Description
300    !% If Octopus was compiled with OpenCL or CUDA support, it will
301    !% try to initialize and use an accelerator device. By setting this
302    !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available.
303    !%End
304    call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel')
305#ifdef HAVE_ACCEL
306    default = .false.
307#else
308    default = .true.
309#endif
310    call parse_variable(namespace, 'DisableAccel', default, disable)
311    accel%enabled = .not. disable
312
313#ifndef HAVE_ACCEL
314    if(accel%enabled) then
315      message(1) = 'Octopus was compiled without OpenCL or Cuda support.'
316      call messages_fatal(1)
317    end if
318#endif
319
320    if(.not. accel_is_enabled()) then
321      POP_SUB(accel_init)
322      return
323    end if
324
325    !%Variable AccelPlatform
326    !%Type integer
327    !%Default 0
328    !%Section Execution::Accel
329    !%Description
330    !% This variable selects the OpenCL platform that Octopus will
331    !% use. You can give an explicit platform number or use one of
332    !% the options that select a particular vendor
333    !% implementation. Platform 0 is used by default.
334    !%
335    !% This variable has no effect for CUDA.
336    !%Option amd -2
337    !% Use the AMD OpenCL platform.
338    !%Option nvidia -3
339    !% Use the Nvidia OpenCL platform.
340    !%Option ati -4
341    !% Use the ATI (old AMD) OpenCL platform.
342    !%Option intel -5
343    !% Use the Intel OpenCL platform.
344    !%End
345    call parse_variable(namespace, 'AccelPlatform', 0, iplatform)
346
347    call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform')
348
349    !%Variable AccelDevice
350    !%Type integer
351    !%Default gpu
352    !%Section Execution::Accel
353    !%Description
354    !% This variable selects the OpenCL or CUDA accelerator device
355    !% that Octopus will use. You can specify one of the options below
356    !% or a numerical id to select a specific device.
357    !%
358    !% Values >= 0 select the device to be used. In case of MPI enabled runs
359    !% devices are distributed in a round robin fashion, starting at this value.
360    !%Option gpu -1
361    !% If available, Octopus will use a GPU.
362    !%Option cpu -2
363    !% If available, Octopus will use a CPU (only for OpenCL).
364    !%Option accelerator -3
365    !% If available, Octopus will use an accelerator (only for OpenCL).
366    !%Option accel_default -4
367    !% Octopus will use the default device specified by the implementation.
368    !% implementation.
369    !%End
370    call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice)
371
372    call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice')
373
374    if(idevice < OPENCL_DEFAULT) then
375      call messages_write('Invalid AccelDevice')
376      call messages_fatal()
377    end if
378
379    call messages_print_stress(stdout, "GPU acceleration")
380
381#ifdef HAVE_CUDA
382    if(idevice<0) idevice = 0
383    call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, &
384      idevice, base_grp%rank)
385#ifdef HAVE_MPI
386    write(message(1), '(A, I5.5, A, I5.5)') "Rank ", base_grp%rank, " uses device number ", idevice
387    call messages_info(1, all_nodes = .true.)
388#endif
389
390    ! no shared mem support in our cuda interface (for the moment)
391    accel%shared_mem = .true.
392
393    call cublas_init(accel%cublas_handle, accel%cuda_stream)
394#endif
395
396#ifdef HAVE_OPENCL
397    call profiling_in(prof_init, 'CL_INIT')
398
399    call clGetPlatformIDs(nplatforms, cl_status)
400    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")
401
402    SAFE_ALLOCATE(allplatforms(1:nplatforms))
403
404    call clGetPlatformIDs(allplatforms, iplat, cl_status)
405    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs")
406
407    call messages_write('Info: Available CL platforms: ')
408    call messages_write(nplatforms)
409    call messages_info()
410
411    do iplat = 1, nplatforms
412
413      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status)
414
415      if(iplatform < 0) then
416        if(iplatform == get_platform_id(device_name)) iplatform = iplat - 1
417      end if
418
419      if(iplatform == iplat - 1) then
420        call messages_write('    * Platform ')
421      else
422        call messages_write('      Platform ')
423      end if
424
425      call messages_write(iplat - 1)
426      call messages_write(' : '//device_name)
427      call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status)
428      call messages_write(' ('//trim(device_name)//')')
429      call messages_info()
430    end do
431
432    call messages_info()
433
434    if(iplatform >= nplatforms .or. iplatform < 0) then
435      call messages_write('Requested CL platform does not exist')
436      if(iplatform > 0) then
437        call messages_write('(platform = ')
438        call messages_write(iplatform)
439        call messages_write(').')
440      end if
441      call messages_fatal()
442    end if
443
444    platform_id = allplatforms(iplatform + 1)
445
446    SAFE_DEALLOCATE_A(allplatforms)
447
448    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status)
449
450    call messages_write('Info: Available CL devices: ')
451    call messages_write(ndevices)
452    call messages_info()
453
454    SAFE_ALLOCATE(alldevices(1:ndevices))
455
456    ! list all devices
457
458    call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
459
460    do idev = 1, ndevices
461      call messages_write('      Device ')
462      call messages_write(idev - 1)
463      call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status)
464      call messages_write(' : '//device_name)
465      call messages_info()
466    end do
467
468    select case(idevice)
469    case(OPENCL_GPU)
470      device_type = CL_DEVICE_TYPE_GPU
471    case(OPENCL_CPU)
472      device_type = CL_DEVICE_TYPE_CPU
473    case(OPENCL_ACCELERATOR)
474      device_type = CL_DEVICE_TYPE_ACCELERATOR
475    case(OPENCL_DEFAULT)
476      device_type = CL_DEVICE_TYPE_DEFAULT
477    case default
478      device_type = CL_DEVICE_TYPE_ALL
479    end select
480
481    ! now get a list of the selected type
482    call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status)
483
484    if(ret_devices < 1) then
485      ! we didnt find a device of the selected type, we ask for the default device
486      call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status)
487
488      if(ret_devices < 1) then
489        ! if this does not work, we ask for all devices
490        call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status)
491      end if
492
493      if(ret_devices < 1) then
494        call messages_write('Cannot find an OpenCL device')
495        call messages_fatal()
496      end if
497    end if
498
499    ! the number of devices can be smaller
500    ndevices = ret_devices
501
502    if(idevice < 0) then
503      if(base_grp%size > 1) then
504        ! with MPI we have to select the device so multiple GPUs in one
505        ! node are correctly distributed
506        call select_device(idevice)
507      else
508        idevice = 0
509      end if
510    end if
511
512    if(idevice >= ndevices) then
513      call messages_write('Requested CL device does not exist (device = ')
514      call messages_write(idevice)
515      call messages_write(', platform = ')
516      call messages_write(iplatform)
517      call messages_write(').')
518      call messages_fatal()
519    end if
520
521    accel%device%cl_device = alldevices(idevice + 1)
522
523    ! create the context
524    accel%context%cl_context = clCreateContext(platform_id, accel%device%cl_device, cl_status)
525    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateContext")
526
527    SAFE_DEALLOCATE_A(alldevices)
528
529    accel%command_queue = clCreateCommandQueue(accel%context%cl_context, accel%device%cl_device, &
530      CL_QUEUE_PROFILING_ENABLE, cl_status)
531    if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateCommandQueue")
532
533    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, device_type, cl_status)
534
535    select case(device_type)
536    case(CL_DEVICE_TYPE_GPU)
537      accel%shared_mem = .true.
538    case(CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_ACCELERATOR)
539      accel%shared_mem = .false.
540    case default
541      accel%shared_mem = .false.
542    end select
543
544#ifdef HAVE_CLBLAS
545    call clblasSetup(cl_status)
546    if(cl_status /= clblasSuccess) call clblas_print_error(cl_status, 'clblasSetup')
547#endif
548
549#ifdef HAVE_CLFFT
550    call clfftSetup(cl_status)
551    if(cl_status /= CLFFT_SUCCESS) call clfft_print_error(cl_status, 'clfftSetup')
552#endif
553
554    call profiling_out(prof_init)
555#endif
556
557    ! Get some device information that we will need later
558
559    ! total memory
560#ifdef HAVE_OPENCL
561    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_SIZE, accel%global_memory_size, cl_status)
562    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_LOCAL_MEM_SIZE, accel%local_memory_size, cl_status)
563    call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, accel%max_workgroup_size, cl_status)
564    accel%warp_size = 1
565#endif
566#ifdef HAVE_CUDA
567    call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size)
568    call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size)
569    call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size)
570    call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size)
571#endif
572
573    if(mpi_grp_is_root(base_grp)) call device_info()
574
575    ! initialize the cache used to speed up allocations
576    call alloc_cache_init(memcache, nint(CNST(0.25)*accel%global_memory_size, 8))
577
578    ! now initialize the kernels
579    call accel_kernel_global_init()
580
581    call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero")
582    call accel_kernel_start_call(set_one, 'set_one.cl', "set_one")
583    call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi")
584    call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors")
585    call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE')
586    call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX')
587    call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", flags = '-lineinfo -DRTYPE_DOUBLE')
588    call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", flags = '-lineinfo -DRTYPE_COMPLEX')
589    call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp", flags = '-lineinfo')
590    call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp", flags = '-lineinfo')
591    call accel_kernel_start_call(dpack, 'pack.cl', "dpack")
592    call accel_kernel_start_call(zpack, 'pack.cl', "zpack")
593    call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack")
594    call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack")
595    call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy")
596    call accel_kernel_start_call(kernel_subarray_gather, 'subarray.cl', "subarray_gather")
597    call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real")
598    call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex")
599    call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors")
600    call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase")
601    call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix")
602    call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix")
603    call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors")
604
605
606    call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE')
607    call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX')
608
609    !%Variable AccelBenchmark
610    !%Type logical
611    !%Default no
612    !%Section Execution::Accel
613    !%Description
614    !% If this variable is set to yes, Octopus will run some
615    !% routines to benchmark the performance of the accelerator device.
616    !%End
617    call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark)
618
619    call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark')
620
621    if(run_benchmark) then
622      call opencl_check_bandwidth()
623    end if
624
625    !%Variable CudaAwareMPI
626    !%Type logical
627    !%Section Execution::Accel
628    !%Description
629    !% If Octopus was compiled with CUDA support and MPI support and if the MPI
630    !% implementation is CUDA-aware (i.e., it supports communication using device pointers),
631    !% this switch can be set to true to use the CUDA-aware MPI features. The advantage
632    !% of this approach is that it can do, e.g., peer-to-peer copies between devices without
633    !% going through the host memmory.
634    !% The default is false, except when the configure switch --enable-cudampi is set, in which
635    !% case this variable is set to true.
636    !%End
637#ifdef HAVE_CUDA_MPI
638    default = .true.
639#else
640    default = .false.
641#endif
642    call parse_variable(namespace, 'CudaAwareMPI', default, accel%cuda_mpi)
643    if(accel%cuda_mpi) then
644      call messages_write("Using CUDA-aware MPI.")
645      call messages_info()
646    end if
647
648    call messages_print_stress(stdout)
649
650    POP_SUB(accel_init)
651
652  contains
653
654    subroutine select_device(idevice)
655      integer, intent(inout) :: idevice
656#if defined(HAVE_MPI) && defined(HAVE_OPENCL)
657      integer :: irank
658      character(len=256) :: device_name
659
660      PUSH_SUB(accel_init.select_device)
661
662      idevice = mod(base_grp%rank, ndevices)
663
664      call MPI_Barrier(base_grp%comm, mpi_err)
665      call messages_write('Info: CL device distribution:')
666      call messages_info()
667      do irank = 0, base_grp%size - 1
668        if(irank == base_grp%rank) then
669          call clGetDeviceInfo(alldevices(idevice + 1), CL_DEVICE_NAME, device_name, cl_status)
670          call messages_write('      MPI node ')
671          call messages_write(base_grp%rank)
672          call messages_write(' -> CL device ')
673          call messages_write(idevice)
674          call messages_write(' : '//device_name)
675          call messages_info(all_nodes = .true.)
676        end if
677        call MPI_Barrier(base_grp%comm, mpi_err)
678      end do
679#endif
680
681      POP_SUB(accel_init.select_device)
682    end subroutine select_device
683
684    subroutine device_info()
685#ifdef HAVE_OPENCL
686      integer(8) :: val
687#endif
688#ifdef HAVE_CUDA
689      integer :: version
690#endif
691      integer :: major, minor
692      character(len=256) :: val_str
693
694      PUSH_SUB(accel_init.device_info)
695
696      call messages_new_line()
697      call messages_write('Selected device:')
698      call messages_new_line()
699
700#ifdef HAVE_OPENCL
701      call messages_write('      Framework              : OpenCL')
702#endif
703#ifdef HAVE_CUDA
704      call messages_write('      Framework              : CUDA')
705#endif
706      call messages_info()
707
708#ifdef HAVE_CUDA
709      call messages_write('      Device type            : GPU', new_line = .true.)
710      call messages_write('      Device vendor          : NVIDIA Corporation', new_line = .true.)
711#endif
712
713#ifdef HAVE_OPENCL
714      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, val, cl_status)
715      call messages_write('      Device type            :')
716      select case(int(val, 4))
717      case(CL_DEVICE_TYPE_GPU)
718        call messages_write(' GPU')
719      case(CL_DEVICE_TYPE_CPU)
720        call messages_write(' CPU')
721      case(CL_DEVICE_TYPE_ACCELERATOR)
722        call messages_write(' accelerator')
723      end select
724      call messages_new_line()
725
726      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_VENDOR, val_str, cl_status)
727      call messages_write('      Device vendor          : '//trim(val_str))
728      call messages_new_line()
729#endif
730
731#ifdef HAVE_OPENCL
732      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_NAME, val_str, cl_status)
733#endif
734#ifdef HAVE_CUDA
735      call cuda_device_name(accel%device%cuda_device, val_str)
736#endif
737      call messages_write('      Device name            : '//trim(val_str))
738      call messages_new_line()
739
740#ifdef HAVE_CUDA
741      call cuda_device_capability(accel%device%cuda_device, major, minor)
742#endif
743      call messages_write('      Cuda capabilities      :')
744      call messages_write(major, fmt = '(i2)')
745      call messages_write('.')
746      call messages_write(minor, fmt = '(i1)')
747      call messages_new_line()
748
749      ! VERSION
750#ifdef HAVE_OPENCL
751      call clGetDeviceInfo(accel%device%cl_device, CL_DRIVER_VERSION, val_str, cl_status)
752      call messages_write('      Driver version         : '//trim(val_str))
753#endif
754#ifdef HAVE_CUDA
755      call cuda_driver_version(version)
756      call messages_write('      Driver version         : ')
757      call messages_write(version)
758#endif
759      call messages_new_line()
760
761
762#ifdef HAVE_OPENCL
763      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_COMPUTE_UNITS, val, cl_status)
764      call messages_write('      Compute units          :')
765      call messages_write(val)
766      call messages_new_line()
767
768      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, val, cl_status)
769      call messages_write('      Clock frequency        :')
770      call messages_write(val)
771      call messages_write(' GHz')
772      call messages_new_line()
773#endif
774
775      call messages_write('      Device memory          :')
776      call messages_write(accel%global_memory_size, units = unit_megabytes)
777      call messages_new_line()
778
779      call messages_write('      Local/shared memory    :')
780      call messages_write(accel%local_memory_size, units = unit_kilobytes)
781      call messages_new_line()
782
783
784#ifdef HAVE_OPENCL
785      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, val, cl_status)
786      call messages_write('      Max alloc size         :')
787      call messages_write(val, units = unit_megabytes)
788      call messages_new_line()
789
790      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, val, cl_status)
791      call messages_write('      Device cache           :')
792      call messages_write(val, units = unit_kilobytes)
793      call messages_new_line()
794
795      call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, val, cl_status)
796      call messages_write('      Constant memory        :')
797      call messages_write(val, units = unit_kilobytes)
798      call messages_new_line()
799#endif
800
801      call messages_write('      Max. group/block size  :')
802      call messages_write(accel%max_workgroup_size)
803      call messages_new_line()
804
805
806#ifdef HAVE_OPENCL
807      call messages_write('      Extension cl_khr_fp64  :')
808      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64"))
809      call messages_new_line()
810
811      call messages_write('      Extension cl_amd_fp64  :')
812      call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64"))
813      call messages_new_line()
814#endif
815
816      call messages_info()
817
818
819      POP_SUB(accel_init.device_info)
820    end subroutine device_info
821
822  end subroutine accel_init
823
824  ! ------------------------------------------
825
826  integer function get_platform_id(platform_name) result(platform_id)
827    character(len=*), intent(in) :: platform_name
828
829    platform_id = CL_PLAT_INVALID
830    if(index(platform_name, 'AMD') > 0)    platform_id = CL_PLAT_AMD
831    if(index(platform_name, 'ATI') > 0)    platform_id = CL_PLAT_ATI
832    if(index(platform_name, 'NVIDIA') > 0) platform_id = CL_PLAT_NVIDIA
833    if(index(platform_name, 'Intel') > 0)  platform_id = CL_PLAT_INTEL
834  end function get_platform_id
835
836  ! ------------------------------------------
837
838  subroutine accel_end()
839#ifdef HAVE_OPENCL
840    integer :: ierr
841#endif
842    integer(8) :: hits, misses
843    real(8) :: volume_hits, volume_misses
844    logical :: found
845    type(accel_mem_t) :: tmp
846
847    PUSH_SUB(accel_end)
848
849    if(accel_is_enabled()) then
850
851      do
852        call alloc_cache_get(memcache, ALLOC_CACHE_ANY_SIZE, found, tmp%mem)
853        if(.not. found) exit
854
855#ifdef HAVE_OPENCL
856        call clReleaseMemObject(tmp%mem, ierr)
857        if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject")
858#endif
859#ifdef HAVE_CUDA
860        call cuda_mem_free(tmp%mem)
861#endif
862      end do
863
864      call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses)
865
866      call messages_print_stress(stdout, "Acceleration-device allocation cache")
867
868      call messages_new_line()
869      call messages_write('    Number of allocations    =')
870      call messages_write(hits + misses, new_line = .true.)
871      call messages_write('    Volume of allocations    =')
872      call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., &
873        new_line = .true.)
874      call messages_write('    Hit ratio                =')
875      call messages_write(hits/TOFLOAT(hits + misses)*100, fmt='(f6.1)', align_left = .true.)
876      call messages_write('%', new_line = .true.)
877      call messages_write('    Volume hit ratio         =')
878      call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.)
879      call messages_write('%')
880      call messages_new_line()
881      call messages_info()
882
883      call messages_print_stress(stdout)
884    end if
885
886    call accel_kernel_global_end()
887
888#ifdef HAVE_CLBLAS
889    call clblasTearDown()
890#endif
891
892#ifdef HAVE_CLFFT
893    call clfftTearDown()
894#endif
895
896    if(accel_is_enabled()) then
897#ifdef HAVE_CUDA
898      call cublas_end(accel%cublas_handle)
899      call cuda_end(accel%context%cuda_context, accel%device%cuda_device)
900#endif
901
902#ifdef HAVE_OPENCL
903      call clReleaseCommandQueue(accel%command_queue, ierr)
904
905      if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "ReleaseCommandQueue")
906      call clReleaseContext(accel%context%cl_context, cl_status)
907#endif
908
909      if(buffer_alloc_count /= 0) then
910        call messages_write('Accel:')
911        call messages_write(TOFLOAT(allocated_mem), fmt = 'f12.1', units = unit_megabytes, align_left = .true.)
912        call messages_write(' in ')
913        call messages_write(buffer_alloc_count)
914        call messages_write(' buffers were not deallocated.')
915        call messages_fatal()
916      end if
917
918    end if
919
920    POP_SUB(accel_end)
921  end subroutine accel_end
922
923  ! ------------------------------------------
924
925  elemental subroutine accel_mem_nullify(this)
926    type(accel_mem_t), intent(out) :: this
927
928    !> To be implemented.
929    this%size = 0
930    this%flags = 0
931    this%allocated = .false.
932
933  end subroutine accel_mem_nullify
934
935  ! ------------------------------------------
936
937  integer function accel_padded_size(nn) result(psize)
938    integer,        intent(in) :: nn
939
940    integer :: modnn, bsize
941
942    psize = nn
943
944    if(accel_is_enabled()) then
945
946      bsize = accel_max_workgroup_size()
947
948      psize = nn
949      modnn = mod(nn, bsize)
950      if(modnn /= 0) psize = psize + bsize - modnn
951
952    end if
953
954  end function accel_padded_size
955
956  ! ------------------------------------------
957
958  subroutine accel_create_buffer_4(this, flags, type, size)
959    type(accel_mem_t),  intent(inout) :: this
960    integer,            intent(in)    :: flags
961    type(type_t),       intent(in)    :: type
962    integer,            intent(in)    :: size
963
964    call accel_create_buffer_8(this, flags, type, int(size, 8))
965  end subroutine accel_create_buffer_4
966
967  ! ------------------------------------------
968
969  subroutine accel_create_buffer_8(this, flags, type, size)
970    type(accel_mem_t),  intent(inout) :: this
971    integer,            intent(in)    :: flags
972    type(type_t),       intent(in)    :: type
973    integer(8),         intent(in)    :: size
974
975    integer(8) :: fsize
976    logical    :: found
977#ifdef HAVE_OPENCL
978    integer :: ierr
979#endif
980
981    PUSH_SUB(accel_create_buffer_8)
982
983    this%type = type
984    this%size = size
985    this%flags = flags
986    fsize = int(size, 8)*types_get_size(type)
987    this%allocated = .true.
988
989    if(fsize > 0) then
990
991      call alloc_cache_get(memcache, fsize, found, this%mem)
992
993      if(.not. found) then
994#ifdef HAVE_OPENCL
995        this%mem = clCreateBuffer(accel%context%cl_context, flags, fsize, ierr)
996        if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateBuffer")
997#endif
998#ifdef HAVE_CUDA
999        call cuda_mem_alloc(this%mem, fsize)
1000#endif
1001      end if
1002
1003      INCR(buffer_alloc_count, 1)
1004      INCR(allocated_mem, fsize)
1005
1006    end if
1007
1008    POP_SUB(accel_create_buffer_8)
1009  end subroutine accel_create_buffer_8
1010
1011  ! ------------------------------------------
1012
1013  subroutine accel_release_buffer(this)
1014    type(accel_mem_t), intent(inout) :: this
1015
1016#ifdef HAVE_OPENCL
1017    integer :: ierr
1018#endif
1019    logical :: put
1020    integer(8) :: fsize
1021
1022    PUSH_SUB(accel_release_buffer)
1023
1024    if(this%size > 0) then
1025
1026      fsize = int(this%size, 8)*types_get_size(this%type)
1027
1028      call alloc_cache_put(memcache, fsize, this%mem, put)
1029
1030      if(.not. put) then
1031#ifdef HAVE_OPENCL
1032        call clReleaseMemObject(this%mem, ierr)
1033        if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject")
1034#endif
1035#ifdef HAVE_CUDA
1036        call cuda_mem_free(this%mem)
1037#endif
1038      end if
1039
1040      INCR(buffer_alloc_count, -1)
1041      INCR(allocated_mem, fsize)
1042
1043    end if
1044
1045    this%size = 0
1046    this%flags = 0
1047
1048    this%allocated = .false.
1049
1050    POP_SUB(accel_release_buffer)
1051  end subroutine accel_release_buffer
1052
1053  ! ------------------------------------------
1054
1055  logical pure function accel_buffer_is_allocated(this) result(allocated)
1056    type(accel_mem_t), intent(in) :: this
1057
1058    allocated = this%allocated
1059  end function accel_buffer_is_allocated
1060
1061  ! ------------------------------------------
1062
1063  integer(SIZEOF_SIZE_T) pure function opencl_get_buffer_size(this) result(size)
1064    type(accel_mem_t), intent(in) :: this
1065
1066    size = this%size
1067  end function opencl_get_buffer_size
1068
1069  ! -----------------------------------------
1070
1071  type(type_t) pure function opencl_get_buffer_type(this) result(type)
1072    type(accel_mem_t), intent(in) :: this
1073
1074    type = this%type
1075  end function opencl_get_buffer_type
1076
1077  ! -----------------------------------------
1078
1079  subroutine accel_finish()
1080#ifdef HAVE_OPENCL
1081    integer :: ierr
1082#endif
1083
1084    ! no push_sub, called too frequently
1085
1086    if(accel_is_enabled()) then
1087#ifdef HAVE_OPENCL
1088      call clFinish(accel%command_queue, ierr)
1089      if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, 'clFinish')
1090#endif
1091#ifdef HAVE_CUDA
1092      call cuda_context_synchronize()
1093#endif
1094    end if
1095  end subroutine accel_finish
1096
1097  ! ------------------------------------------
1098
1099  subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer)
1100    type(accel_kernel_t), intent(inout) :: kernel
1101    integer,              intent(in)    :: narg
1102    type(accel_mem_t),    intent(in)    :: buffer
1103
1104#ifdef HAVE_OPENCL
1105    integer :: ierr
1106#endif
1107
1108    ASSERT(accel_buffer_is_allocated(buffer))
1109
1110    ! no push_sub, called too frequently
1111#ifdef HAVE_OPENCL
1112    call clSetKernelArg(kernel%kernel, narg, buffer%mem, ierr)
1113    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clSetKernelArg_buf")
1114#endif
1115
1116#ifdef HAVE_CUDA
1117    call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg)
1118#endif
1119
1120  end subroutine accel_set_kernel_arg_buffer
1121
1122  ! ------------------------------------------
1123
1124  subroutine accel_set_kernel_arg_local(kernel, narg, type, size)
1125    type(accel_kernel_t), intent(inout) :: kernel
1126    integer,              intent(in)    :: narg
1127    type(type_t),         intent(in)    :: type
1128    integer,              intent(in)    :: size
1129
1130#ifdef HAVE_OPENCL
1131    integer :: ierr
1132#endif
1133    integer(8) :: size_in_bytes
1134
1135    PUSH_SUB(accel_set_kernel_arg_local)
1136
1137
1138    size_in_bytes = int(size, 8)*types_get_size(type)
1139
1140    if(size_in_bytes > accel%local_memory_size) then
1141      write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", TOFLOAT(size_in_bytes)/1024.0, " Kb"
1142      write(message(2), '(a,f12.6,a)') "          available local memory: ", TOFLOAT(accel%local_memory_size)/1024.0, " Kb"
1143      call messages_fatal(2)
1144    else if(size_in_bytes <= 0) then
1145      write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes
1146      call messages_fatal(1)
1147    end if
1148
1149#ifdef HAVE_CUDA
1150    kernel%cuda_shared_mem = size_in_bytes
1151#endif
1152
1153#ifdef HAVE_OPENCL
1154    call clSetKernelArgLocal(kernel%kernel, narg, size_in_bytes, ierr)
1155    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "set_kernel_arg_local")
1156#endif
1157
1158    POP_SUB(accel_set_kernel_arg_local)
1159  end subroutine accel_set_kernel_arg_local
1160
1161  ! ------------------------------------------
1162
1163  subroutine accel_kernel_run(kernel, globalsizes, localsizes)
1164    type(accel_kernel_t), intent(inout) :: kernel
1165    integer,              intent(in)    :: globalsizes(:)
1166    integer,              intent(in)    :: localsizes(:)
1167
1168    integer :: dim
1169#ifdef HAVE_OPENCL
1170    integer :: ierr
1171#endif
1172    integer(8) :: gsizes(1:3)
1173    integer(8) :: lsizes(1:3)
1174
1175    ! no push_sub, called too frequently
1176
1177    ! cuda needs all dimensions
1178    gsizes = 1
1179    lsizes = 1
1180
1181    dim = ubound(globalsizes, dim = 1)
1182
1183    ASSERT(dim == ubound(localsizes, dim = 1))
1184
1185    ! if one size is zero, there is nothing to do
1186    if(any(globalsizes == 0)) return
1187
1188    ASSERT(all(localsizes > 0))
1189    ASSERT(all(localsizes <= accel_max_workgroup_size()))
1190    ASSERT(all(mod(globalsizes, localsizes) == 0))
1191
1192    gsizes(1:dim) = int(globalsizes(1:dim), 8)
1193    lsizes(1:dim) = int(localsizes(1:dim), 8)
1194
1195#ifdef HAVE_OPENCL
1196    call clEnqueueNDRangeKernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr)
1197    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1198#endif
1199
1200#ifdef HAVE_CUDA
1201    gsizes(1:3) = gsizes(1:3)/lsizes(1:3)
1202
1203    ASSERT(gsizes(1) < 2_8**31 - 1_8)
1204    ASSERT(all(gsizes(2:3) <= 65535_8))
1205
1206    call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments)
1207
1208    kernel%cuda_shared_mem = 0
1209#endif
1210
1211  end subroutine accel_kernel_run
1212
1213  ! -----------------------------------------------
1214
1215  integer pure function accel_max_workgroup_size() result(max_workgroup_size)
1216    max_workgroup_size = accel%max_workgroup_size
1217  end function accel_max_workgroup_size
1218
1219  ! -----------------------------------------------
1220
1221  integer function accel_kernel_workgroup_size(kernel) result(workgroup_size)
1222    type(accel_kernel_t), intent(inout) :: kernel
1223
1224#ifdef HAVE_OPENCL
1225    integer(8) :: workgroup_size8
1226    integer :: ierr
1227#endif
1228
1229    workgroup_size = 0
1230
1231#ifdef HAVE_OPENCL
1232    call clGetKernelWorkGroupInfo(kernel%kernel, accel%device%cl_device, CL_KERNEL_WORK_GROUP_SIZE, workgroup_size8, ierr)
1233    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel")
1234    workgroup_size = workgroup_size8
1235#endif
1236
1237#ifdef HAVE_CUDA
1238    workgroup_size = accel%max_workgroup_size
1239#endif
1240
1241  end function accel_kernel_workgroup_size
1242
1243  ! -----------------------------------------------
1244
1245#ifdef HAVE_OPENCL
1246  subroutine opencl_build_program(prog, filename, flags)
1247    type(cl_program),           intent(inout) :: prog
1248    character(len=*),           intent(in)    :: filename
1249    character(len=*), optional, intent(in)    :: flags
1250
1251    character(len = 1000) :: string
1252    character(len = 256) :: share_string
1253    integer :: ierr, ierrlog, iunit, irec, newlen
1254
1255    PUSH_SUB(opencl_build_program)
1256
1257    string = '#include "'//trim(filename)//'"'
1258
1259    if(debug%info) then
1260      call messages_write("Building CL program '"//trim(filename)//"'.")
1261      call messages_info()
1262    end if
1263
1264    prog = clCreateProgramWithSource(accel%context%cl_context, trim(string), ierr)
1265    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateProgramWithSource")
1266
1267    ! build the compilation flags
1268    string='-w'
1269    ! full optimization
1270    string=trim(string)//' -cl-denorms-are-zero'
1271    ! The following flag gives an error with the Xeon Phi
1272    !    string=trim(string)//' -cl-strict-aliasing'
1273    string=trim(string)//' -cl-mad-enable'
1274    string=trim(string)//' -cl-unsafe-math-optimizations'
1275    string=trim(string)//' -cl-finite-math-only'
1276    string=trim(string)//' -cl-fast-relaxed-math'
1277
1278    share_string='-I'//trim(conf%share)//'/opencl/'
1279
1280    if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then
1281      string = trim(string)//' -DEXT_KHR_FP64'
1282    else if(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then
1283      string = trim(string)//' -DEXT_AMD_FP64'
1284    else
1285      call messages_write('Octopus requires an OpenCL device with double-precision support.')
1286      call messages_fatal()
1287    end if
1288
1289    if(accel_use_shared_mem()) then
1290      string = trim(string)//' -DSHARED_MEM'
1291    end if
1292
1293    if(present(flags)) then
1294      string = trim(string)//' '//trim(flags)
1295    end if
1296
1297    if(debug%info) then
1298      call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.)
1299      call messages_write('  '//trim(share_string)//"'.")
1300      call messages_info()
1301    end if
1302
1303    string = trim(string)//' '//trim(share_string)
1304
1305    call clBuildProgram(prog, trim(string), ierr)
1306
1307    call clGetProgramBuildInfo(prog, accel%device%cl_device, CL_PROGRAM_BUILD_LOG, string, ierrlog)
1308    if(ierrlog /= CL_SUCCESS) call opencl_print_error(ierrlog, "clGetProgramBuildInfo")
1309
1310    ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it
1311    newlen = scan(string, achar(010), back = .true.) - 1
1312    if(newlen >= 0) string = string(1:newlen)
1313
1314    if(len(trim(string)) > 0) write(stderr, '(a)') trim(string)
1315
1316    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clBuildProgram")
1317
1318    POP_SUB(opencl_build_program)
1319  end subroutine opencl_build_program
1320#endif
1321
1322  ! -----------------------------------------------
1323#ifdef HAVE_OPENCL
1324  subroutine opencl_release_program(prog)
1325    type(cl_program),    intent(inout) :: prog
1326
1327    integer :: ierr
1328
1329    PUSH_SUB(opencl_release_program)
1330
1331    call clReleaseProgram(prog, ierr)
1332    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseProgram")
1333
1334    POP_SUB(opencl_release_program)
1335  end subroutine opencl_release_program
1336#endif
1337
1338  ! -----------------------------------------------
1339
1340#ifdef HAVE_OPENCL
1341  subroutine opencl_release_kernel(prog)
1342    type(cl_kernel),      intent(inout) :: prog
1343
1344    integer :: ierr
1345
1346    PUSH_SUB(opencl_release_kernel)
1347
1348#ifdef HAVE_OPENCL
1349    call clReleaseKernel(prog, ierr)
1350    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseKernel")
1351#endif
1352
1353    POP_SUB(opencl_release_kernel)
1354  end subroutine opencl_release_kernel
1355#endif
1356
1357#ifdef HAVE_OPENCL
1358  ! -----------------------------------------------
1359  subroutine opencl_create_kernel(kernel, prog, name)
1360    type(cl_kernel),  intent(inout) :: kernel
1361    type(cl_program), intent(inout) :: prog
1362    character(len=*), intent(in)    :: name
1363
1364    integer :: ierr
1365    type(profile_t), save :: prof
1366
1367    PUSH_SUB(opencl_create_kernel)
1368    call profiling_in(prof, "CL_BUILD_KERNEL", exclude = .true.)
1369
1370#ifdef HAVE_OPENCL
1371    kernel = clCreateKernel(prog, name, ierr)
1372    if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateKernel")
1373#endif
1374
1375    call profiling_out(prof)
1376    POP_SUB(opencl_create_kernel)
1377  end subroutine opencl_create_kernel
1378#endif
1379
1380  ! ------------------------------------------------
1381
1382  subroutine opencl_print_error(ierr, name)
1383    integer,          intent(in) :: ierr
1384    character(len=*), intent(in) :: name
1385
1386    character(len=40) :: errcode
1387
1388    PUSH_SUB(opencl_print_error)
1389
1390#ifdef HAVE_OPENCL
1391    select case(ierr)
1392    case(CL_SUCCESS); errcode = 'CL_SUCCESS '
1393    case(CL_DEVICE_NOT_FOUND); errcode = 'CL_DEVICE_NOT_FOUND '
1394    case(CL_DEVICE_NOT_AVAILABLE); errcode = 'CL_DEVICE_NOT_AVAILABLE '
1395    case(CL_COMPILER_NOT_AVAILABLE); errcode = 'CL_COMPILER_NOT_AVAILABLE '
1396    case(CL_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE '
1397    case(CL_OUT_OF_RESOURCES); errcode = 'CL_OUT_OF_RESOURCES '
1398    case(CL_OUT_OF_HOST_MEMORY); errcode = 'CL_OUT_OF_HOST_MEMORY '
1399    case(CL_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE '
1400    case(CL_MEM_COPY_OVERLAP); errcode = 'CL_MEM_COPY_OVERLAP '
1401    case(CL_IMAGE_FORMAT_MISMATCH); errcode = 'CL_IMAGE_FORMAT_MISMATCH '
1402    case(CL_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED '
1403    case(CL_BUILD_PROGRAM_FAILURE); errcode = 'CL_BUILD_PROGRAM_FAILURE '
1404    case(CL_MAP_FAILURE); errcode = 'CL_MAP_FAILURE '
1405    case(CL_INVALID_VALUE); errcode = 'CL_INVALID_VALUE '
1406    case(CL_INVALID_DEVICE_TYPE); errcode = 'CL_INVALID_DEVICE_TYPE '
1407    case(CL_INVALID_PLATFORM); errcode = 'CL_INVALID_PLATFORM '
1408    case(CL_INVALID_DEVICE); errcode = 'CL_INVALID_DEVICE '
1409    case(CL_INVALID_CONTEXT); errcode = 'CL_INVALID_CONTEXT '
1410    case(CL_INVALID_QUEUE_PROPERTIES); errcode = 'CL_INVALID_QUEUE_PROPERTIES '
1411    case(CL_INVALID_COMMAND_QUEUE); errcode = 'CL_INVALID_COMMAND_QUEUE '
1412    case(CL_INVALID_HOST_PTR); errcode = 'CL_INVALID_HOST_PTR '
1413    case(CL_INVALID_MEM_OBJECT); errcode = 'CL_INVALID_MEM_OBJECT '
1414    case(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR '
1415    case(CL_INVALID_IMAGE_SIZE); errcode = 'CL_INVALID_IMAGE_SIZE '
1416    case(CL_INVALID_SAMPLER); errcode = 'CL_INVALID_SAMPLER '
1417    case(CL_INVALID_BINARY); errcode = 'CL_INVALID_BINARY '
1418    case(CL_INVALID_BUILD_OPTIONS); errcode = 'CL_INVALID_BUILD_OPTIONS '
1419    case(CL_INVALID_PROGRAM); errcode = 'CL_INVALID_PROGRAM '
1420    case(CL_INVALID_PROGRAM_EXECUTABLE); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE '
1421    case(CL_INVALID_KERNEL_NAME); errcode = 'CL_INVALID_KERNEL_NAME '
1422    case(CL_INVALID_KERNEL_DEFINITION); errcode = 'CL_INVALID_KERNEL_DEFINITION '
1423    case(CL_INVALID_KERNEL); errcode = 'CL_INVALID_KERNEL '
1424    case(CL_INVALID_ARG_INDEX); errcode = 'CL_INVALID_ARG_INDEX '
1425    case(CL_INVALID_ARG_VALUE); errcode = 'CL_INVALID_ARG_VALUE '
1426    case(CL_INVALID_ARG_SIZE); errcode = 'CL_INVALID_ARG_SIZE '
1427    case(CL_INVALID_KERNEL_ARGS); errcode = 'CL_INVALID_KERNEL_ARGS '
1428    case(CL_INVALID_WORK_DIMENSION); errcode = 'CL_INVALID_WORK_DIMENSION '
1429    case(CL_INVALID_WORK_GROUP_SIZE); errcode = 'CL_INVALID_WORK_GROUP_SIZE '
1430    case(CL_INVALID_WORK_ITEM_SIZE); errcode = 'CL_INVALID_WORK_ITEM_SIZE '
1431    case(CL_INVALID_GLOBAL_OFFSET); errcode = 'CL_INVALID_GLOBAL_OFFSET '
1432    case(CL_INVALID_EVENT_WAIT_LIST); errcode = 'CL_INVALID_EVENT_WAIT_LIST '
1433    case(CL_INVALID_EVENT); errcode = 'CL_INVALID_EVENT '
1434    case(CL_INVALID_OPERATION); errcode = 'CL_INVALID_OPERATION '
1435    case(CL_INVALID_GL_OBJECT); errcode = 'CL_INVALID_GL_OBJECT '
1436    case(CL_INVALID_BUFFER_SIZE); errcode = 'CL_INVALID_BUFFER_SIZE '
1437    case(CL_INVALID_MIP_LEVEL); errcode = 'CL_INVALID_MIP_LEVEL '
1438    case(CL_INVALID_GLOBAL_WORK_SIZE); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE '
1439    case(CL_PLATFORM_NOT_FOUND_KHR); errcode = 'CL_PLATFORM_NOT_FOUND_KHR'
1440    case default
1441      write(errcode, '(i10)') ierr
1442      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1443    end select
1444#endif
1445
1446    message(1) = 'OpenCL '//trim(name)//' '//trim(errcode)
1447    call messages_fatal(1)
1448
1449    POP_SUB(opencl_print_error)
1450  end subroutine opencl_print_error
1451
1452  ! ----------------------------------------------------
1453
1454  subroutine clblas_print_error(ierr, name)
1455    integer,          intent(in) :: ierr
1456    character(len=*), intent(in) :: name
1457
1458    character(len=40) :: errcode
1459
1460    PUSH_SUB(clblas_print_error)
1461#ifdef HAVE_CLBLAS
1462    select case(ierr)
1463    case(clblasSuccess);                    errcode = 'clblasSuccess'
1464    case(clblasInvalidValue);               errcode = 'clblasInvalidValue'
1465    case(clblasInvalidCommandQueue);        errcode = 'clblasInvalidCommandQueue'
1466    case(clblasInvalidContext);             errcode = 'clblasInvalidContext'
1467    case(clblasInvalidMemObject);           errcode = 'clblasInvalidMemObject'
1468    case(clblasInvalidDevice);              errcode = 'clblasInvalidDevice'
1469    case(clblasInvalidEventWaitList);       errcode = 'clblasInvalidEventWaitList'
1470    case(clblasOutOfResources);             errcode = 'clblasOutOfResources'
1471    case(clblasOutOfHostMemory);            errcode = 'clblasOutOfHostMemory'
1472    case(clblasInvalidOperation);           errcode = 'clblasInvalidOperation'
1473    case(clblasCompilerNotAvailable);       errcode = 'clblasCompilerNotAvailable'
1474    case(clblasBuildProgramFailure );       errcode = 'clblasBuildProgramFailure'
1475    case(clblasNotImplemented);             errcode = 'clblasNotImplemented'
1476    case(clblasNotInitialized);             errcode = 'clblasNotInitialized'
1477    case(clblasInvalidMatA);                errcode = 'clblasInvalidMatA'
1478    case(clblasInvalidMatB);                errcode = 'clblasInvalidMatB'
1479    case(clblasInvalidMatC);                errcode = 'clblasInvalidMatC'
1480    case(clblasInvalidVecX);                errcode = 'clblasInvalidVecX'
1481    case(clblasInvalidVecY);                errcode = 'clblasInvalidVecY'
1482    case(clblasInvalidDim);                 errcode = 'clblasInvalidDim'
1483    case(clblasInvalidLeadDimA);            errcode = 'clblasInvalidLeadDimA'
1484    case(clblasInvalidLeadDimB);            errcode = 'clblasInvalidLeadDimB'
1485    case(clblasInvalidLeadDimC);            errcode = 'clblasInvalidLeadDimC'
1486    case(clblasInvalidIncX);                errcode = 'clblasInvalidIncX'
1487    case(clblasInvalidIncY);                errcode = 'clblasInvalidIncY'
1488    case(clblasInsufficientMemMatA);        errcode = 'clblasInsufficientMemMatA'
1489    case(clblasInsufficientMemMatB);        errcode = 'clblasInsufficientMemMatB'
1490    case(clblasInsufficientMemMatC);        errcode = 'clblasInsufficientMemMatC'
1491    case(clblasInsufficientMemVecX);        errcode = 'clblasInsufficientMemVecX'
1492    case(clblasInsufficientMemVecY);        errcode = 'clblasInsufficientMemVecY'
1493    case default
1494      write(errcode, '(i10)') ierr
1495      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1496    end select
1497#endif
1498
1499    message(1) = 'clblas '//trim(name)//' '//trim(errcode)
1500    call messages_fatal(1)
1501
1502    POP_SUB(clblas_print_error)
1503  end subroutine clblas_print_error
1504
1505  ! ----------------------------------------------------
1506  subroutine clfft_print_error(ierr, name)
1507    integer,          intent(in) :: ierr
1508    character(len=*), intent(in) :: name
1509
1510    character(len=40) :: errcode
1511
1512    PUSH_SUB(clfft_print_error)
1513#ifdef HAVE_CLFFT
1514    select case(ierr)
1515    case(CLFFT_INVALID_GLOBAL_WORK_SIZE);          errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE'
1516    case(CLFFT_INVALID_MIP_LEVEL);                 errcode = 'CLFFT_INVALID_MIP_LEVEL'
1517    case(CLFFT_INVALID_BUFFER_SIZE);               errcode = 'CLFFT_INVALID_BUFFER_SIZE'
1518    case(CLFFT_INVALID_GL_OBJECT);                 errcode = 'CLFFT_INVALID_GL_OBJECT'
1519    case(CLFFT_INVALID_OPERATION);                 errcode = 'CLFFT_INVALID_OPERATION'
1520    case(CLFFT_INVALID_EVENT);                     errcode = 'CLFFT_INVALID_EVENT'
1521    case(CLFFT_INVALID_EVENT_WAIT_LIST);           errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST'
1522    case(CLFFT_INVALID_GLOBAL_OFFSET);             errcode = 'CLFFT_INVALID_GLOBAL_OFFSET'
1523    case(CLFFT_INVALID_WORK_ITEM_SIZE);            errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE'
1524    case(CLFFT_INVALID_WORK_GROUP_SIZE);           errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE'
1525    case(CLFFT_INVALID_WORK_DIMENSION);            errcode = 'CLFFT_INVALID_WORK_DIMENSION'
1526    case(CLFFT_INVALID_KERNEL_ARGS);               errcode = 'CLFFT_INVALID_KERNEL_ARGS'
1527    case(CLFFT_INVALID_ARG_SIZE);                  errcode = 'CLFFT_INVALID_ARG_SIZE'
1528    case(CLFFT_INVALID_ARG_VALUE);                 errcode = 'CLFFT_INVALID_ARG_VALUE'
1529    case(CLFFT_INVALID_ARG_INDEX);                 errcode = 'CLFFT_INVALID_ARG_INDEX'
1530    case(CLFFT_INVALID_KERNEL);                    errcode = 'CLFFT_INVALID_KERNEL'
1531    case(CLFFT_INVALID_KERNEL_DEFINITION);         errcode = 'CLFFT_INVALID_KERNEL_DEFINITION'
1532    case(CLFFT_INVALID_KERNEL_NAME);               errcode = 'CLFFT_INVALID_KERNEL_NAME'
1533    case(CLFFT_INVALID_PROGRAM_EXECUTABLE);        errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE'
1534    case(CLFFT_INVALID_PROGRAM);                   errcode = 'CLFFT_INVALID_PROGRAM'
1535    case(CLFFT_INVALID_BUILD_OPTIONS);             errcode = 'CLFFT_INVALID_BUILD_OPTIONS'
1536    case(CLFFT_INVALID_BINARY);                    errcode = 'CLFFT_INVALID_BINARY'
1537    case(CLFFT_INVALID_SAMPLER);                   errcode = 'CLFFT_INVALID_SAMPLER'
1538    case(CLFFT_INVALID_IMAGE_SIZE);                errcode = 'CLFFT_INVALID_IMAGE_SIZE'
1539    case(CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR);   errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR'
1540    case(CLFFT_INVALID_MEM_OBJECT);                errcode = 'CLFFT_INVALID_MEM_OBJECT'
1541    case(CLFFT_INVALID_HOST_PTR);                  errcode = 'CLFFT_INVALID_HOST_PTR'
1542    case(CLFFT_INVALID_COMMAND_QUEUE);             errcode = 'CLFFT_INVALID_COMMAND_QUEUE'
1543    case(CLFFT_INVALID_QUEUE_PROPERTIES);          errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES'
1544    case(CLFFT_INVALID_CONTEXT);                   errcode = 'CLFFT_INVALID_CONTEXT'
1545    case(CLFFT_INVALID_DEVICE);                    errcode = 'CLFFT_INVALID_DEVICE'
1546    case(CLFFT_INVALID_PLATFORM);                  errcode = 'CLFFT_INVALID_PLATFORM'
1547    case(CLFFT_INVALID_DEVICE_TYPE);               errcode = 'CLFFT_INVALID_DEVICE_TYPE'
1548    case(CLFFT_INVALID_VALUE);                     errcode = 'CLFFT_INVALID_VALUE'
1549    case(CLFFT_MAP_FAILURE);                       errcode = 'CLFFT_MAP_FAILURE'
1550    case(CLFFT_BUILD_PROGRAM_FAILURE);             errcode = 'CLFFT_BUILD_PROGRAM_FAILURE'
1551    case(CLFFT_IMAGE_FORMAT_NOT_SUPPORTED);        errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED'
1552    case(CLFFT_IMAGE_FORMAT_MISMATCH);             errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH'
1553    case(CLFFT_MEM_COPY_OVERLAP);                  errcode = 'CLFFT_MEM_COPY_OVERLAP'
1554    case(CLFFT_PROFILING_INFO_NOT_AVAILABLE);      errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE'
1555    case(CLFFT_OUT_OF_HOST_MEMORY);                errcode = 'CLFFT_OUT_OF_HOST_MEMORY'
1556    case(CLFFT_OUT_OF_RESOURCES);                  errcode = 'CLFFT_OUT_OF_RESOURCES'
1557    case(CLFFT_MEM_OBJECT_ALLOCATION_FAILURE);     errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE'
1558    case(CLFFT_COMPILER_NOT_AVAILABLE);            errcode = 'CLFFT_COMPILER_NOT_AVAILABLE'
1559    case(CLFFT_DEVICE_NOT_AVAILABLE);              errcode = 'CLFFT_DEVICE_NOT_AVAILABLE'
1560    case(CLFFT_DEVICE_NOT_FOUND);                  errcode = 'CLFFT_DEVICE_NOT_FOUND'
1561    case(CLFFT_SUCCESS);                           errcode = 'CLFFT_SUCCESS'
1562    case(CLFFT_BUGCHECK);                          errcode = 'CLFFT_BUGCHECK'
1563    case(CLFFT_NOTIMPLEMENTED);                    errcode = 'CLFFT_NOTIMPLEMENTED'
1564    case(CLFFT_FILE_NOT_FOUND);                    errcode = 'CLFFT_FILE_NOT_FOUND'
1565    case(CLFFT_FILE_CREATE_FAILURE);               errcode = 'CLFFT_FILE_CREATE_FAILURE'
1566    case(CLFFT_VERSION_MISMATCH);                  errcode = 'CLFFT_VERSION_MISMATCH'
1567    case(CLFFT_INVALID_PLAN);                      errcode = 'CLFFT_INVALID_PLAN'
1568    case(CLFFT_DEVICE_NO_DOUBLE);                  errcode = 'CLFFT_DEVICE_NO_DOUBLE'
1569    case(CLFFT_ENDSTATUS);                         errcode = 'CLFFT_ENDSTATUS'
1570    case default
1571      write(errcode, '(i10)') ierr
1572      errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')'
1573    end select
1574#endif
1575
1576    message(1) = 'clfft '//trim(name)//' '//trim(errcode)
1577    call messages_fatal(1)
1578
1579    POP_SUB(clfft_print_error)
1580  end subroutine clfft_print_error
1581
1582  ! ----------------------------------------------------
1583
1584#ifdef HAVE_OPENCL
1585  logical function f90_cl_device_has_extension(device, extension) result(has)
1586    type(cl_device_id), intent(inout) :: device
1587    character(len=*),   intent(in)    :: extension
1588
1589    integer :: cl_status
1590    character(len=2048) :: all_extensions
1591
1592#ifdef HAVE_OPENCL
1593    call clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, all_extensions, cl_status)
1594#endif
1595
1596    has = index(all_extensions, extension) /= 0
1597
1598  end function f90_cl_device_has_extension
1599#endif
1600
1601  ! ---------------------------------------------------------
1602
1603  integer pure function opencl_pad(size, blk) result(pad)
1604    integer, intent(in) :: size
1605    integer, intent(in) :: blk
1606
1607    integer :: mm
1608
1609    mm = mod(size, blk)
1610    if(mm == 0) then
1611      pad = size
1612    else
1613      pad = size + blk - mm
1614    end if
1615  end function opencl_pad
1616
1617  ! ----------------------------------------------------
1618
1619  subroutine accel_set_buffer_to_zero(buffer, type, nval, offset)
1620    type(accel_mem_t),  intent(inout) :: buffer
1621    type(type_t),       intent(in)    :: type
1622    integer,            intent(in)    :: nval
1623    integer, optional,  intent(in)    :: offset
1624
1625    integer :: nval_real, bsize, offset_real
1626
1627    PUSH_SUB(accel_set_buffer_to_zero)
1628
1629    ASSERT(type == TYPE_CMPLX .or. type == TYPE_FLOAT)
1630
1631    if(nval > 0) then
1632
1633      nval_real = nval*(types_get_size(type)/8)
1634      offset_real = optional_default(offset, 0)*(types_get_size(type)/8)
1635
1636      ASSERT(nval_real > 0)
1637
1638      call accel_set_kernel_arg(set_zero, 0, nval_real)
1639      call accel_set_kernel_arg(set_zero, 1, offset_real)
1640      call accel_set_kernel_arg(set_zero, 2, buffer)
1641
1642      bsize = accel_kernel_workgroup_size(set_zero)
1643
1644
1645      call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ bsize /))
1646      call accel_finish()
1647
1648    end if
1649
1650    POP_SUB(accel_set_buffer_to_zero)
1651  end subroutine accel_set_buffer_to_zero
1652
1653  ! ----------------------------------------------------
1654
1655  subroutine opencl_check_bandwidth()
1656    integer :: itime
1657    integer, parameter :: times = 10
1658    integer :: size
1659    FLOAT   :: time, stime
1660    FLOAT   :: read_bw, write_bw
1661    type(accel_mem_t) :: buff
1662    FLOAT, allocatable :: data(:)
1663
1664    call messages_new_line()
1665    call messages_write('Info: Benchmarking the bandwidth between main memory and device memory')
1666    call messages_new_line()
1667    call messages_info()
1668
1669    call messages_write(' Buffer size   Read bw  Write bw')
1670    call messages_new_line()
1671    call messages_write('       [MiB]   [MiB/s]   [MiB/s]')
1672    call messages_info()
1673
1674    size = 15000
1675    do
1676      SAFE_ALLOCATE(data(1:size))
1677      call accel_create_buffer(buff, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, size)
1678
1679      stime = loct_clock()
1680      do itime = 1, times
1681        call accel_write_buffer(buff, size, data)
1682        call accel_finish()
1683      end do
1684      time = (loct_clock() - stime)/TOFLOAT(times)
1685
1686      write_bw = TOFLOAT(size)*CNST(8.0)/time
1687
1688      stime = loct_clock()
1689      do itime = 1, times
1690        call accel_read_buffer(buff, size, data)
1691      end do
1692      call accel_finish()
1693
1694      time = (loct_clock() - stime)/TOFLOAT(times)
1695      read_bw = TOFLOAT(size)*CNST(8.0)/time
1696
1697      call messages_write(size*CNST(8.0)/CNST(1024.0)**2)
1698      call messages_write(write_bw/CNST(1024.0)**2, fmt = '(f10.1)')
1699      call messages_write(read_bw/CNST(1024.0)**2, fmt = '(f10.1)')
1700      call messages_info()
1701
1702      call accel_release_buffer(buff)
1703
1704      SAFE_DEALLOCATE_A(data)
1705
1706      size = int(size*2.0)
1707
1708      if(size > 50000000) exit
1709    end do
1710  end subroutine opencl_check_bandwidth
1711
1712  ! ----------------------------------------------------
1713
1714  logical pure function accel_use_shared_mem() result(use_shared_mem)
1715
1716    use_shared_mem = accel%shared_mem
1717
1718  end function accel_use_shared_mem
1719
1720  !------------------------------------------------------------
1721
1722  subroutine accel_kernel_global_init()
1723
1724    PUSH_SUB(accel_kernel_global_init)
1725
1726    nullify(head)
1727
1728    call cuda_module_map_init(accel%module_map)
1729
1730    POP_SUB(accel_kernel_global_init)
1731  end subroutine accel_kernel_global_init
1732
1733  !------------------------------------------------------------
1734
1735  subroutine accel_kernel_global_end()
1736    type(accel_kernel_t), pointer :: next_head
1737
1738    PUSH_SUB(accel_kernel_global_end)
1739
1740    do
1741      if(.not. associated(head)) exit
1742      next_head => head%next
1743      call accel_kernel_end(head)
1744      head => next_head
1745    end do
1746
1747    if(accel_is_enabled()) then
1748      call cuda_module_map_end(accel%module_map)
1749    end if
1750
1751    POP_SUB(accel_kernel_global_end)
1752  end subroutine accel_kernel_global_end
1753
1754  !------------------------------------------------------------
1755
1756  subroutine accel_kernel_build(this, file_name, kernel_name, flags)
1757    type(accel_kernel_t),        intent(inout) :: this
1758    character(len=*),            intent(in)    :: file_name
1759    character(len=*),            intent(in)    :: kernel_name
1760    character(len=*), optional,  intent(in)    :: flags
1761
1762    type(profile_t), save :: prof
1763#ifdef HAVE_OPENCL
1764    type(cl_program) :: prog
1765#endif
1766#ifdef HAVE_CUDA
1767    character(len=1000) :: all_flags
1768    type(c_ptr) :: cuda_module
1769#endif
1770
1771    PUSH_SUB(accel_kernel_build)
1772
1773    call profiling_in(prof, "ACCEL_COMPILE", exclude = .true.)
1774
1775#ifdef HAVE_CUDA
1776    all_flags = '-I'//trim(conf%share)//'/opencl/'
1777
1778    if(accel_use_shared_mem()) then
1779      all_flags = trim(all_flags)//' -DSHARED_MEM'
1780    end if
1781
1782    if(present(flags)) then
1783      all_flags = trim(all_flags)//' '//trim(flags)
1784    end if
1785
1786    call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags))
1787
1788    call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name))
1789    call cuda_alloc_arg_array(this%arguments)
1790
1791    this%cuda_shared_mem = 0
1792#endif
1793
1794#ifdef HAVE_OPENCL
1795    call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags)
1796    call opencl_create_kernel(this%kernel, prog, trim(kernel_name))
1797    call opencl_release_program(prog)
1798#endif
1799
1800    this%initialized = .true.
1801
1802    call profiling_out(prof)
1803
1804    POP_SUB(accel_kernel_build)
1805  end subroutine accel_kernel_build
1806
1807  !------------------------------------------------------------
1808
1809  subroutine accel_kernel_end(this)
1810    type(accel_kernel_t), intent(inout) :: this
1811#ifdef HAVE_OPENCL
1812    integer :: ierr
1813#endif
1814
1815      PUSH_SUB(accel_kernel_end)
1816
1817#ifdef HAVE_CUDA
1818      call cuda_free_arg_array(this%arguments)
1819      call cuda_release_kernel(this%cuda_kernel)
1820      ! modules are not released here, since they are not associated to a kernel
1821#endif
1822
1823#ifdef HAVE_OPENCL
1824      call clReleaseKernel(this%kernel, ierr)
1825      if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "release_kernel")
1826#endif
1827      this%initialized = .false.
1828
1829      POP_SUB(accel_kernel_end)
1830  end subroutine accel_kernel_end
1831
1832  !------------------------------------------------------------
1833
1834  subroutine accel_kernel_start_call(this, file_name, kernel_name, flags)
1835    type(accel_kernel_t), target, intent(inout) :: this
1836    character(len=*),             intent(in)    :: file_name
1837    character(len=*),             intent(in)    :: kernel_name
1838    character(len=*), optional,   intent(in)    :: flags
1839
1840    PUSH_SUB(accel_kernel_start_call)
1841
1842    if(.not. this%initialized) then
1843      call accel_kernel_build(this, file_name, kernel_name, flags)
1844      this%next => head
1845      head => this
1846    end if
1847
1848    POP_SUB(accel_kernel_start_call)
1849  end subroutine accel_kernel_start_call
1850
1851  !--------------------------------------------------------------
1852
1853  integer(8) pure function accel_global_memory_size() result(size)
1854
1855    size = accel%global_memory_size
1856
1857  end function accel_global_memory_size
1858
1859  !--------------------------------------------------------------
1860
1861  integer(8) pure function accel_local_memory_size() result(size)
1862
1863    size = accel%local_memory_size
1864
1865  end function accel_local_memory_size
1866
1867  !--------------------------------------------------------------
1868
1869  integer pure function accel_max_size_per_dim(dim) result(size)
1870    integer, intent(in) :: dim
1871
1872    size = 0
1873#ifdef HAVE_OPENCL
1874    size = 2**30
1875#endif
1876#ifdef HAVE_CUDA
1877    if(dim == 1) size = 2**30
1878    size = 32768
1879#endif
1880  end function accel_max_size_per_dim
1881
1882  ! ------------------------------------------------------
1883
1884  subroutine accel_set_stream(stream_number)
1885    integer, intent(in) :: stream_number
1886
1887    PUSH_SUB(accel_set_stream)
1888
1889    if(accel_is_enabled()) then
1890#ifdef HAVE_CUDA
1891      call cuda_set_stream(accel%cuda_stream, stream_number)
1892      call cublas_set_stream(accel%cublas_handle, accel%cuda_stream)
1893#endif
1894    end if
1895
1896    POP_SUB(accel_set_stream)
1897  end subroutine accel_set_stream
1898
1899  ! ------------------------------------------------------
1900
1901  subroutine accel_synchronize_all_streams()
1902    PUSH_SUB(accel_synchronize_all_streams)
1903
1904#ifdef HAVE_CUDA
1905    call cuda_synchronize_all_streams()
1906#endif
1907
1908    POP_SUB(accel_synchronize_all_streams)
1909  end subroutine accel_synchronize_all_streams
1910
1911#include "undef.F90"
1912#include "real.F90"
1913#include "accel_inc.F90"
1914
1915#include "undef.F90"
1916#include "complex.F90"
1917#include "accel_inc.F90"
1918
1919#include "undef.F90"
1920#include "integer.F90"
1921#include "accel_inc.F90"
1922
1923end module accel_oct_m
1924
1925!! Local Variables:
1926!! mode: f90
1927!! coding: utf-8
1928!! End:
1929