1!! Copyright (C) 2010-2016 X. Andrade 2!! 3!! This program is free software; you can redistribute it and/or modify 4!! it under the terms of the GNU General Public License as published by 5!! the Free Software Foundation; either version 2, or (at your option) 6!! any later version. 7!! 8!! This program is distributed in the hope that it will be useful, 9!! but WITHOUT ANY WARRANTY; without even the implied warranty of 10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11!! GNU General Public License for more details. 12!! 13!! You should have received a copy of the GNU General Public License 14!! along with this program; if not, write to the Free Software 15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 16!! 02110-1301, USA. 17!! 18 19#include "global.h" 20 21#if defined(HAVE_OPENCL) && defined(HAVE_CUDA) 22#error "Cannot compile with OpenCL and Cuda support at the same time" 23#endif 24 25#if defined(HAVE_OPENCL) || defined(HAVE_CUDA) 26#define HAVE_ACCEL 1 27#endif 28 29module accel_oct_m 30 use alloc_cache_oct_m 31#ifdef HAVE_OPENCL 32 use cl 33#endif 34#ifdef HAVE_CLBLAS 35 use clblas 36#endif 37 use cuda_oct_m 38#ifdef HAVE_CLFFT 39 use clfft 40#endif 41 use global_oct_m 42 use iso_c_binding 43 use loct_oct_m 44 use messages_oct_m 45 use mpi_oct_m 46 use namespace_oct_m 47 use types_oct_m 48 use parser_oct_m 49 use profiling_oct_m 50 use unit_system_oct_m 51 52 implicit none 53 54 private 55 56 public :: & 57 accel_context_t, & 58 accel_device_t, & 59 accel_mem_t, & 60 accel_kernel_t, & 61 accel_t, & 62 accel_is_enabled, & 63 accel_init, & 64 accel_end, & 65 accel_padded_size, & 66 accel_mem_nullify, & 67 accel_kernel_start_call, & 68 accel_kernel_build, & 69 accel_create_buffer, & 70 accel_write_buffer, & 71 accel_read_buffer, & 72 accel_release_buffer, & 73 accel_buffer_is_allocated, & 74 accel_finish, & 75 accel_set_kernel_arg, & 76 accel_max_workgroup_size, & 77 accel_kernel_workgroup_size, & 78 accel_kernel_run, & 79 accel_set_buffer_to_zero, & 80 accel_use_shared_mem, & 81 clblas_print_error, & 82 clfft_print_error, & 83 accel_local_memory_size, & 84 accel_global_memory_size, & 85 accel_max_size_per_dim, & 86 accel_get_device_pointer, & 87 accel_set_stream, & 88 accel_synchronize_all_streams 89 90#ifdef HAVE_OPENCL 91 integer, public, parameter :: & 92 ACCEL_MEM_READ_ONLY = CL_MEM_READ_ONLY, & 93 ACCEL_MEM_READ_WRITE = CL_MEM_READ_WRITE, & 94 ACCEL_MEM_WRITE_ONLY = CL_MEM_WRITE_ONLY 95#else 96 integer, public, parameter :: & 97 ACCEL_MEM_READ_ONLY = 0, & 98 ACCEL_MEM_READ_WRITE = 1, & 99 ACCEL_MEM_WRITE_ONLY = 2 100#endif 101 102 type accel_context_t 103 ! Components are public by default 104#ifdef HAVE_OPENCL 105 type(cl_context) :: cl_context 106#elif defined(HAVE_CUDA) 107 type(c_ptr) :: cuda_context 108#else 109 integer :: dummy 110#endif 111 end type accel_context_t 112 113 type accel_device_t 114 ! Components are public by default 115#ifdef HAVE_OPENCL 116 type(cl_device_id) :: cl_device 117#elif defined(HAVE_CUDA) 118 type(c_ptr) :: cuda_device 119#else 120 integer :: dummy 121#endif 122 end type accel_device_t 123 124 type accel_t 125 ! Components are public by default 126 type(accel_context_t) :: context 127 type(accel_device_t) :: device 128#ifdef HAVE_OPENCL 129 type(cl_command_queue) :: command_queue 130#endif 131 type(c_ptr) :: cublas_handle 132 type(c_ptr) :: cuda_stream 133 type(c_ptr) :: module_map 134 integer :: max_workgroup_size 135 integer(8) :: local_memory_size 136 integer(8) :: global_memory_size 137 logical :: enabled 138 logical :: shared_mem 139 logical :: cuda_mpi 140 integer :: warp_size 141 end type accel_t 142 143 type accel_mem_t 144 ! Components are public by default 145#ifdef HAVE_OPENCL 146 type(cl_mem) :: mem 147#else 148 type(c_ptr) :: mem 149#endif 150 integer(SIZEOF_SIZE_T) :: size 151 type(type_t) :: type 152 integer :: flags 153 logical :: allocated 154 end type accel_mem_t 155 156 type accel_kernel_t 157 ! Components are public by default 158#ifdef HAVE_OPENCL 159 type(cl_kernel) :: kernel 160#endif 161#ifdef HAVE_CUDA 162 type(c_ptr) :: cuda_kernel 163 type(c_ptr) :: cuda_module 164 type(c_ptr) :: arguments 165#endif 166 integer(8) :: cuda_shared_mem 167 logical :: initialized = .false. 168 type(accel_kernel_t), pointer :: next 169 integer :: arg_count 170 end type accel_kernel_t 171 172 type(accel_t), public :: accel 173 174 ! the kernels 175 type(accel_kernel_t), public, target, save :: kernel_vpsi 176 type(accel_kernel_t), public, target, save :: kernel_vpsi_spinors 177 type(accel_kernel_t), public, target, save :: kernel_daxpy 178 type(accel_kernel_t), public, target, save :: kernel_zaxpy 179 type(accel_kernel_t), public, target, save :: kernel_copy 180 type(accel_kernel_t), public, target, save :: dpack 181 type(accel_kernel_t), public, target, save :: zpack 182 type(accel_kernel_t), public, target, save :: dunpack 183 type(accel_kernel_t), public, target, save :: zunpack 184 type(accel_kernel_t), public, target, save :: kernel_subarray_gather 185 type(accel_kernel_t), public, target, save :: kernel_density_real 186 type(accel_kernel_t), public, target, save :: kernel_density_complex 187 type(accel_kernel_t), public, target, save :: kernel_density_spinors 188 type(accel_kernel_t), public, target, save :: kernel_phase 189 type(accel_kernel_t), public, target, save :: kernel_phase_spiral 190 type(accel_kernel_t), public, target, save :: dkernel_dot_matrix 191 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix 192 type(accel_kernel_t), public, target, save :: zkernel_dot_matrix_spinors 193 type(accel_kernel_t), public, target, save :: dkernel_batch_axpy 194 type(accel_kernel_t), public, target, save :: zkernel_batch_axpy 195 type(accel_kernel_t), public, target, save :: dkernel_batch_dotp 196 type(accel_kernel_t), public, target, save :: zkernel_batch_dotp 197 type(accel_kernel_t), public, target, save :: dzmul 198 type(accel_kernel_t), public, target, save :: zzmul 199 type(accel_kernel_t), public, target, save :: set_one 200 201 ! kernels used locally 202 type(accel_kernel_t), save :: set_zero 203 204 interface accel_create_buffer 205 module procedure accel_create_buffer_4, accel_create_buffer_8 206 end interface accel_create_buffer 207 208 interface accel_write_buffer 209 module procedure iaccel_write_buffer_0, daccel_write_buffer_0, zaccel_write_buffer_0 210 module procedure iaccel_write_buffer_1, daccel_write_buffer_1, zaccel_write_buffer_1 211 module procedure iaccel_write_buffer_2, daccel_write_buffer_2, zaccel_write_buffer_2 212 module procedure iaccel_write_buffer_3, daccel_write_buffer_3, zaccel_write_buffer_3 213 end interface accel_write_buffer 214 215 interface accel_read_buffer 216 module procedure iaccel_read_buffer_1, daccel_read_buffer_1, zaccel_read_buffer_1 217 module procedure iaccel_read_buffer_2, daccel_read_buffer_2, zaccel_read_buffer_2 218 module procedure iaccel_read_buffer_3, daccel_read_buffer_3, zaccel_read_buffer_3 219 end interface accel_read_buffer 220 221 interface accel_set_kernel_arg 222 module procedure & 223 accel_set_kernel_arg_buffer, & 224 iaccel_set_kernel_arg_data, & 225 daccel_set_kernel_arg_data, & 226 zaccel_set_kernel_arg_data, & 227 accel_set_kernel_arg_local 228 end interface accel_set_kernel_arg 229 230 interface accel_get_device_pointer 231 module procedure iaccel_get_device_pointer_1 232 module procedure iaccel_get_device_pointer_2 233 module procedure daccel_get_device_pointer_1, zaccel_get_device_pointer_1 234 module procedure daccel_get_device_pointer_2, zaccel_get_device_pointer_2 235 end interface accel_get_device_pointer 236 237 type(profile_t), save :: prof_read, prof_write 238 239 integer, parameter :: & 240 OPENCL_GPU = -1, & 241 OPENCL_CPU = -2, & 242 OPENCL_ACCELERATOR = -3, & 243 OPENCL_DEFAULT = -4 244 245 246 integer, parameter :: & 247 CL_PLAT_INVALID = -1, & 248 CL_PLAT_AMD = -2, & 249 CL_PLAT_NVIDIA = -3, & 250 CL_PLAT_ATI = -4, & 251 CL_PLAT_INTEL = -5 252 253 ! a "convenience" public variable 254 integer, public :: cl_status 255 256 integer :: buffer_alloc_count 257 integer(8) :: allocated_mem 258 type(accel_kernel_t), pointer :: head 259 type(alloc_cache_t) :: memcache 260 261contains 262 263 pure logical function accel_is_enabled() result(enabled) 264#ifdef HAVE_ACCEL 265 enabled = accel%enabled 266#else 267 enabled = .false. 268#endif 269 end function accel_is_enabled 270 271 ! ------------------------------------------ 272 273 subroutine accel_init(base_grp, namespace) 274 type(mpi_grp_t), intent(inout) :: base_grp 275 type(namespace_t), intent(in) :: namespace 276 277 logical :: disable, default, run_benchmark 278 integer :: idevice, iplatform 279#ifdef HAVE_OPENCL 280 integer :: device_type 281 integer :: cl_status, idev 282 integer :: ndevices, ret_devices, nplatforms, iplat 283 character(len=256) :: device_name 284 type(cl_platform_id) :: platform_id 285 type(cl_program) :: prog 286 type(cl_platform_id), allocatable :: allplatforms(:) 287 type(cl_device_id), allocatable :: alldevices(:) 288 type(profile_t), save :: prof_init 289#endif 290 291 PUSH_SUB(accel_init) 292 293 buffer_alloc_count = 0 294 295 !%Variable DisableAccel 296 !%Type logical 297 !%Default yes 298 !%Section Execution::Accel 299 !%Description 300 !% If Octopus was compiled with OpenCL or CUDA support, it will 301 !% try to initialize and use an accelerator device. By setting this 302 !% variable to <tt>yes</tt> you force Octopus not to use an accelerator even it is available. 303 !%End 304 call messages_obsolete_variable(namespace, 'DisableOpenCL', 'DisableAccel') 305#ifdef HAVE_ACCEL 306 default = .false. 307#else 308 default = .true. 309#endif 310 call parse_variable(namespace, 'DisableAccel', default, disable) 311 accel%enabled = .not. disable 312 313#ifndef HAVE_ACCEL 314 if(accel%enabled) then 315 message(1) = 'Octopus was compiled without OpenCL or Cuda support.' 316 call messages_fatal(1) 317 end if 318#endif 319 320 if(.not. accel_is_enabled()) then 321 POP_SUB(accel_init) 322 return 323 end if 324 325 !%Variable AccelPlatform 326 !%Type integer 327 !%Default 0 328 !%Section Execution::Accel 329 !%Description 330 !% This variable selects the OpenCL platform that Octopus will 331 !% use. You can give an explicit platform number or use one of 332 !% the options that select a particular vendor 333 !% implementation. Platform 0 is used by default. 334 !% 335 !% This variable has no effect for CUDA. 336 !%Option amd -2 337 !% Use the AMD OpenCL platform. 338 !%Option nvidia -3 339 !% Use the Nvidia OpenCL platform. 340 !%Option ati -4 341 !% Use the ATI (old AMD) OpenCL platform. 342 !%Option intel -5 343 !% Use the Intel OpenCL platform. 344 !%End 345 call parse_variable(namespace, 'AccelPlatform', 0, iplatform) 346 347 call messages_obsolete_variable(namespace, 'OpenCLPlatform', 'AccelPlatform') 348 349 !%Variable AccelDevice 350 !%Type integer 351 !%Default gpu 352 !%Section Execution::Accel 353 !%Description 354 !% This variable selects the OpenCL or CUDA accelerator device 355 !% that Octopus will use. You can specify one of the options below 356 !% or a numerical id to select a specific device. 357 !% 358 !% Values >= 0 select the device to be used. In case of MPI enabled runs 359 !% devices are distributed in a round robin fashion, starting at this value. 360 !%Option gpu -1 361 !% If available, Octopus will use a GPU. 362 !%Option cpu -2 363 !% If available, Octopus will use a CPU (only for OpenCL). 364 !%Option accelerator -3 365 !% If available, Octopus will use an accelerator (only for OpenCL). 366 !%Option accel_default -4 367 !% Octopus will use the default device specified by the implementation. 368 !% implementation. 369 !%End 370 call parse_variable(namespace, 'AccelDevice', OPENCL_GPU, idevice) 371 372 call messages_obsolete_variable(namespace, 'OpenCLDevice', 'AccelDevice') 373 374 if(idevice < OPENCL_DEFAULT) then 375 call messages_write('Invalid AccelDevice') 376 call messages_fatal() 377 end if 378 379 call messages_print_stress(stdout, "GPU acceleration") 380 381#ifdef HAVE_CUDA 382 if(idevice<0) idevice = 0 383 call cuda_init(accel%context%cuda_context, accel%device%cuda_device, accel%cuda_stream, & 384 idevice, base_grp%rank) 385#ifdef HAVE_MPI 386 write(message(1), '(A, I5.5, A, I5.5)') "Rank ", base_grp%rank, " uses device number ", idevice 387 call messages_info(1, all_nodes = .true.) 388#endif 389 390 ! no shared mem support in our cuda interface (for the moment) 391 accel%shared_mem = .true. 392 393 call cublas_init(accel%cublas_handle, accel%cuda_stream) 394#endif 395 396#ifdef HAVE_OPENCL 397 call profiling_in(prof_init, 'CL_INIT') 398 399 call clGetPlatformIDs(nplatforms, cl_status) 400 if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs") 401 402 SAFE_ALLOCATE(allplatforms(1:nplatforms)) 403 404 call clGetPlatformIDs(allplatforms, iplat, cl_status) 405 if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "GetPlatformIDs") 406 407 call messages_write('Info: Available CL platforms: ') 408 call messages_write(nplatforms) 409 call messages_info() 410 411 do iplat = 1, nplatforms 412 413 call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_NAME, device_name, cl_status) 414 415 if(iplatform < 0) then 416 if(iplatform == get_platform_id(device_name)) iplatform = iplat - 1 417 end if 418 419 if(iplatform == iplat - 1) then 420 call messages_write(' * Platform ') 421 else 422 call messages_write(' Platform ') 423 end if 424 425 call messages_write(iplat - 1) 426 call messages_write(' : '//device_name) 427 call clGetPlatformInfo(allplatforms(iplat), CL_PLATFORM_VERSION, device_name, cl_status) 428 call messages_write(' ('//trim(device_name)//')') 429 call messages_info() 430 end do 431 432 call messages_info() 433 434 if(iplatform >= nplatforms .or. iplatform < 0) then 435 call messages_write('Requested CL platform does not exist') 436 if(iplatform > 0) then 437 call messages_write('(platform = ') 438 call messages_write(iplatform) 439 call messages_write(').') 440 end if 441 call messages_fatal() 442 end if 443 444 platform_id = allplatforms(iplatform + 1) 445 446 SAFE_DEALLOCATE_A(allplatforms) 447 448 call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, ndevices, cl_status) 449 450 call messages_write('Info: Available CL devices: ') 451 call messages_write(ndevices) 452 call messages_info() 453 454 SAFE_ALLOCATE(alldevices(1:ndevices)) 455 456 ! list all devices 457 458 call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status) 459 460 do idev = 1, ndevices 461 call messages_write(' Device ') 462 call messages_write(idev - 1) 463 call clGetDeviceInfo(alldevices(idev), CL_DEVICE_NAME, device_name, cl_status) 464 call messages_write(' : '//device_name) 465 call messages_info() 466 end do 467 468 select case(idevice) 469 case(OPENCL_GPU) 470 device_type = CL_DEVICE_TYPE_GPU 471 case(OPENCL_CPU) 472 device_type = CL_DEVICE_TYPE_CPU 473 case(OPENCL_ACCELERATOR) 474 device_type = CL_DEVICE_TYPE_ACCELERATOR 475 case(OPENCL_DEFAULT) 476 device_type = CL_DEVICE_TYPE_DEFAULT 477 case default 478 device_type = CL_DEVICE_TYPE_ALL 479 end select 480 481 ! now get a list of the selected type 482 call clGetDeviceIDs(platform_id, device_type, alldevices, ret_devices, cl_status) 483 484 if(ret_devices < 1) then 485 ! we didnt find a device of the selected type, we ask for the default device 486 call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, alldevices, ret_devices, cl_status) 487 488 if(ret_devices < 1) then 489 ! if this does not work, we ask for all devices 490 call clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, alldevices, ret_devices, cl_status) 491 end if 492 493 if(ret_devices < 1) then 494 call messages_write('Cannot find an OpenCL device') 495 call messages_fatal() 496 end if 497 end if 498 499 ! the number of devices can be smaller 500 ndevices = ret_devices 501 502 if(idevice < 0) then 503 if(base_grp%size > 1) then 504 ! with MPI we have to select the device so multiple GPUs in one 505 ! node are correctly distributed 506 call select_device(idevice) 507 else 508 idevice = 0 509 end if 510 end if 511 512 if(idevice >= ndevices) then 513 call messages_write('Requested CL device does not exist (device = ') 514 call messages_write(idevice) 515 call messages_write(', platform = ') 516 call messages_write(iplatform) 517 call messages_write(').') 518 call messages_fatal() 519 end if 520 521 accel%device%cl_device = alldevices(idevice + 1) 522 523 ! create the context 524 accel%context%cl_context = clCreateContext(platform_id, accel%device%cl_device, cl_status) 525 if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateContext") 526 527 SAFE_DEALLOCATE_A(alldevices) 528 529 accel%command_queue = clCreateCommandQueue(accel%context%cl_context, accel%device%cl_device, & 530 CL_QUEUE_PROFILING_ENABLE, cl_status) 531 if(cl_status /= CL_SUCCESS) call opencl_print_error(cl_status, "CreateCommandQueue") 532 533 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, device_type, cl_status) 534 535 select case(device_type) 536 case(CL_DEVICE_TYPE_GPU) 537 accel%shared_mem = .true. 538 case(CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_ACCELERATOR) 539 accel%shared_mem = .false. 540 case default 541 accel%shared_mem = .false. 542 end select 543 544#ifdef HAVE_CLBLAS 545 call clblasSetup(cl_status) 546 if(cl_status /= clblasSuccess) call clblas_print_error(cl_status, 'clblasSetup') 547#endif 548 549#ifdef HAVE_CLFFT 550 call clfftSetup(cl_status) 551 if(cl_status /= CLFFT_SUCCESS) call clfft_print_error(cl_status, 'clfftSetup') 552#endif 553 554 call profiling_out(prof_init) 555#endif 556 557 ! Get some device information that we will need later 558 559 ! total memory 560#ifdef HAVE_OPENCL 561 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_SIZE, accel%global_memory_size, cl_status) 562 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_LOCAL_MEM_SIZE, accel%local_memory_size, cl_status) 563 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, accel%max_workgroup_size, cl_status) 564 accel%warp_size = 1 565#endif 566#ifdef HAVE_CUDA 567 call cuda_device_total_memory(accel%device%cuda_device, accel%global_memory_size) 568 call cuda_device_shared_memory(accel%device%cuda_device, accel%local_memory_size) 569 call cuda_device_max_threads_per_block(accel%device%cuda_device, accel%max_workgroup_size) 570 call cuda_device_get_warpsize(accel%device%cuda_device, accel%warp_size) 571#endif 572 573 if(mpi_grp_is_root(base_grp)) call device_info() 574 575 ! initialize the cache used to speed up allocations 576 call alloc_cache_init(memcache, nint(CNST(0.25)*accel%global_memory_size, 8)) 577 578 ! now initialize the kernels 579 call accel_kernel_global_init() 580 581 call accel_kernel_start_call(set_zero, 'set_zero.cl', "set_zero") 582 call accel_kernel_start_call(set_one, 'set_one.cl', "set_one") 583 call accel_kernel_start_call(kernel_vpsi, 'vpsi.cl', "vpsi") 584 call accel_kernel_start_call(kernel_vpsi_spinors, 'vpsi.cl', "vpsi_spinors") 585 call accel_kernel_start_call(kernel_daxpy, 'axpy.cl', "daxpy", flags = '-DRTYPE_DOUBLE') 586 call accel_kernel_start_call(kernel_zaxpy, 'axpy.cl', "zaxpy", flags = '-DRTYPE_COMPLEX') 587 call accel_kernel_start_call(dkernel_batch_axpy, 'axpy.cl', "dbatch_axpy_function", flags = '-lineinfo -DRTYPE_DOUBLE') 588 call accel_kernel_start_call(zkernel_batch_axpy, 'axpy.cl', "zbatch_axpy_function", flags = '-lineinfo -DRTYPE_COMPLEX') 589 call accel_kernel_start_call(dkernel_batch_dotp, 'mesh_batch_single.cl', "dbatch_mf_dotp", flags = '-lineinfo') 590 call accel_kernel_start_call(zkernel_batch_dotp, 'mesh_batch_single.cl', "zbatch_mf_dotp", flags = '-lineinfo') 591 call accel_kernel_start_call(dpack, 'pack.cl', "dpack") 592 call accel_kernel_start_call(zpack, 'pack.cl', "zpack") 593 call accel_kernel_start_call(dunpack, 'pack.cl', "dunpack") 594 call accel_kernel_start_call(zunpack, 'pack.cl', "zunpack") 595 call accel_kernel_start_call(kernel_copy, 'copy.cl', "copy") 596 call accel_kernel_start_call(kernel_subarray_gather, 'subarray.cl', "subarray_gather") 597 call accel_kernel_start_call(kernel_density_real, 'density.cl', "density_real") 598 call accel_kernel_start_call(kernel_density_complex, 'density.cl', "density_complex") 599 call accel_kernel_start_call(kernel_density_spinors, 'density.cl', "density_spinors") 600 call accel_kernel_start_call(kernel_phase, 'phase.cl', "phase") 601 call accel_kernel_start_call(dkernel_dot_matrix, 'mesh_batch.cl', "ddot_matrix") 602 call accel_kernel_start_call(zkernel_dot_matrix, 'mesh_batch.cl', "zdot_matrix") 603 call accel_kernel_start_call(zkernel_dot_matrix_spinors, 'mesh_batch.cl', "zdot_matrix_spinors") 604 605 606 call accel_kernel_start_call(dzmul, 'mul.cl', "dzmul", flags = '-DRTYPE_DOUBLE') 607 call accel_kernel_start_call(zzmul, 'mul.cl', "zzmul", flags = '-DRTYPE_COMPLEX') 608 609 !%Variable AccelBenchmark 610 !%Type logical 611 !%Default no 612 !%Section Execution::Accel 613 !%Description 614 !% If this variable is set to yes, Octopus will run some 615 !% routines to benchmark the performance of the accelerator device. 616 !%End 617 call parse_variable(namespace, 'AccelBenchmark', .false., run_benchmark) 618 619 call messages_obsolete_variable(namespace, 'OpenCLBenchmark', 'AccelBenchmark') 620 621 if(run_benchmark) then 622 call opencl_check_bandwidth() 623 end if 624 625 !%Variable CudaAwareMPI 626 !%Type logical 627 !%Section Execution::Accel 628 !%Description 629 !% If Octopus was compiled with CUDA support and MPI support and if the MPI 630 !% implementation is CUDA-aware (i.e., it supports communication using device pointers), 631 !% this switch can be set to true to use the CUDA-aware MPI features. The advantage 632 !% of this approach is that it can do, e.g., peer-to-peer copies between devices without 633 !% going through the host memmory. 634 !% The default is false, except when the configure switch --enable-cudampi is set, in which 635 !% case this variable is set to true. 636 !%End 637#ifdef HAVE_CUDA_MPI 638 default = .true. 639#else 640 default = .false. 641#endif 642 call parse_variable(namespace, 'CudaAwareMPI', default, accel%cuda_mpi) 643 if(accel%cuda_mpi) then 644 call messages_write("Using CUDA-aware MPI.") 645 call messages_info() 646 end if 647 648 call messages_print_stress(stdout) 649 650 POP_SUB(accel_init) 651 652 contains 653 654 subroutine select_device(idevice) 655 integer, intent(inout) :: idevice 656#if defined(HAVE_MPI) && defined(HAVE_OPENCL) 657 integer :: irank 658 character(len=256) :: device_name 659 660 PUSH_SUB(accel_init.select_device) 661 662 idevice = mod(base_grp%rank, ndevices) 663 664 call MPI_Barrier(base_grp%comm, mpi_err) 665 call messages_write('Info: CL device distribution:') 666 call messages_info() 667 do irank = 0, base_grp%size - 1 668 if(irank == base_grp%rank) then 669 call clGetDeviceInfo(alldevices(idevice + 1), CL_DEVICE_NAME, device_name, cl_status) 670 call messages_write(' MPI node ') 671 call messages_write(base_grp%rank) 672 call messages_write(' -> CL device ') 673 call messages_write(idevice) 674 call messages_write(' : '//device_name) 675 call messages_info(all_nodes = .true.) 676 end if 677 call MPI_Barrier(base_grp%comm, mpi_err) 678 end do 679#endif 680 681 POP_SUB(accel_init.select_device) 682 end subroutine select_device 683 684 subroutine device_info() 685#ifdef HAVE_OPENCL 686 integer(8) :: val 687#endif 688#ifdef HAVE_CUDA 689 integer :: version 690#endif 691 integer :: major, minor 692 character(len=256) :: val_str 693 694 PUSH_SUB(accel_init.device_info) 695 696 call messages_new_line() 697 call messages_write('Selected device:') 698 call messages_new_line() 699 700#ifdef HAVE_OPENCL 701 call messages_write(' Framework : OpenCL') 702#endif 703#ifdef HAVE_CUDA 704 call messages_write(' Framework : CUDA') 705#endif 706 call messages_info() 707 708#ifdef HAVE_CUDA 709 call messages_write(' Device type : GPU', new_line = .true.) 710 call messages_write(' Device vendor : NVIDIA Corporation', new_line = .true.) 711#endif 712 713#ifdef HAVE_OPENCL 714 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_TYPE, val, cl_status) 715 call messages_write(' Device type :') 716 select case(int(val, 4)) 717 case(CL_DEVICE_TYPE_GPU) 718 call messages_write(' GPU') 719 case(CL_DEVICE_TYPE_CPU) 720 call messages_write(' CPU') 721 case(CL_DEVICE_TYPE_ACCELERATOR) 722 call messages_write(' accelerator') 723 end select 724 call messages_new_line() 725 726 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_VENDOR, val_str, cl_status) 727 call messages_write(' Device vendor : '//trim(val_str)) 728 call messages_new_line() 729#endif 730 731#ifdef HAVE_OPENCL 732 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_NAME, val_str, cl_status) 733#endif 734#ifdef HAVE_CUDA 735 call cuda_device_name(accel%device%cuda_device, val_str) 736#endif 737 call messages_write(' Device name : '//trim(val_str)) 738 call messages_new_line() 739 740#ifdef HAVE_CUDA 741 call cuda_device_capability(accel%device%cuda_device, major, minor) 742#endif 743 call messages_write(' Cuda capabilities :') 744 call messages_write(major, fmt = '(i2)') 745 call messages_write('.') 746 call messages_write(minor, fmt = '(i1)') 747 call messages_new_line() 748 749 ! VERSION 750#ifdef HAVE_OPENCL 751 call clGetDeviceInfo(accel%device%cl_device, CL_DRIVER_VERSION, val_str, cl_status) 752 call messages_write(' Driver version : '//trim(val_str)) 753#endif 754#ifdef HAVE_CUDA 755 call cuda_driver_version(version) 756 call messages_write(' Driver version : ') 757 call messages_write(version) 758#endif 759 call messages_new_line() 760 761 762#ifdef HAVE_OPENCL 763 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_COMPUTE_UNITS, val, cl_status) 764 call messages_write(' Compute units :') 765 call messages_write(val) 766 call messages_new_line() 767 768 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, val, cl_status) 769 call messages_write(' Clock frequency :') 770 call messages_write(val) 771 call messages_write(' GHz') 772 call messages_new_line() 773#endif 774 775 call messages_write(' Device memory :') 776 call messages_write(accel%global_memory_size, units = unit_megabytes) 777 call messages_new_line() 778 779 call messages_write(' Local/shared memory :') 780 call messages_write(accel%local_memory_size, units = unit_kilobytes) 781 call messages_new_line() 782 783 784#ifdef HAVE_OPENCL 785 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, val, cl_status) 786 call messages_write(' Max alloc size :') 787 call messages_write(val, units = unit_megabytes) 788 call messages_new_line() 789 790 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, val, cl_status) 791 call messages_write(' Device cache :') 792 call messages_write(val, units = unit_kilobytes) 793 call messages_new_line() 794 795 call clGetDeviceInfo(accel%device%cl_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, val, cl_status) 796 call messages_write(' Constant memory :') 797 call messages_write(val, units = unit_kilobytes) 798 call messages_new_line() 799#endif 800 801 call messages_write(' Max. group/block size :') 802 call messages_write(accel%max_workgroup_size) 803 call messages_new_line() 804 805 806#ifdef HAVE_OPENCL 807 call messages_write(' Extension cl_khr_fp64 :') 808 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) 809 call messages_new_line() 810 811 call messages_write(' Extension cl_amd_fp64 :') 812 call messages_write(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) 813 call messages_new_line() 814#endif 815 816 call messages_info() 817 818 819 POP_SUB(accel_init.device_info) 820 end subroutine device_info 821 822 end subroutine accel_init 823 824 ! ------------------------------------------ 825 826 integer function get_platform_id(platform_name) result(platform_id) 827 character(len=*), intent(in) :: platform_name 828 829 platform_id = CL_PLAT_INVALID 830 if(index(platform_name, 'AMD') > 0) platform_id = CL_PLAT_AMD 831 if(index(platform_name, 'ATI') > 0) platform_id = CL_PLAT_ATI 832 if(index(platform_name, 'NVIDIA') > 0) platform_id = CL_PLAT_NVIDIA 833 if(index(platform_name, 'Intel') > 0) platform_id = CL_PLAT_INTEL 834 end function get_platform_id 835 836 ! ------------------------------------------ 837 838 subroutine accel_end() 839#ifdef HAVE_OPENCL 840 integer :: ierr 841#endif 842 integer(8) :: hits, misses 843 real(8) :: volume_hits, volume_misses 844 logical :: found 845 type(accel_mem_t) :: tmp 846 847 PUSH_SUB(accel_end) 848 849 if(accel_is_enabled()) then 850 851 do 852 call alloc_cache_get(memcache, ALLOC_CACHE_ANY_SIZE, found, tmp%mem) 853 if(.not. found) exit 854 855#ifdef HAVE_OPENCL 856 call clReleaseMemObject(tmp%mem, ierr) 857 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject") 858#endif 859#ifdef HAVE_CUDA 860 call cuda_mem_free(tmp%mem) 861#endif 862 end do 863 864 call alloc_cache_end(memcache, hits, misses, volume_hits, volume_misses) 865 866 call messages_print_stress(stdout, "Acceleration-device allocation cache") 867 868 call messages_new_line() 869 call messages_write(' Number of allocations =') 870 call messages_write(hits + misses, new_line = .true.) 871 call messages_write(' Volume of allocations =') 872 call messages_write(volume_hits + volume_misses, fmt = 'f18.1', units = unit_gigabytes, align_left = .true., & 873 new_line = .true.) 874 call messages_write(' Hit ratio =') 875 call messages_write(hits/TOFLOAT(hits + misses)*100, fmt='(f6.1)', align_left = .true.) 876 call messages_write('%', new_line = .true.) 877 call messages_write(' Volume hit ratio =') 878 call messages_write(volume_hits/(volume_hits + volume_misses)*100, fmt='(f6.1)', align_left = .true.) 879 call messages_write('%') 880 call messages_new_line() 881 call messages_info() 882 883 call messages_print_stress(stdout) 884 end if 885 886 call accel_kernel_global_end() 887 888#ifdef HAVE_CLBLAS 889 call clblasTearDown() 890#endif 891 892#ifdef HAVE_CLFFT 893 call clfftTearDown() 894#endif 895 896 if(accel_is_enabled()) then 897#ifdef HAVE_CUDA 898 call cublas_end(accel%cublas_handle) 899 call cuda_end(accel%context%cuda_context, accel%device%cuda_device) 900#endif 901 902#ifdef HAVE_OPENCL 903 call clReleaseCommandQueue(accel%command_queue, ierr) 904 905 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "ReleaseCommandQueue") 906 call clReleaseContext(accel%context%cl_context, cl_status) 907#endif 908 909 if(buffer_alloc_count /= 0) then 910 call messages_write('Accel:') 911 call messages_write(TOFLOAT(allocated_mem), fmt = 'f12.1', units = unit_megabytes, align_left = .true.) 912 call messages_write(' in ') 913 call messages_write(buffer_alloc_count) 914 call messages_write(' buffers were not deallocated.') 915 call messages_fatal() 916 end if 917 918 end if 919 920 POP_SUB(accel_end) 921 end subroutine accel_end 922 923 ! ------------------------------------------ 924 925 elemental subroutine accel_mem_nullify(this) 926 type(accel_mem_t), intent(out) :: this 927 928 !> To be implemented. 929 this%size = 0 930 this%flags = 0 931 this%allocated = .false. 932 933 end subroutine accel_mem_nullify 934 935 ! ------------------------------------------ 936 937 integer function accel_padded_size(nn) result(psize) 938 integer, intent(in) :: nn 939 940 integer :: modnn, bsize 941 942 psize = nn 943 944 if(accel_is_enabled()) then 945 946 bsize = accel_max_workgroup_size() 947 948 psize = nn 949 modnn = mod(nn, bsize) 950 if(modnn /= 0) psize = psize + bsize - modnn 951 952 end if 953 954 end function accel_padded_size 955 956 ! ------------------------------------------ 957 958 subroutine accel_create_buffer_4(this, flags, type, size) 959 type(accel_mem_t), intent(inout) :: this 960 integer, intent(in) :: flags 961 type(type_t), intent(in) :: type 962 integer, intent(in) :: size 963 964 call accel_create_buffer_8(this, flags, type, int(size, 8)) 965 end subroutine accel_create_buffer_4 966 967 ! ------------------------------------------ 968 969 subroutine accel_create_buffer_8(this, flags, type, size) 970 type(accel_mem_t), intent(inout) :: this 971 integer, intent(in) :: flags 972 type(type_t), intent(in) :: type 973 integer(8), intent(in) :: size 974 975 integer(8) :: fsize 976 logical :: found 977#ifdef HAVE_OPENCL 978 integer :: ierr 979#endif 980 981 PUSH_SUB(accel_create_buffer_8) 982 983 this%type = type 984 this%size = size 985 this%flags = flags 986 fsize = int(size, 8)*types_get_size(type) 987 this%allocated = .true. 988 989 if(fsize > 0) then 990 991 call alloc_cache_get(memcache, fsize, found, this%mem) 992 993 if(.not. found) then 994#ifdef HAVE_OPENCL 995 this%mem = clCreateBuffer(accel%context%cl_context, flags, fsize, ierr) 996 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateBuffer") 997#endif 998#ifdef HAVE_CUDA 999 call cuda_mem_alloc(this%mem, fsize) 1000#endif 1001 end if 1002 1003 INCR(buffer_alloc_count, 1) 1004 INCR(allocated_mem, fsize) 1005 1006 end if 1007 1008 POP_SUB(accel_create_buffer_8) 1009 end subroutine accel_create_buffer_8 1010 1011 ! ------------------------------------------ 1012 1013 subroutine accel_release_buffer(this) 1014 type(accel_mem_t), intent(inout) :: this 1015 1016#ifdef HAVE_OPENCL 1017 integer :: ierr 1018#endif 1019 logical :: put 1020 integer(8) :: fsize 1021 1022 PUSH_SUB(accel_release_buffer) 1023 1024 if(this%size > 0) then 1025 1026 fsize = int(this%size, 8)*types_get_size(this%type) 1027 1028 call alloc_cache_put(memcache, fsize, this%mem, put) 1029 1030 if(.not. put) then 1031#ifdef HAVE_OPENCL 1032 call clReleaseMemObject(this%mem, ierr) 1033 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseMemObject") 1034#endif 1035#ifdef HAVE_CUDA 1036 call cuda_mem_free(this%mem) 1037#endif 1038 end if 1039 1040 INCR(buffer_alloc_count, -1) 1041 INCR(allocated_mem, fsize) 1042 1043 end if 1044 1045 this%size = 0 1046 this%flags = 0 1047 1048 this%allocated = .false. 1049 1050 POP_SUB(accel_release_buffer) 1051 end subroutine accel_release_buffer 1052 1053 ! ------------------------------------------ 1054 1055 logical pure function accel_buffer_is_allocated(this) result(allocated) 1056 type(accel_mem_t), intent(in) :: this 1057 1058 allocated = this%allocated 1059 end function accel_buffer_is_allocated 1060 1061 ! ------------------------------------------ 1062 1063 integer(SIZEOF_SIZE_T) pure function opencl_get_buffer_size(this) result(size) 1064 type(accel_mem_t), intent(in) :: this 1065 1066 size = this%size 1067 end function opencl_get_buffer_size 1068 1069 ! ----------------------------------------- 1070 1071 type(type_t) pure function opencl_get_buffer_type(this) result(type) 1072 type(accel_mem_t), intent(in) :: this 1073 1074 type = this%type 1075 end function opencl_get_buffer_type 1076 1077 ! ----------------------------------------- 1078 1079 subroutine accel_finish() 1080#ifdef HAVE_OPENCL 1081 integer :: ierr 1082#endif 1083 1084 ! no push_sub, called too frequently 1085 1086 if(accel_is_enabled()) then 1087#ifdef HAVE_OPENCL 1088 call clFinish(accel%command_queue, ierr) 1089 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, 'clFinish') 1090#endif 1091#ifdef HAVE_CUDA 1092 call cuda_context_synchronize() 1093#endif 1094 end if 1095 end subroutine accel_finish 1096 1097 ! ------------------------------------------ 1098 1099 subroutine accel_set_kernel_arg_buffer(kernel, narg, buffer) 1100 type(accel_kernel_t), intent(inout) :: kernel 1101 integer, intent(in) :: narg 1102 type(accel_mem_t), intent(in) :: buffer 1103 1104#ifdef HAVE_OPENCL 1105 integer :: ierr 1106#endif 1107 1108 ASSERT(accel_buffer_is_allocated(buffer)) 1109 1110 ! no push_sub, called too frequently 1111#ifdef HAVE_OPENCL 1112 call clSetKernelArg(kernel%kernel, narg, buffer%mem, ierr) 1113 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clSetKernelArg_buf") 1114#endif 1115 1116#ifdef HAVE_CUDA 1117 call cuda_kernel_set_arg_buffer(kernel%arguments, buffer%mem, narg) 1118#endif 1119 1120 end subroutine accel_set_kernel_arg_buffer 1121 1122 ! ------------------------------------------ 1123 1124 subroutine accel_set_kernel_arg_local(kernel, narg, type, size) 1125 type(accel_kernel_t), intent(inout) :: kernel 1126 integer, intent(in) :: narg 1127 type(type_t), intent(in) :: type 1128 integer, intent(in) :: size 1129 1130#ifdef HAVE_OPENCL 1131 integer :: ierr 1132#endif 1133 integer(8) :: size_in_bytes 1134 1135 PUSH_SUB(accel_set_kernel_arg_local) 1136 1137 1138 size_in_bytes = int(size, 8)*types_get_size(type) 1139 1140 if(size_in_bytes > accel%local_memory_size) then 1141 write(message(1), '(a,f12.6,a)') "CL Error: requested local memory: ", TOFLOAT(size_in_bytes)/1024.0, " Kb" 1142 write(message(2), '(a,f12.6,a)') " available local memory: ", TOFLOAT(accel%local_memory_size)/1024.0, " Kb" 1143 call messages_fatal(2) 1144 else if(size_in_bytes <= 0) then 1145 write(message(1), '(a,i10)') "CL Error: invalid local memory size: ", size_in_bytes 1146 call messages_fatal(1) 1147 end if 1148 1149#ifdef HAVE_CUDA 1150 kernel%cuda_shared_mem = size_in_bytes 1151#endif 1152 1153#ifdef HAVE_OPENCL 1154 call clSetKernelArgLocal(kernel%kernel, narg, size_in_bytes, ierr) 1155 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "set_kernel_arg_local") 1156#endif 1157 1158 POP_SUB(accel_set_kernel_arg_local) 1159 end subroutine accel_set_kernel_arg_local 1160 1161 ! ------------------------------------------ 1162 1163 subroutine accel_kernel_run(kernel, globalsizes, localsizes) 1164 type(accel_kernel_t), intent(inout) :: kernel 1165 integer, intent(in) :: globalsizes(:) 1166 integer, intent(in) :: localsizes(:) 1167 1168 integer :: dim 1169#ifdef HAVE_OPENCL 1170 integer :: ierr 1171#endif 1172 integer(8) :: gsizes(1:3) 1173 integer(8) :: lsizes(1:3) 1174 1175 ! no push_sub, called too frequently 1176 1177 ! cuda needs all dimensions 1178 gsizes = 1 1179 lsizes = 1 1180 1181 dim = ubound(globalsizes, dim = 1) 1182 1183 ASSERT(dim == ubound(localsizes, dim = 1)) 1184 1185 ! if one size is zero, there is nothing to do 1186 if(any(globalsizes == 0)) return 1187 1188 ASSERT(all(localsizes > 0)) 1189 ASSERT(all(localsizes <= accel_max_workgroup_size())) 1190 ASSERT(all(mod(globalsizes, localsizes) == 0)) 1191 1192 gsizes(1:dim) = int(globalsizes(1:dim), 8) 1193 lsizes(1:dim) = int(localsizes(1:dim), 8) 1194 1195#ifdef HAVE_OPENCL 1196 call clEnqueueNDRangeKernel(accel%command_queue, kernel%kernel, gsizes(1:dim), lsizes(1:dim), ierr) 1197 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel") 1198#endif 1199 1200#ifdef HAVE_CUDA 1201 gsizes(1:3) = gsizes(1:3)/lsizes(1:3) 1202 1203 ASSERT(gsizes(1) < 2_8**31 - 1_8) 1204 ASSERT(all(gsizes(2:3) <= 65535_8)) 1205 1206 call cuda_launch_kernel(kernel%cuda_kernel, gsizes(1), lsizes(1), kernel%cuda_shared_mem, kernel%arguments) 1207 1208 kernel%cuda_shared_mem = 0 1209#endif 1210 1211 end subroutine accel_kernel_run 1212 1213 ! ----------------------------------------------- 1214 1215 integer pure function accel_max_workgroup_size() result(max_workgroup_size) 1216 max_workgroup_size = accel%max_workgroup_size 1217 end function accel_max_workgroup_size 1218 1219 ! ----------------------------------------------- 1220 1221 integer function accel_kernel_workgroup_size(kernel) result(workgroup_size) 1222 type(accel_kernel_t), intent(inout) :: kernel 1223 1224#ifdef HAVE_OPENCL 1225 integer(8) :: workgroup_size8 1226 integer :: ierr 1227#endif 1228 1229 workgroup_size = 0 1230 1231#ifdef HAVE_OPENCL 1232 call clGetKernelWorkGroupInfo(kernel%kernel, accel%device%cl_device, CL_KERNEL_WORK_GROUP_SIZE, workgroup_size8, ierr) 1233 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "EnqueueNDRangeKernel") 1234 workgroup_size = workgroup_size8 1235#endif 1236 1237#ifdef HAVE_CUDA 1238 workgroup_size = accel%max_workgroup_size 1239#endif 1240 1241 end function accel_kernel_workgroup_size 1242 1243 ! ----------------------------------------------- 1244 1245#ifdef HAVE_OPENCL 1246 subroutine opencl_build_program(prog, filename, flags) 1247 type(cl_program), intent(inout) :: prog 1248 character(len=*), intent(in) :: filename 1249 character(len=*), optional, intent(in) :: flags 1250 1251 character(len = 1000) :: string 1252 character(len = 256) :: share_string 1253 integer :: ierr, ierrlog, iunit, irec, newlen 1254 1255 PUSH_SUB(opencl_build_program) 1256 1257 string = '#include "'//trim(filename)//'"' 1258 1259 if(debug%info) then 1260 call messages_write("Building CL program '"//trim(filename)//"'.") 1261 call messages_info() 1262 end if 1263 1264 prog = clCreateProgramWithSource(accel%context%cl_context, trim(string), ierr) 1265 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateProgramWithSource") 1266 1267 ! build the compilation flags 1268 string='-w' 1269 ! full optimization 1270 string=trim(string)//' -cl-denorms-are-zero' 1271 ! The following flag gives an error with the Xeon Phi 1272 ! string=trim(string)//' -cl-strict-aliasing' 1273 string=trim(string)//' -cl-mad-enable' 1274 string=trim(string)//' -cl-unsafe-math-optimizations' 1275 string=trim(string)//' -cl-finite-math-only' 1276 string=trim(string)//' -cl-fast-relaxed-math' 1277 1278 share_string='-I'//trim(conf%share)//'/opencl/' 1279 1280 if (f90_cl_device_has_extension(accel%device%cl_device, "cl_khr_fp64")) then 1281 string = trim(string)//' -DEXT_KHR_FP64' 1282 else if(f90_cl_device_has_extension(accel%device%cl_device, "cl_amd_fp64")) then 1283 string = trim(string)//' -DEXT_AMD_FP64' 1284 else 1285 call messages_write('Octopus requires an OpenCL device with double-precision support.') 1286 call messages_fatal() 1287 end if 1288 1289 if(accel_use_shared_mem()) then 1290 string = trim(string)//' -DSHARED_MEM' 1291 end if 1292 1293 if(present(flags)) then 1294 string = trim(string)//' '//trim(flags) 1295 end if 1296 1297 if(debug%info) then 1298 call messages_write("Debug info: compilation flags '"//trim(string), new_line = .true.) 1299 call messages_write(' '//trim(share_string)//"'.") 1300 call messages_info() 1301 end if 1302 1303 string = trim(string)//' '//trim(share_string) 1304 1305 call clBuildProgram(prog, trim(string), ierr) 1306 1307 call clGetProgramBuildInfo(prog, accel%device%cl_device, CL_PROGRAM_BUILD_LOG, string, ierrlog) 1308 if(ierrlog /= CL_SUCCESS) call opencl_print_error(ierrlog, "clGetProgramBuildInfo") 1309 1310 ! CL_PROGRAM_BUILD_LOG seems to have a useless '\n' in it 1311 newlen = scan(string, achar(010), back = .true.) - 1 1312 if(newlen >= 0) string = string(1:newlen) 1313 1314 if(len(trim(string)) > 0) write(stderr, '(a)') trim(string) 1315 1316 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clBuildProgram") 1317 1318 POP_SUB(opencl_build_program) 1319 end subroutine opencl_build_program 1320#endif 1321 1322 ! ----------------------------------------------- 1323#ifdef HAVE_OPENCL 1324 subroutine opencl_release_program(prog) 1325 type(cl_program), intent(inout) :: prog 1326 1327 integer :: ierr 1328 1329 PUSH_SUB(opencl_release_program) 1330 1331 call clReleaseProgram(prog, ierr) 1332 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseProgram") 1333 1334 POP_SUB(opencl_release_program) 1335 end subroutine opencl_release_program 1336#endif 1337 1338 ! ----------------------------------------------- 1339 1340#ifdef HAVE_OPENCL 1341 subroutine opencl_release_kernel(prog) 1342 type(cl_kernel), intent(inout) :: prog 1343 1344 integer :: ierr 1345 1346 PUSH_SUB(opencl_release_kernel) 1347 1348#ifdef HAVE_OPENCL 1349 call clReleaseKernel(prog, ierr) 1350 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clReleaseKernel") 1351#endif 1352 1353 POP_SUB(opencl_release_kernel) 1354 end subroutine opencl_release_kernel 1355#endif 1356 1357#ifdef HAVE_OPENCL 1358 ! ----------------------------------------------- 1359 subroutine opencl_create_kernel(kernel, prog, name) 1360 type(cl_kernel), intent(inout) :: kernel 1361 type(cl_program), intent(inout) :: prog 1362 character(len=*), intent(in) :: name 1363 1364 integer :: ierr 1365 type(profile_t), save :: prof 1366 1367 PUSH_SUB(opencl_create_kernel) 1368 call profiling_in(prof, "CL_BUILD_KERNEL", exclude = .true.) 1369 1370#ifdef HAVE_OPENCL 1371 kernel = clCreateKernel(prog, name, ierr) 1372 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "clCreateKernel") 1373#endif 1374 1375 call profiling_out(prof) 1376 POP_SUB(opencl_create_kernel) 1377 end subroutine opencl_create_kernel 1378#endif 1379 1380 ! ------------------------------------------------ 1381 1382 subroutine opencl_print_error(ierr, name) 1383 integer, intent(in) :: ierr 1384 character(len=*), intent(in) :: name 1385 1386 character(len=40) :: errcode 1387 1388 PUSH_SUB(opencl_print_error) 1389 1390#ifdef HAVE_OPENCL 1391 select case(ierr) 1392 case(CL_SUCCESS); errcode = 'CL_SUCCESS ' 1393 case(CL_DEVICE_NOT_FOUND); errcode = 'CL_DEVICE_NOT_FOUND ' 1394 case(CL_DEVICE_NOT_AVAILABLE); errcode = 'CL_DEVICE_NOT_AVAILABLE ' 1395 case(CL_COMPILER_NOT_AVAILABLE); errcode = 'CL_COMPILER_NOT_AVAILABLE ' 1396 case(CL_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CL_MEM_OBJECT_ALLOCATION_FAILURE ' 1397 case(CL_OUT_OF_RESOURCES); errcode = 'CL_OUT_OF_RESOURCES ' 1398 case(CL_OUT_OF_HOST_MEMORY); errcode = 'CL_OUT_OF_HOST_MEMORY ' 1399 case(CL_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CL_PROFILING_INFO_NOT_AVAILABLE ' 1400 case(CL_MEM_COPY_OVERLAP); errcode = 'CL_MEM_COPY_OVERLAP ' 1401 case(CL_IMAGE_FORMAT_MISMATCH); errcode = 'CL_IMAGE_FORMAT_MISMATCH ' 1402 case(CL_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CL_IMAGE_FORMAT_NOT_SUPPORTED ' 1403 case(CL_BUILD_PROGRAM_FAILURE); errcode = 'CL_BUILD_PROGRAM_FAILURE ' 1404 case(CL_MAP_FAILURE); errcode = 'CL_MAP_FAILURE ' 1405 case(CL_INVALID_VALUE); errcode = 'CL_INVALID_VALUE ' 1406 case(CL_INVALID_DEVICE_TYPE); errcode = 'CL_INVALID_DEVICE_TYPE ' 1407 case(CL_INVALID_PLATFORM); errcode = 'CL_INVALID_PLATFORM ' 1408 case(CL_INVALID_DEVICE); errcode = 'CL_INVALID_DEVICE ' 1409 case(CL_INVALID_CONTEXT); errcode = 'CL_INVALID_CONTEXT ' 1410 case(CL_INVALID_QUEUE_PROPERTIES); errcode = 'CL_INVALID_QUEUE_PROPERTIES ' 1411 case(CL_INVALID_COMMAND_QUEUE); errcode = 'CL_INVALID_COMMAND_QUEUE ' 1412 case(CL_INVALID_HOST_PTR); errcode = 'CL_INVALID_HOST_PTR ' 1413 case(CL_INVALID_MEM_OBJECT); errcode = 'CL_INVALID_MEM_OBJECT ' 1414 case(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CL_INVALID_IMAGE_FORMAT_DESCRIPTOR ' 1415 case(CL_INVALID_IMAGE_SIZE); errcode = 'CL_INVALID_IMAGE_SIZE ' 1416 case(CL_INVALID_SAMPLER); errcode = 'CL_INVALID_SAMPLER ' 1417 case(CL_INVALID_BINARY); errcode = 'CL_INVALID_BINARY ' 1418 case(CL_INVALID_BUILD_OPTIONS); errcode = 'CL_INVALID_BUILD_OPTIONS ' 1419 case(CL_INVALID_PROGRAM); errcode = 'CL_INVALID_PROGRAM ' 1420 case(CL_INVALID_PROGRAM_EXECUTABLE); errcode = 'CL_INVALID_PROGRAM_EXECUTABLE ' 1421 case(CL_INVALID_KERNEL_NAME); errcode = 'CL_INVALID_KERNEL_NAME ' 1422 case(CL_INVALID_KERNEL_DEFINITION); errcode = 'CL_INVALID_KERNEL_DEFINITION ' 1423 case(CL_INVALID_KERNEL); errcode = 'CL_INVALID_KERNEL ' 1424 case(CL_INVALID_ARG_INDEX); errcode = 'CL_INVALID_ARG_INDEX ' 1425 case(CL_INVALID_ARG_VALUE); errcode = 'CL_INVALID_ARG_VALUE ' 1426 case(CL_INVALID_ARG_SIZE); errcode = 'CL_INVALID_ARG_SIZE ' 1427 case(CL_INVALID_KERNEL_ARGS); errcode = 'CL_INVALID_KERNEL_ARGS ' 1428 case(CL_INVALID_WORK_DIMENSION); errcode = 'CL_INVALID_WORK_DIMENSION ' 1429 case(CL_INVALID_WORK_GROUP_SIZE); errcode = 'CL_INVALID_WORK_GROUP_SIZE ' 1430 case(CL_INVALID_WORK_ITEM_SIZE); errcode = 'CL_INVALID_WORK_ITEM_SIZE ' 1431 case(CL_INVALID_GLOBAL_OFFSET); errcode = 'CL_INVALID_GLOBAL_OFFSET ' 1432 case(CL_INVALID_EVENT_WAIT_LIST); errcode = 'CL_INVALID_EVENT_WAIT_LIST ' 1433 case(CL_INVALID_EVENT); errcode = 'CL_INVALID_EVENT ' 1434 case(CL_INVALID_OPERATION); errcode = 'CL_INVALID_OPERATION ' 1435 case(CL_INVALID_GL_OBJECT); errcode = 'CL_INVALID_GL_OBJECT ' 1436 case(CL_INVALID_BUFFER_SIZE); errcode = 'CL_INVALID_BUFFER_SIZE ' 1437 case(CL_INVALID_MIP_LEVEL); errcode = 'CL_INVALID_MIP_LEVEL ' 1438 case(CL_INVALID_GLOBAL_WORK_SIZE); errcode = 'CL_INVALID_GLOBAL_WORK_SIZE ' 1439 case(CL_PLATFORM_NOT_FOUND_KHR); errcode = 'CL_PLATFORM_NOT_FOUND_KHR' 1440 case default 1441 write(errcode, '(i10)') ierr 1442 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')' 1443 end select 1444#endif 1445 1446 message(1) = 'OpenCL '//trim(name)//' '//trim(errcode) 1447 call messages_fatal(1) 1448 1449 POP_SUB(opencl_print_error) 1450 end subroutine opencl_print_error 1451 1452 ! ---------------------------------------------------- 1453 1454 subroutine clblas_print_error(ierr, name) 1455 integer, intent(in) :: ierr 1456 character(len=*), intent(in) :: name 1457 1458 character(len=40) :: errcode 1459 1460 PUSH_SUB(clblas_print_error) 1461#ifdef HAVE_CLBLAS 1462 select case(ierr) 1463 case(clblasSuccess); errcode = 'clblasSuccess' 1464 case(clblasInvalidValue); errcode = 'clblasInvalidValue' 1465 case(clblasInvalidCommandQueue); errcode = 'clblasInvalidCommandQueue' 1466 case(clblasInvalidContext); errcode = 'clblasInvalidContext' 1467 case(clblasInvalidMemObject); errcode = 'clblasInvalidMemObject' 1468 case(clblasInvalidDevice); errcode = 'clblasInvalidDevice' 1469 case(clblasInvalidEventWaitList); errcode = 'clblasInvalidEventWaitList' 1470 case(clblasOutOfResources); errcode = 'clblasOutOfResources' 1471 case(clblasOutOfHostMemory); errcode = 'clblasOutOfHostMemory' 1472 case(clblasInvalidOperation); errcode = 'clblasInvalidOperation' 1473 case(clblasCompilerNotAvailable); errcode = 'clblasCompilerNotAvailable' 1474 case(clblasBuildProgramFailure ); errcode = 'clblasBuildProgramFailure' 1475 case(clblasNotImplemented); errcode = 'clblasNotImplemented' 1476 case(clblasNotInitialized); errcode = 'clblasNotInitialized' 1477 case(clblasInvalidMatA); errcode = 'clblasInvalidMatA' 1478 case(clblasInvalidMatB); errcode = 'clblasInvalidMatB' 1479 case(clblasInvalidMatC); errcode = 'clblasInvalidMatC' 1480 case(clblasInvalidVecX); errcode = 'clblasInvalidVecX' 1481 case(clblasInvalidVecY); errcode = 'clblasInvalidVecY' 1482 case(clblasInvalidDim); errcode = 'clblasInvalidDim' 1483 case(clblasInvalidLeadDimA); errcode = 'clblasInvalidLeadDimA' 1484 case(clblasInvalidLeadDimB); errcode = 'clblasInvalidLeadDimB' 1485 case(clblasInvalidLeadDimC); errcode = 'clblasInvalidLeadDimC' 1486 case(clblasInvalidIncX); errcode = 'clblasInvalidIncX' 1487 case(clblasInvalidIncY); errcode = 'clblasInvalidIncY' 1488 case(clblasInsufficientMemMatA); errcode = 'clblasInsufficientMemMatA' 1489 case(clblasInsufficientMemMatB); errcode = 'clblasInsufficientMemMatB' 1490 case(clblasInsufficientMemMatC); errcode = 'clblasInsufficientMemMatC' 1491 case(clblasInsufficientMemVecX); errcode = 'clblasInsufficientMemVecX' 1492 case(clblasInsufficientMemVecY); errcode = 'clblasInsufficientMemVecY' 1493 case default 1494 write(errcode, '(i10)') ierr 1495 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')' 1496 end select 1497#endif 1498 1499 message(1) = 'clblas '//trim(name)//' '//trim(errcode) 1500 call messages_fatal(1) 1501 1502 POP_SUB(clblas_print_error) 1503 end subroutine clblas_print_error 1504 1505 ! ---------------------------------------------------- 1506 subroutine clfft_print_error(ierr, name) 1507 integer, intent(in) :: ierr 1508 character(len=*), intent(in) :: name 1509 1510 character(len=40) :: errcode 1511 1512 PUSH_SUB(clfft_print_error) 1513#ifdef HAVE_CLFFT 1514 select case(ierr) 1515 case(CLFFT_INVALID_GLOBAL_WORK_SIZE); errcode = 'CLFFT_INVALID_GLOBAL_WORK_SIZE' 1516 case(CLFFT_INVALID_MIP_LEVEL); errcode = 'CLFFT_INVALID_MIP_LEVEL' 1517 case(CLFFT_INVALID_BUFFER_SIZE); errcode = 'CLFFT_INVALID_BUFFER_SIZE' 1518 case(CLFFT_INVALID_GL_OBJECT); errcode = 'CLFFT_INVALID_GL_OBJECT' 1519 case(CLFFT_INVALID_OPERATION); errcode = 'CLFFT_INVALID_OPERATION' 1520 case(CLFFT_INVALID_EVENT); errcode = 'CLFFT_INVALID_EVENT' 1521 case(CLFFT_INVALID_EVENT_WAIT_LIST); errcode = 'CLFFT_INVALID_EVENT_WAIT_LIST' 1522 case(CLFFT_INVALID_GLOBAL_OFFSET); errcode = 'CLFFT_INVALID_GLOBAL_OFFSET' 1523 case(CLFFT_INVALID_WORK_ITEM_SIZE); errcode = 'CLFFT_INVALID_WORK_ITEM_SIZE' 1524 case(CLFFT_INVALID_WORK_GROUP_SIZE); errcode = 'CLFFT_INVALID_WORK_GROUP_SIZE' 1525 case(CLFFT_INVALID_WORK_DIMENSION); errcode = 'CLFFT_INVALID_WORK_DIMENSION' 1526 case(CLFFT_INVALID_KERNEL_ARGS); errcode = 'CLFFT_INVALID_KERNEL_ARGS' 1527 case(CLFFT_INVALID_ARG_SIZE); errcode = 'CLFFT_INVALID_ARG_SIZE' 1528 case(CLFFT_INVALID_ARG_VALUE); errcode = 'CLFFT_INVALID_ARG_VALUE' 1529 case(CLFFT_INVALID_ARG_INDEX); errcode = 'CLFFT_INVALID_ARG_INDEX' 1530 case(CLFFT_INVALID_KERNEL); errcode = 'CLFFT_INVALID_KERNEL' 1531 case(CLFFT_INVALID_KERNEL_DEFINITION); errcode = 'CLFFT_INVALID_KERNEL_DEFINITION' 1532 case(CLFFT_INVALID_KERNEL_NAME); errcode = 'CLFFT_INVALID_KERNEL_NAME' 1533 case(CLFFT_INVALID_PROGRAM_EXECUTABLE); errcode = 'CLFFT_INVALID_PROGRAM_EXECUTABLE' 1534 case(CLFFT_INVALID_PROGRAM); errcode = 'CLFFT_INVALID_PROGRAM' 1535 case(CLFFT_INVALID_BUILD_OPTIONS); errcode = 'CLFFT_INVALID_BUILD_OPTIONS' 1536 case(CLFFT_INVALID_BINARY); errcode = 'CLFFT_INVALID_BINARY' 1537 case(CLFFT_INVALID_SAMPLER); errcode = 'CLFFT_INVALID_SAMPLER' 1538 case(CLFFT_INVALID_IMAGE_SIZE); errcode = 'CLFFT_INVALID_IMAGE_SIZE' 1539 case(CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR); errcode = 'CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR' 1540 case(CLFFT_INVALID_MEM_OBJECT); errcode = 'CLFFT_INVALID_MEM_OBJECT' 1541 case(CLFFT_INVALID_HOST_PTR); errcode = 'CLFFT_INVALID_HOST_PTR' 1542 case(CLFFT_INVALID_COMMAND_QUEUE); errcode = 'CLFFT_INVALID_COMMAND_QUEUE' 1543 case(CLFFT_INVALID_QUEUE_PROPERTIES); errcode = 'CLFFT_INVALID_QUEUE_PROPERTIES' 1544 case(CLFFT_INVALID_CONTEXT); errcode = 'CLFFT_INVALID_CONTEXT' 1545 case(CLFFT_INVALID_DEVICE); errcode = 'CLFFT_INVALID_DEVICE' 1546 case(CLFFT_INVALID_PLATFORM); errcode = 'CLFFT_INVALID_PLATFORM' 1547 case(CLFFT_INVALID_DEVICE_TYPE); errcode = 'CLFFT_INVALID_DEVICE_TYPE' 1548 case(CLFFT_INVALID_VALUE); errcode = 'CLFFT_INVALID_VALUE' 1549 case(CLFFT_MAP_FAILURE); errcode = 'CLFFT_MAP_FAILURE' 1550 case(CLFFT_BUILD_PROGRAM_FAILURE); errcode = 'CLFFT_BUILD_PROGRAM_FAILURE' 1551 case(CLFFT_IMAGE_FORMAT_NOT_SUPPORTED); errcode = 'CLFFT_IMAGE_FORMAT_NOT_SUPPORTED' 1552 case(CLFFT_IMAGE_FORMAT_MISMATCH); errcode = 'CLFFT_IMAGE_FORMAT_MISMATCH' 1553 case(CLFFT_MEM_COPY_OVERLAP); errcode = 'CLFFT_MEM_COPY_OVERLAP' 1554 case(CLFFT_PROFILING_INFO_NOT_AVAILABLE); errcode = 'CLFFT_PROFILING_INFO_NOT_AVAILABLE' 1555 case(CLFFT_OUT_OF_HOST_MEMORY); errcode = 'CLFFT_OUT_OF_HOST_MEMORY' 1556 case(CLFFT_OUT_OF_RESOURCES); errcode = 'CLFFT_OUT_OF_RESOURCES' 1557 case(CLFFT_MEM_OBJECT_ALLOCATION_FAILURE); errcode = 'CLFFT_MEM_OBJECT_ALLOCATION_FAILURE' 1558 case(CLFFT_COMPILER_NOT_AVAILABLE); errcode = 'CLFFT_COMPILER_NOT_AVAILABLE' 1559 case(CLFFT_DEVICE_NOT_AVAILABLE); errcode = 'CLFFT_DEVICE_NOT_AVAILABLE' 1560 case(CLFFT_DEVICE_NOT_FOUND); errcode = 'CLFFT_DEVICE_NOT_FOUND' 1561 case(CLFFT_SUCCESS); errcode = 'CLFFT_SUCCESS' 1562 case(CLFFT_BUGCHECK); errcode = 'CLFFT_BUGCHECK' 1563 case(CLFFT_NOTIMPLEMENTED); errcode = 'CLFFT_NOTIMPLEMENTED' 1564 case(CLFFT_FILE_NOT_FOUND); errcode = 'CLFFT_FILE_NOT_FOUND' 1565 case(CLFFT_FILE_CREATE_FAILURE); errcode = 'CLFFT_FILE_CREATE_FAILURE' 1566 case(CLFFT_VERSION_MISMATCH); errcode = 'CLFFT_VERSION_MISMATCH' 1567 case(CLFFT_INVALID_PLAN); errcode = 'CLFFT_INVALID_PLAN' 1568 case(CLFFT_DEVICE_NO_DOUBLE); errcode = 'CLFFT_DEVICE_NO_DOUBLE' 1569 case(CLFFT_ENDSTATUS); errcode = 'CLFFT_ENDSTATUS' 1570 case default 1571 write(errcode, '(i10)') ierr 1572 errcode = 'UNKNOWN ERROR CODE ('//trim(adjustl(errcode))//')' 1573 end select 1574#endif 1575 1576 message(1) = 'clfft '//trim(name)//' '//trim(errcode) 1577 call messages_fatal(1) 1578 1579 POP_SUB(clfft_print_error) 1580 end subroutine clfft_print_error 1581 1582 ! ---------------------------------------------------- 1583 1584#ifdef HAVE_OPENCL 1585 logical function f90_cl_device_has_extension(device, extension) result(has) 1586 type(cl_device_id), intent(inout) :: device 1587 character(len=*), intent(in) :: extension 1588 1589 integer :: cl_status 1590 character(len=2048) :: all_extensions 1591 1592#ifdef HAVE_OPENCL 1593 call clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, all_extensions, cl_status) 1594#endif 1595 1596 has = index(all_extensions, extension) /= 0 1597 1598 end function f90_cl_device_has_extension 1599#endif 1600 1601 ! --------------------------------------------------------- 1602 1603 integer pure function opencl_pad(size, blk) result(pad) 1604 integer, intent(in) :: size 1605 integer, intent(in) :: blk 1606 1607 integer :: mm 1608 1609 mm = mod(size, blk) 1610 if(mm == 0) then 1611 pad = size 1612 else 1613 pad = size + blk - mm 1614 end if 1615 end function opencl_pad 1616 1617 ! ---------------------------------------------------- 1618 1619 subroutine accel_set_buffer_to_zero(buffer, type, nval, offset) 1620 type(accel_mem_t), intent(inout) :: buffer 1621 type(type_t), intent(in) :: type 1622 integer, intent(in) :: nval 1623 integer, optional, intent(in) :: offset 1624 1625 integer :: nval_real, bsize, offset_real 1626 1627 PUSH_SUB(accel_set_buffer_to_zero) 1628 1629 ASSERT(type == TYPE_CMPLX .or. type == TYPE_FLOAT) 1630 1631 if(nval > 0) then 1632 1633 nval_real = nval*(types_get_size(type)/8) 1634 offset_real = optional_default(offset, 0)*(types_get_size(type)/8) 1635 1636 ASSERT(nval_real > 0) 1637 1638 call accel_set_kernel_arg(set_zero, 0, nval_real) 1639 call accel_set_kernel_arg(set_zero, 1, offset_real) 1640 call accel_set_kernel_arg(set_zero, 2, buffer) 1641 1642 bsize = accel_kernel_workgroup_size(set_zero) 1643 1644 1645 call accel_kernel_run(set_zero, (/ opencl_pad(nval_real, bsize) /), (/ bsize /)) 1646 call accel_finish() 1647 1648 end if 1649 1650 POP_SUB(accel_set_buffer_to_zero) 1651 end subroutine accel_set_buffer_to_zero 1652 1653 ! ---------------------------------------------------- 1654 1655 subroutine opencl_check_bandwidth() 1656 integer :: itime 1657 integer, parameter :: times = 10 1658 integer :: size 1659 FLOAT :: time, stime 1660 FLOAT :: read_bw, write_bw 1661 type(accel_mem_t) :: buff 1662 FLOAT, allocatable :: data(:) 1663 1664 call messages_new_line() 1665 call messages_write('Info: Benchmarking the bandwidth between main memory and device memory') 1666 call messages_new_line() 1667 call messages_info() 1668 1669 call messages_write(' Buffer size Read bw Write bw') 1670 call messages_new_line() 1671 call messages_write(' [MiB] [MiB/s] [MiB/s]') 1672 call messages_info() 1673 1674 size = 15000 1675 do 1676 SAFE_ALLOCATE(data(1:size)) 1677 call accel_create_buffer(buff, ACCEL_MEM_READ_WRITE, TYPE_FLOAT, size) 1678 1679 stime = loct_clock() 1680 do itime = 1, times 1681 call accel_write_buffer(buff, size, data) 1682 call accel_finish() 1683 end do 1684 time = (loct_clock() - stime)/TOFLOAT(times) 1685 1686 write_bw = TOFLOAT(size)*CNST(8.0)/time 1687 1688 stime = loct_clock() 1689 do itime = 1, times 1690 call accel_read_buffer(buff, size, data) 1691 end do 1692 call accel_finish() 1693 1694 time = (loct_clock() - stime)/TOFLOAT(times) 1695 read_bw = TOFLOAT(size)*CNST(8.0)/time 1696 1697 call messages_write(size*CNST(8.0)/CNST(1024.0)**2) 1698 call messages_write(write_bw/CNST(1024.0)**2, fmt = '(f10.1)') 1699 call messages_write(read_bw/CNST(1024.0)**2, fmt = '(f10.1)') 1700 call messages_info() 1701 1702 call accel_release_buffer(buff) 1703 1704 SAFE_DEALLOCATE_A(data) 1705 1706 size = int(size*2.0) 1707 1708 if(size > 50000000) exit 1709 end do 1710 end subroutine opencl_check_bandwidth 1711 1712 ! ---------------------------------------------------- 1713 1714 logical pure function accel_use_shared_mem() result(use_shared_mem) 1715 1716 use_shared_mem = accel%shared_mem 1717 1718 end function accel_use_shared_mem 1719 1720 !------------------------------------------------------------ 1721 1722 subroutine accel_kernel_global_init() 1723 1724 PUSH_SUB(accel_kernel_global_init) 1725 1726 nullify(head) 1727 1728 call cuda_module_map_init(accel%module_map) 1729 1730 POP_SUB(accel_kernel_global_init) 1731 end subroutine accel_kernel_global_init 1732 1733 !------------------------------------------------------------ 1734 1735 subroutine accel_kernel_global_end() 1736 type(accel_kernel_t), pointer :: next_head 1737 1738 PUSH_SUB(accel_kernel_global_end) 1739 1740 do 1741 if(.not. associated(head)) exit 1742 next_head => head%next 1743 call accel_kernel_end(head) 1744 head => next_head 1745 end do 1746 1747 if(accel_is_enabled()) then 1748 call cuda_module_map_end(accel%module_map) 1749 end if 1750 1751 POP_SUB(accel_kernel_global_end) 1752 end subroutine accel_kernel_global_end 1753 1754 !------------------------------------------------------------ 1755 1756 subroutine accel_kernel_build(this, file_name, kernel_name, flags) 1757 type(accel_kernel_t), intent(inout) :: this 1758 character(len=*), intent(in) :: file_name 1759 character(len=*), intent(in) :: kernel_name 1760 character(len=*), optional, intent(in) :: flags 1761 1762 type(profile_t), save :: prof 1763#ifdef HAVE_OPENCL 1764 type(cl_program) :: prog 1765#endif 1766#ifdef HAVE_CUDA 1767 character(len=1000) :: all_flags 1768 type(c_ptr) :: cuda_module 1769#endif 1770 1771 PUSH_SUB(accel_kernel_build) 1772 1773 call profiling_in(prof, "ACCEL_COMPILE", exclude = .true.) 1774 1775#ifdef HAVE_CUDA 1776 all_flags = '-I'//trim(conf%share)//'/opencl/' 1777 1778 if(accel_use_shared_mem()) then 1779 all_flags = trim(all_flags)//' -DSHARED_MEM' 1780 end if 1781 1782 if(present(flags)) then 1783 all_flags = trim(all_flags)//' '//trim(flags) 1784 end if 1785 1786 call cuda_build_program(accel%module_map, this%cuda_module, accel%device%cuda_device, trim(file_name), trim(all_flags)) 1787 1788 call cuda_create_kernel(this%cuda_kernel, this%cuda_module, trim(kernel_name)) 1789 call cuda_alloc_arg_array(this%arguments) 1790 1791 this%cuda_shared_mem = 0 1792#endif 1793 1794#ifdef HAVE_OPENCL 1795 call opencl_build_program(prog, trim(conf%share)//'/opencl/'//trim(file_name), flags = flags) 1796 call opencl_create_kernel(this%kernel, prog, trim(kernel_name)) 1797 call opencl_release_program(prog) 1798#endif 1799 1800 this%initialized = .true. 1801 1802 call profiling_out(prof) 1803 1804 POP_SUB(accel_kernel_build) 1805 end subroutine accel_kernel_build 1806 1807 !------------------------------------------------------------ 1808 1809 subroutine accel_kernel_end(this) 1810 type(accel_kernel_t), intent(inout) :: this 1811#ifdef HAVE_OPENCL 1812 integer :: ierr 1813#endif 1814 1815 PUSH_SUB(accel_kernel_end) 1816 1817#ifdef HAVE_CUDA 1818 call cuda_free_arg_array(this%arguments) 1819 call cuda_release_kernel(this%cuda_kernel) 1820 ! modules are not released here, since they are not associated to a kernel 1821#endif 1822 1823#ifdef HAVE_OPENCL 1824 call clReleaseKernel(this%kernel, ierr) 1825 if(ierr /= CL_SUCCESS) call opencl_print_error(ierr, "release_kernel") 1826#endif 1827 this%initialized = .false. 1828 1829 POP_SUB(accel_kernel_end) 1830 end subroutine accel_kernel_end 1831 1832 !------------------------------------------------------------ 1833 1834 subroutine accel_kernel_start_call(this, file_name, kernel_name, flags) 1835 type(accel_kernel_t), target, intent(inout) :: this 1836 character(len=*), intent(in) :: file_name 1837 character(len=*), intent(in) :: kernel_name 1838 character(len=*), optional, intent(in) :: flags 1839 1840 PUSH_SUB(accel_kernel_start_call) 1841 1842 if(.not. this%initialized) then 1843 call accel_kernel_build(this, file_name, kernel_name, flags) 1844 this%next => head 1845 head => this 1846 end if 1847 1848 POP_SUB(accel_kernel_start_call) 1849 end subroutine accel_kernel_start_call 1850 1851 !-------------------------------------------------------------- 1852 1853 integer(8) pure function accel_global_memory_size() result(size) 1854 1855 size = accel%global_memory_size 1856 1857 end function accel_global_memory_size 1858 1859 !-------------------------------------------------------------- 1860 1861 integer(8) pure function accel_local_memory_size() result(size) 1862 1863 size = accel%local_memory_size 1864 1865 end function accel_local_memory_size 1866 1867 !-------------------------------------------------------------- 1868 1869 integer pure function accel_max_size_per_dim(dim) result(size) 1870 integer, intent(in) :: dim 1871 1872 size = 0 1873#ifdef HAVE_OPENCL 1874 size = 2**30 1875#endif 1876#ifdef HAVE_CUDA 1877 if(dim == 1) size = 2**30 1878 size = 32768 1879#endif 1880 end function accel_max_size_per_dim 1881 1882 ! ------------------------------------------------------ 1883 1884 subroutine accel_set_stream(stream_number) 1885 integer, intent(in) :: stream_number 1886 1887 PUSH_SUB(accel_set_stream) 1888 1889 if(accel_is_enabled()) then 1890#ifdef HAVE_CUDA 1891 call cuda_set_stream(accel%cuda_stream, stream_number) 1892 call cublas_set_stream(accel%cublas_handle, accel%cuda_stream) 1893#endif 1894 end if 1895 1896 POP_SUB(accel_set_stream) 1897 end subroutine accel_set_stream 1898 1899 ! ------------------------------------------------------ 1900 1901 subroutine accel_synchronize_all_streams() 1902 PUSH_SUB(accel_synchronize_all_streams) 1903 1904#ifdef HAVE_CUDA 1905 call cuda_synchronize_all_streams() 1906#endif 1907 1908 POP_SUB(accel_synchronize_all_streams) 1909 end subroutine accel_synchronize_all_streams 1910 1911#include "undef.F90" 1912#include "real.F90" 1913#include "accel_inc.F90" 1914 1915#include "undef.F90" 1916#include "complex.F90" 1917#include "accel_inc.F90" 1918 1919#include "undef.F90" 1920#include "integer.F90" 1921#include "accel_inc.F90" 1922 1923end module accel_oct_m 1924 1925!! Local Variables: 1926!! mode: f90 1927!! coding: utf-8 1928!! End: 1929