1!! Copyright (C) 2008 X. Andrade, 2020 S. Ohlmann 2!! 3!! This program is free software; you can redistribute it and/or modify 4!! it under the terms of the GNU General Public License as published by 5!! the Free Software Foundation; either version 2, or (at your option) 6!! any later version. 7!! 8!! This program is distributed in the hope that it will be useful, 9!! but WITHOUT ANY WARRANTY; without even the implied warranty of 10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11!! GNU General Public License for more details. 12!! 13!! You should have received a copy of the GNU General Public License 14!! along with this program; if not, write to the Free Software 15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 16!! 02110-1301, USA. 17!! 18 19#include "global.h" 20 21module batch_oct_m 22 use accel_oct_m 23 use allocate_hardware_aware_oct_m 24 use blas_oct_m 25 use global_oct_m 26 use hardware_oct_m 27 use iso_c_binding 28 use math_oct_m 29 use messages_oct_m 30 use mpi_oct_m 31 use profiling_oct_m 32 use types_oct_m 33 34 implicit none 35 36 private 37 public :: & 38 batch_t, & 39 batch_init, & 40 dbatch_init, & 41 zbatch_init 42 43 type batch_t 44 private 45 integer, public :: nst 46 integer, public :: dim 47 integer :: np 48 49 integer :: ndims 50 integer, pointer :: ist_idim_index(:, :) 51 integer, allocatable, public :: ist(:) 52 53 logical :: is_allocated 54 logical :: own_memory !< does the batch own the memory or is it foreign memory? 55 !> We also need a linear array with the states in order to calculate derivatives, etc. 56 integer, public :: nst_linear 57 58 integer :: status_of 59 integer :: status_host 60 type(type_t) :: type_of !< either TYPE_FLOAT or TYPE_COMPLEX 61 integer :: device_buffer_count !< whether there is a copy in the opencl buffer 62 integer :: host_buffer_count !< whether the batch was packed on the cpu 63 logical :: special_memory 64 logical :: needs_finish_unpack 65 66 67 !> unpacked variables; linear variables are pointers with different shapes 68 FLOAT, pointer, contiguous, public :: dff(:, :, :) 69 CMPLX, pointer, contiguous, public :: zff(:, :, :) 70 FLOAT, pointer, contiguous, public :: dff_linear(:, :) 71 CMPLX, pointer, contiguous, public :: zff_linear(:, :) 72 !> packed variables; only rank-2 arrays due to padding to powers of 2 73 FLOAT, pointer, contiguous, public :: dff_pack(:, :) 74 CMPLX, pointer, contiguous, public :: zff_pack(:, :) 75 76 integer, public :: pack_size(1:2) 77 integer, public :: pack_size_real(1:2) 78 79 type(accel_mem_t), public :: ff_device 80 81 contains 82 procedure :: check_compatibility_with => batch_check_compatibility_with 83 procedure :: clone_to => batch_clone_to 84 procedure :: clone_to_array => batch_clone_to_array 85 procedure :: copy_to => batch_copy_to 86 procedure :: copy_data_to => batch_copy_data_to 87 procedure :: do_pack => batch_do_pack 88 procedure :: do_unpack => batch_do_unpack 89 procedure :: finish_unpack => batch_finish_unpack 90 procedure :: end => batch_end 91 procedure :: inv_index => batch_inv_index 92 procedure :: is_packed => batch_is_packed 93 procedure :: ist_idim_to_linear => batch_ist_idim_to_linear 94 procedure :: linear_to_idim => batch_linear_to_idim 95 procedure :: linear_to_ist => batch_linear_to_ist 96 procedure :: pack_total_size => batch_pack_total_size 97 procedure :: remote_access_start => batch_remote_access_start 98 procedure :: remote_access_stop => batch_remote_access_stop 99 procedure :: status => batch_status 100 procedure :: type => batch_type 101 procedure :: type_as_int => batch_type_as_integer 102 procedure, private :: dallocate_unpacked_host => dbatch_allocate_unpacked_host 103 procedure, private :: zallocate_unpacked_host => zbatch_allocate_unpacked_host 104 procedure, private :: allocate_unpacked_host => batch_allocate_unpacked_host 105 procedure, private :: dallocate_packed_host => dbatch_allocate_packed_host 106 procedure, private :: zallocate_packed_host => zbatch_allocate_packed_host 107 procedure, private :: allocate_packed_host => batch_allocate_packed_host 108 procedure, private :: allocate_packed_device => batch_allocate_packed_device 109 procedure, private :: deallocate_unpacked_host => batch_deallocate_unpacked_host 110 procedure, private :: deallocate_packed_host => batch_deallocate_packed_host 111 procedure, private :: deallocate_packed_device => batch_deallocate_packed_device 112 end type batch_t 113 114 !-------------------------------------------------------------- 115 interface batch_init 116 module procedure dbatch_init_with_memory_3 117 module procedure zbatch_init_with_memory_3 118 module procedure dbatch_init_with_memory_2 119 module procedure zbatch_init_with_memory_2 120 module procedure dbatch_init_with_memory_1 121 module procedure zbatch_init_with_memory_1 122 end interface batch_init 123 124 integer, public, parameter :: & 125 BATCH_NOT_PACKED = 0, & 126 BATCH_PACKED = 1, & 127 BATCH_DEVICE_PACKED = 2 128 129 integer, parameter :: CL_PACK_MAX_BUFFER_SIZE = 4 !< this value controls the size (in number of wave-functions) 130 !! of the buffer used to copy states to the opencl device. 131 132contains 133 134 !-------------------------------------------------------------- 135 subroutine batch_end(this, copy) 136 class(batch_t), intent(inout) :: this 137 logical, optional, intent(in) :: copy 138 139 PUSH_SUB(batch_end) 140 141 if(this%own_memory .and. this%is_packed()) then 142 !deallocate directly to avoid unnecessary copies 143 if(this%status() == BATCH_DEVICE_PACKED) then 144 call this%deallocate_packed_device() 145 end if 146 if(this%status() == BATCH_PACKED .or. this%status_host == BATCH_PACKED) then 147 call this%deallocate_packed_host() 148 end if 149 this%status_of = BATCH_NOT_PACKED 150 this%status_host = BATCH_NOT_PACKED 151 this%host_buffer_count = 0 152 this%device_buffer_count = 0 153 end if 154 if(this%status() == BATCH_DEVICE_PACKED) call this%do_unpack(copy, force = .true.) 155 if(this%status() == BATCH_PACKED) call this%do_unpack(copy, force = .true.) 156 157 if(this%is_allocated) then 158 call this%deallocate_unpacked_host() 159 end if 160 161 SAFE_DEALLOCATE_P(this%ist_idim_index) 162 SAFE_DEALLOCATE_A(this%ist) 163 164 POP_SUB(batch_end) 165 end subroutine batch_end 166 167 !-------------------------------------------------------------- 168 subroutine batch_deallocate_unpacked_host(this) 169 class(batch_t), intent(inout) :: this 170 171 PUSH_SUB(batch_deallocate_unpacked_host) 172 173 this%is_allocated = .false. 174 175 if(this%special_memory) then 176 if(associated(this%dff)) then 177 call deallocate_hardware_aware(c_loc(this%dff(1,1,1))) 178 end if 179 if(associated(this%zff)) then 180 call deallocate_hardware_aware(c_loc(this%zff(1,1,1))) 181 end if 182 else 183 SAFE_DEALLOCATE_P(this%dff) 184 SAFE_DEALLOCATE_P(this%zff) 185 end if 186 nullify(this%dff) 187 nullify(this%dff_linear) 188 nullify(this%zff) 189 nullify(this%zff_linear) 190 191 POP_SUB(batch_deallocate_unpacked_host) 192 end subroutine batch_deallocate_unpacked_host 193 194 !-------------------------------------------------------------- 195 subroutine batch_deallocate_packed_host(this) 196 class(batch_t), intent(inout) :: this 197 198 PUSH_SUB(batch_deallocate_packed_host) 199 200 if(this%special_memory) then 201 if(associated(this%dff_pack)) then 202 call deallocate_hardware_aware(c_loc(this%dff_pack(1,1))) 203 end if 204 if(associated(this%zff_pack)) then 205 call deallocate_hardware_aware(c_loc(this%zff_pack(1,1))) 206 end if 207 else 208 SAFE_DEALLOCATE_P(this%dff_pack) 209 SAFE_DEALLOCATE_P(this%zff_pack) 210 end if 211 nullify(this%dff_pack) 212 nullify(this%zff_pack) 213 214 POP_SUB(batch_deallocate_packed_host) 215 end subroutine batch_deallocate_packed_host 216 217 !-------------------------------------------------------------- 218 subroutine batch_deallocate_packed_device(this) 219 class(batch_t), intent(inout) :: this 220 221 PUSH_SUB(batch_deallocate_packed_device) 222 223 call accel_release_buffer(this%ff_device) 224 225 POP_SUB(batch_deallocate_packed_device) 226 end subroutine batch_deallocate_packed_device 227 228 !-------------------------------------------------------------- 229 subroutine batch_allocate_unpacked_host(this) 230 class(batch_t), intent(inout) :: this 231 232 PUSH_SUB(batch_allocate_unpacked_host) 233 234 if(this%type() == TYPE_FLOAT) then 235 call this%dallocate_unpacked_host() 236 else if(this%type() == TYPE_CMPLX) then 237 call this%zallocate_unpacked_host() 238 end if 239 240 POP_SUB(batch_allocate_unpacked_host) 241 end subroutine batch_allocate_unpacked_host 242 243 !-------------------------------------------------------------- 244 subroutine batch_allocate_packed_host(this) 245 class(batch_t), intent(inout) :: this 246 247 PUSH_SUB(batch_allocate_packed_host) 248 249 if(this%type() == TYPE_FLOAT) then 250 call this%dallocate_packed_host() 251 else if(this%type() == TYPE_CMPLX) then 252 call this%zallocate_packed_host() 253 end if 254 255 POP_SUB(batch_allocate_packed_host) 256 end subroutine batch_allocate_packed_host 257 258 !-------------------------------------------------------------- 259 subroutine batch_allocate_packed_device(this) 260 class(batch_t), intent(inout) :: this 261 262 PUSH_SUB(batch_allocate_packed_device) 263 264 call accel_create_buffer(this%ff_device, ACCEL_MEM_READ_WRITE, this%type(), product(this%pack_size)) 265 266 POP_SUB(batch_allocate_packed_device) 267 end subroutine batch_allocate_packed_device 268 269 !-------------------------------------------------------------- 270 subroutine batch_init_empty (this, dim, nst, np) 271 type(batch_t), intent(out) :: this 272 integer, intent(in) :: dim 273 integer, intent(in) :: nst 274 integer, intent(in) :: np 275 276 PUSH_SUB(batch_init_empty) 277 278 this%is_allocated = .false. 279 this%own_memory = .false. 280 this%special_memory = .false. 281 this%needs_finish_unpack = .false. 282 this%nst = nst 283 this%dim = dim 284 this%type_of = TYPE_NONE 285 286 this%nst_linear = nst*dim 287 288 this%np = np 289 this%device_buffer_count = 0 290 this%host_buffer_count = 0 291 this%status_of = BATCH_NOT_PACKED 292 this%status_host = BATCH_NOT_PACKED 293 294 this%ndims = 2 295 SAFE_ALLOCATE(this%ist_idim_index(1:this%nst_linear, 1:this%ndims)) 296 SAFE_ALLOCATE(this%ist(1:this%nst)) 297 298 nullify(this%dff, this%zff, this%dff_linear, this%zff_linear) 299 nullify(this%dff_pack, this%zff_pack) 300 301 POP_SUB(batch_init_empty) 302 end subroutine batch_init_empty 303 304 !-------------------------------------------------------------- 305 306 subroutine batch_clone_to(this, dest, pack, copy_data) 307 class(batch_t), intent(in) :: this 308 class(batch_t), allocatable, intent(out) :: dest 309 logical, optional, intent(in) :: pack !< If .false. the new batch will not be packed. Default: batch_is_packed(this) 310 logical, optional, intent(in) :: copy_data !< If .true. the batch data will be copied to the destination batch. Default: .false. 311 312 PUSH_SUB(batch_clone_to) 313 314 if (.not. allocated(dest)) then 315 SAFE_ALLOCATE_TYPE(batch_t, dest) 316 else 317 message(1) = "Internal error: destination batch in batch_clone_to has been previously allocated." 318 call messages_fatal(1) 319 end if 320 321 call this%copy_to(dest, pack, copy_data) 322 323 POP_SUB(batch_clone_to) 324 end subroutine batch_clone_to 325 326 !-------------------------------------------------------------- 327 328 subroutine batch_clone_to_array(this, dest, n_batches, pack, copy_data) 329 class(batch_t), intent(in) :: this 330 class(batch_t), allocatable, intent(out) :: dest(:) 331 integer, intent(in) :: n_batches 332 logical, optional, intent(in) :: pack !< If .false. the new batch will not be packed. Default: batch_is_packed(this) 333 logical, optional, intent(in) :: copy_data !< If .true. the batch data will be copied to the destination batch. Default: .false. 334 335 integer :: ib 336 337 PUSH_SUB(batch_clone_to_array) 338 339 if (.not. allocated(dest)) then 340 SAFE_ALLOCATE_TYPE_ARRAY(batch_t, dest, (1:n_batches)) 341 else 342 message(1) = "Internal error: destination batch in batch_clone_to_array has been previously allocated." 343 call messages_fatal(1) 344 end if 345 346 do ib = 1, n_batches 347 call this%copy_to(dest(ib), pack, copy_data) 348 end do 349 350 POP_SUB(batch_clone_to_array) 351 end subroutine batch_clone_to_array 352 353 !-------------------------------------------------------------- 354 355 subroutine batch_copy_to(this, dest, pack, copy_data) 356 class(batch_t), intent(in) :: this 357 class(batch_t), intent(out) :: dest 358 logical, optional, intent(in) :: pack !< If .false. the new batch will not be packed. Default: batch_is_packed(this) 359 logical, optional, intent(in) :: copy_data !< If .true. the batch data will be copied to the destination batch. Default: .false. 360 361 logical :: host_packed 362 363 PUSH_SUB(batch_copy_to) 364 365 host_packed = this%host_buffer_count > 0 366 if(this%type() == TYPE_FLOAT) then 367 call dbatch_init(dest, this%dim, 1, this%nst, this%np, packed=host_packed) 368 else if(this%type() == TYPE_CMPLX) then 369 call zbatch_init(dest, this%dim, 1, this%nst, this%np, packed=host_packed) 370 else 371 message(1) = "Internal error: unknown batch type in batch_copy_to." 372 call messages_fatal(1) 373 end if 374 375 if(this%status() /= dest%status() .and. optional_default(pack, this%is_packed())) call dest%do_pack(copy = .false.) 376 377 dest%ist_idim_index(1:this%nst_linear, 1:this%ndims) = this%ist_idim_index(1:this%nst_linear, 1:this%ndims) 378 dest%ist(1:this%nst) = this%ist(1:this%nst) 379 380 if(optional_default(copy_data, .false.)) call this%copy_data_to(this%np, dest) 381 382 POP_SUB(batch_copy_to) 383 end subroutine batch_copy_to 384 385 ! ---------------------------------------------------- 386 !> THREADSAFE 387 type(type_t) pure function batch_type(this) result(btype) 388 class(batch_t), intent(in) :: this 389 390 btype = this%type_of 391 392 end function batch_type 393 394 ! ---------------------------------------------------- 395 !> For debuging purpose only 396 integer pure function batch_type_as_integer(this) result(itype) 397 class(batch_t), intent(in) :: this 398 399 type(type_t) :: btype 400 401 itype = 0 402 btype = this%type() 403 if( btype == TYPE_FLOAT ) itype = 1 404 if( btype == TYPE_CMPLX ) itype = 2 405 406 end function batch_type_as_integer 407 408 ! ---------------------------------------------------- 409 !> THREADSAFE 410 integer pure function batch_status(this) result(bstatus) 411 class(batch_t), intent(in) :: this 412 413 bstatus = this%status_of 414 end function batch_status 415 416 ! ---------------------------------------------------- 417 418 logical pure function batch_is_packed(this) result(in_buffer) 419 class(batch_t), intent(in) :: this 420 421 in_buffer = (this%device_buffer_count > 0) .or. (this%host_buffer_count > 0) 422 end function batch_is_packed 423 424 ! ---------------------------------------------------- 425 426 integer function batch_pack_total_size(this) result(size) 427 class(batch_t), intent(inout) :: this 428 429 size = this%np 430 if(accel_is_enabled()) size = accel_padded_size(size) 431 size = size*pad_pow2(this%nst_linear)*types_get_size(this%type()) 432 433 end function batch_pack_total_size 434 435 ! ---------------------------------------------------- 436 437 subroutine batch_do_pack(this, copy, async) 438 class(batch_t), intent(inout) :: this 439 logical, optional, intent(in) :: copy 440 logical, optional, intent(in) :: async 441 442 logical :: copy_ 443 logical :: async_ 444 type(profile_t), save :: prof 445 integer :: source, target 446 447 ! no push_sub, called too frequently 448 449 call profiling_in(prof, "BATCH_DO_PACK") 450 451 copy_ = optional_default(copy, .true.) 452 453 async_ = optional_default(async, .false.) 454 455 ! get source and target states for this batch 456 source = this%status() 457 select case(source) 458 case(BATCH_NOT_PACKED, BATCH_PACKED) 459 if(accel_is_enabled()) then 460 target = BATCH_DEVICE_PACKED 461 else 462 target = BATCH_PACKED 463 end if 464 case(BATCH_DEVICE_PACKED) 465 target = BATCH_DEVICE_PACKED 466 end select 467 468 ! only do something if target is different from source 469 if(source /= target) then 470 select case(target) 471 case(BATCH_DEVICE_PACKED) 472 call this%allocate_packed_device() 473 this%status_of = BATCH_DEVICE_PACKED 474 475 if(copy_) then 476 select case(source) 477 case(BATCH_NOT_PACKED) 478 ! copy from unpacked host array to device 479 call batch_write_unpacked_to_device(this) 480 case(BATCH_PACKED) 481 ! copy from packed host array to device 482 call batch_write_packed_to_device(this, async_) 483 end select 484 end if 485 case(BATCH_PACKED) 486 call this%allocate_packed_host() 487 this%status_of = BATCH_PACKED 488 this%status_host = BATCH_PACKED 489 490 if(copy_) then 491 if(this%type() == TYPE_FLOAT) then 492 call dbatch_pack_copy(this) 493 else if(this%type() == TYPE_CMPLX) then 494 call zbatch_pack_copy(this) 495 end if 496 end if 497 if(this%own_memory) call this%deallocate_unpacked_host() 498 end select 499 end if 500 501 select case(target) 502 case(BATCH_DEVICE_PACKED) 503 INCR(this%device_buffer_count, 1) 504 case(BATCH_PACKED) 505 INCR(this%host_buffer_count, 1) 506 end select 507 508 call profiling_out(prof) 509 end subroutine batch_do_pack 510 511 ! ---------------------------------------------------- 512 513 subroutine batch_do_unpack(this, copy, force, async) 514 class(batch_t), intent(inout) :: this 515 logical, optional, intent(in) :: copy 516 logical, optional, intent(in) :: force !< if force = .true., unpack independently of the counter 517 logical, optional, intent(in) :: async 518 519 logical :: copy_, force_, async_ 520 type(profile_t), save :: prof 521 integer :: source, target 522 523 PUSH_SUB(batch_do_unpack) 524 525 call profiling_in(prof, "BATCH_DO_UNPACK") 526 527 copy_ = optional_default(copy, .true.) 528 529 force_ = optional_default(force, .false.) 530 531 async_ = optional_default(async, .false.) 532 533 ! get source and target states for this batch 534 source = this%status() 535 select case(source) 536 case(BATCH_NOT_PACKED) 537 target = source 538 case(BATCH_PACKED) 539 target = BATCH_NOT_PACKED 540 case(BATCH_DEVICE_PACKED) 541 target = this%status_host 542 end select 543 544 ! only do something if target is different from source 545 if(source /= target) then 546 select case(source) 547 case(BATCH_PACKED) 548 if(this%host_buffer_count == 1 .or. force_) then 549 if(this%own_memory) call this%allocate_unpacked_host() 550 ! unpack from packed_host to unpacked_host 551 if(copy_ .or. this%own_memory) then 552 if(this%type() == TYPE_FLOAT) then 553 call dbatch_unpack_copy(this) 554 else if(this%type() == TYPE_CMPLX) then 555 call zbatch_unpack_copy(this) 556 end if 557 end if 558 call this%deallocate_packed_host() 559 this%status_host = target 560 this%status_of = target 561 this%host_buffer_count = 1 562 end if 563 INCR(this%host_buffer_count, -1) 564 case(BATCH_DEVICE_PACKED) 565 if(this%device_buffer_count == 1 .or. force_) then 566 if(copy_) then 567 select case(target) 568 ! unpack from packed_device to unpacked_host 569 case(BATCH_NOT_PACKED) 570 call batch_read_device_to_unpacked(this) 571 ! unpack from packed_device to packed_host 572 case(BATCH_PACKED) 573 call batch_read_device_to_packed(this, async_) 574 end select 575 end if 576 if(async_) then 577 this%needs_finish_unpack = .true. 578 else 579 call this%deallocate_packed_device() 580 end if 581 this%status_of = target 582 this%device_buffer_count = 1 583 end if 584 INCR(this%device_buffer_count, -1) 585 end select 586 end if 587 588 call profiling_out(prof) 589 590 POP_SUB(batch_do_unpack) 591 end subroutine batch_do_unpack 592 593 ! ---------------------------------------------------- 594 subroutine batch_finish_unpack(this) 595 class(batch_t), intent(inout) :: this 596 597 PUSH_SUB(batch_finish_unpack) 598 if(this%needs_finish_unpack) then 599 call accel_finish() 600 call this%deallocate_packed_device() 601 this%needs_finish_unpack = .false. 602 end if 603 POP_SUB(batch_finish_unpack) 604 end subroutine batch_finish_unpack 605 606 ! ---------------------------------------------------- 607 608 subroutine batch_write_unpacked_to_device(this) 609 class(batch_t), intent(inout) :: this 610 611 integer :: ist, ist2, unroll 612 type(accel_mem_t) :: tmp 613 type(profile_t), save :: prof, prof_pack 614 type(accel_kernel_t), pointer :: kernel 615 616 PUSH_SUB(batch_write_unpacked_to_device) 617 618 call profiling_in(prof, "BATCH_PACK_COPY_CL") 619 if(this%nst_linear == 1) then 620 ! we can copy directly 621 if(this%type() == TYPE_FLOAT) then 622 call accel_write_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1)) 623 else if(this%type() == TYPE_CMPLX) then 624 call accel_write_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1)) 625 else 626 ASSERT(.false.) 627 end if 628 629 else 630 ! we copy to a temporary array and then we re-arrange data 631 632 if(this%type() == TYPE_FLOAT) then 633 kernel => dpack 634 else 635 kernel => zpack 636 end if 637 638 unroll = min(CL_PACK_MAX_BUFFER_SIZE, this%pack_size(1)) 639 640 call accel_create_buffer(tmp, ACCEL_MEM_READ_ONLY, this%type(), unroll*this%pack_size(2)) 641 642 do ist = 1, this%nst_linear, unroll 643 644 ! copy a number 'unroll' of states to the buffer 645 do ist2 = ist, min(ist + unroll - 1, this%nst_linear) 646 647 if(this%type() == TYPE_FLOAT) then 648 call accel_write_buffer(tmp, ubound(this%dff_linear, dim=1), this%dff_linear(:, ist2), & 649 offset = (ist2 - ist)*this%pack_size(2)) 650 else 651 call accel_write_buffer(tmp, ubound(this%zff_linear, dim=1), this%zff_linear(:, ist2), & 652 offset = (ist2 - ist)*this%pack_size(2)) 653 end if 654 end do 655 656 ! now call an opencl kernel to rearrange the data 657 call accel_set_kernel_arg(kernel, 0, this%pack_size(1)) 658 call accel_set_kernel_arg(kernel, 1, this%pack_size(2)) 659 call accel_set_kernel_arg(kernel, 2, ist - 1) 660 call accel_set_kernel_arg(kernel, 3, tmp) 661 call accel_set_kernel_arg(kernel, 4, this%ff_device) 662 663 call profiling_in(prof_pack, "CL_PACK") 664 call accel_kernel_run(kernel, (/this%pack_size(2), unroll/), (/accel_max_workgroup_size()/unroll, unroll/)) 665 666 if(this%type() == TYPE_FLOAT) then 667 call profiling_count_transfers(unroll*this%pack_size(2), M_ONE) 668 else 669 call profiling_count_transfers(unroll*this%pack_size(2), M_ZI) 670 end if 671 672 call accel_finish() 673 call profiling_out(prof_pack) 674 675 end do 676 677 call accel_release_buffer(tmp) 678 679 end if 680 681 call profiling_out(prof) 682 POP_SUB(batch_write_unpacked_to_device) 683 end subroutine batch_write_unpacked_to_device 684 685 ! ------------------------------------------------------------------ 686 687 subroutine batch_read_device_to_unpacked(this) 688 class(batch_t), intent(inout) :: this 689 690 integer :: ist, ist2, unroll 691 type(accel_mem_t) :: tmp 692 type(accel_kernel_t), pointer :: kernel 693 type(profile_t), save :: prof, prof_unpack 694 695 PUSH_SUB(batch_read_device_to_unpacked) 696 call profiling_in(prof, "BATCH_UNPACK_COPY_CL") 697 698 if(this%nst_linear == 1) then 699 ! we can copy directly 700 if(this%type() == TYPE_FLOAT) then 701 call accel_read_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1)) 702 else 703 call accel_read_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1)) 704 end if 705 else 706 707 unroll = min(CL_PACK_MAX_BUFFER_SIZE, this%pack_size(1)) 708 709 ! we use a kernel to move to a temporary array and then we read 710 call accel_create_buffer(tmp, ACCEL_MEM_WRITE_ONLY, this%type(), unroll*this%pack_size(2)) 711 712 if(this%type() == TYPE_FLOAT) then 713 kernel => dunpack 714 else 715 kernel => zunpack 716 end if 717 718 do ist = 1, this%nst_linear, unroll 719 call accel_set_kernel_arg(kernel, 0, this%pack_size(1)) 720 call accel_set_kernel_arg(kernel, 1, this%pack_size(2)) 721 call accel_set_kernel_arg(kernel, 2, ist - 1) 722 call accel_set_kernel_arg(kernel, 3, this%ff_device) 723 call accel_set_kernel_arg(kernel, 4, tmp) 724 725 call profiling_in(prof_unpack, "CL_UNPACK") 726 call accel_kernel_run(kernel, (/unroll, this%pack_size(2)/), (/unroll, accel_max_workgroup_size()/unroll/)) 727 728 if(this%type() == TYPE_FLOAT) then 729 call profiling_count_transfers(unroll*this%pack_size(2), M_ONE) 730 else 731 call profiling_count_transfers(unroll*this%pack_size(2), M_ZI) 732 end if 733 734 call accel_finish() 735 call profiling_out(prof_unpack) 736 737 ! copy a number 'unroll' of states from the buffer 738 do ist2 = ist, min(ist + unroll - 1, this%nst_linear) 739 740 if(this%type() == TYPE_FLOAT) then 741 call accel_read_buffer(tmp, ubound(this%dff_linear, dim=1), this%dff_linear(:, ist2), & 742 offset = (ist2 - ist)*this%pack_size(2)) 743 else 744 call accel_read_buffer(tmp, ubound(this%zff_linear, dim=1), this%zff_linear(:, ist2), & 745 offset = (ist2 - ist)*this%pack_size(2)) 746 end if 747 end do 748 749 end do 750 751 call accel_release_buffer(tmp) 752 end if 753 754 call profiling_out(prof) 755 POP_SUB(batch_read_device_to_unpacked) 756 end subroutine batch_read_device_to_unpacked 757 758 ! ------------------------------------------------------------------ 759 subroutine batch_write_packed_to_device(this, async) 760 class(batch_t), intent(inout) :: this 761 logical, optional, intent(in) :: async 762 763 type(profile_t), save :: prof_pack 764 765 PUSH_SUB(batch_write_packed_to_device) 766 767 call profiling_in(prof_pack, "BATCH_PACK_COPY_CL") 768 if(this%type() == TYPE_FLOAT) then 769 call accel_write_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async) 770 else 771 call accel_write_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async) 772 end if 773 call profiling_out(prof_pack) 774 775 POP_SUB(batch_write_packed_to_device) 776 end subroutine batch_write_packed_to_device 777 778 ! ------------------------------------------------------------------ 779 subroutine batch_read_device_to_packed(this, async) 780 class(batch_t), intent(inout) :: this 781 logical, optional, intent(in) :: async 782 783 type(profile_t), save :: prof_unpack 784 785 PUSH_SUB(batch_read_device_to_packed) 786 787 call profiling_in(prof_unpack, "BATCH_UNPACK_COPY_CL") 788 if(this%type() == TYPE_FLOAT) then 789 call accel_read_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async) 790 else 791 call accel_read_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async) 792 end if 793 call profiling_out(prof_unpack) 794 795 POP_SUB(batch_read_device_to_packed) 796 end subroutine batch_read_device_to_packed 797 798! ------------------------------------------------------ 799integer function batch_inv_index(this, cind) result(index) 800 class(batch_t), intent(in) :: this 801 integer, intent(in) :: cind(:) 802 803 do index = 1, this%nst_linear 804 if(all(cind(1:this%ndims) == this%ist_idim_index(index, 1:this%ndims))) exit 805 end do 806 807 ASSERT(index <= this%nst_linear) 808 809end function batch_inv_index 810 811! ------------------------------------------------------ 812 813integer pure function batch_ist_idim_to_linear(this, cind) result(index) 814 class(batch_t), intent(in) :: this 815 integer, intent(in) :: cind(:) 816 817 if(ubound(cind, dim = 1) == 1) then 818 index = cind(1) 819 else 820 index = (cind(1) - 1)*this%dim + cind(2) 821 end if 822 823end function batch_ist_idim_to_linear 824 825! ------------------------------------------------------ 826 827integer pure function batch_linear_to_ist(this, linear_index) result(ist) 828 class(batch_t), intent(in) :: this 829 integer, intent(in) :: linear_index 830 831 ist = this%ist_idim_index(linear_index, 1) 832 833end function batch_linear_to_ist 834 835! ------------------------------------------------------ 836 837integer pure function batch_linear_to_idim(this, linear_index) result(idim) 838 class(batch_t), intent(in) :: this 839 integer, intent(in) :: linear_index 840 841 idim = this%ist_idim_index(linear_index, 2) 842 843end function batch_linear_to_idim 844 845! ------------------------------------------------------ 846 847subroutine batch_remote_access_start(this, mpi_grp, rma_win) 848 class(batch_t), intent(inout) :: this 849 type(mpi_grp_t), intent(in) :: mpi_grp 850 integer, intent(out) :: rma_win 851 852 PUSH_SUB(batch_remote_access_start) 853 854 ASSERT(.not. accel_is_enabled()) 855 856 if(mpi_grp%size > 1) then 857 call this%do_pack() 858 859 if(this%type() == TYPE_CMPLX) then 860#ifdef HAVE_MPI2 861 call MPI_Win_create(this%zff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), MPI_ADDRESS_KIND), & 862 types_get_size(this%type()), MPI_INFO_NULL, mpi_grp%comm, rma_win, mpi_err) 863#endif 864 else if (this%type() == TYPE_FLOAT) then 865#ifdef HAVE_MPI2 866 call MPI_Win_create(this%dff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), MPI_ADDRESS_KIND), & 867 types_get_size(this%type()), MPI_INFO_NULL, mpi_grp%comm, rma_win, mpi_err) 868#endif 869 else 870 message(1) = "Internal error: unknown batch type in batch_remote_access_start." 871 call messages_fatal(1) 872 end if 873 874 else 875 rma_win = -1 876 end if 877 878 POP_SUB(batch_remote_access_start) 879end subroutine batch_remote_access_start 880 881! ------------------------------------------------------ 882 883subroutine batch_remote_access_stop(this, rma_win) 884 class(batch_t), intent(inout) :: this 885 integer, intent(inout) :: rma_win 886 887 PUSH_SUB(batch_remote_access_stop) 888 889 if(rma_win /= -1) then 890#ifdef HAVE_MPI2 891 call MPI_Win_free(rma_win, mpi_err) 892#endif 893 call this%do_unpack() 894 end if 895 896 POP_SUB(batch_remote_access_stop) 897end subroutine batch_remote_access_stop 898 899! -------------------------------------------------------------- 900 901subroutine batch_copy_data_to(this, np, dest) 902 class(batch_t), intent(in) :: this 903 integer, intent(in) :: np 904 class(batch_t), intent(inout) :: dest 905 906 integer :: ist, dim2, dim3 907 type(profile_t), save :: prof 908 integer :: localsize 909 910 PUSH_SUB(batch_copy_data_to) 911 call profiling_in(prof, "BATCH_COPY_DATA_TO") 912 913 call this%check_compatibility_with(dest) 914 915 select case(this%status()) 916 case(BATCH_DEVICE_PACKED) 917 call accel_set_kernel_arg(kernel_copy, 0, np) 918 call accel_set_kernel_arg(kernel_copy, 1, this%ff_device) 919 call accel_set_kernel_arg(kernel_copy, 2, log2(this%pack_size_real(1))) 920 call accel_set_kernel_arg(kernel_copy, 3, dest%ff_device) 921 call accel_set_kernel_arg(kernel_copy, 4, log2(dest%pack_size_real(1))) 922 923 localsize = accel_kernel_workgroup_size(kernel_copy)/dest%pack_size_real(1) 924 925 dim3 = np/(accel_max_size_per_dim(2)*localsize) + 1 926 dim2 = min(accel_max_size_per_dim(2)*localsize, pad(np, localsize)) 927 928 call accel_kernel_run(kernel_copy, (/dest%pack_size_real(1), dim2, dim3/), (/dest%pack_size_real(1), localsize, 1/)) 929 930 call accel_finish() 931 932 case(BATCH_PACKED) 933 if(dest%type() == TYPE_FLOAT) then 934 call blas_copy(np*this%pack_size(1), this%dff_pack(1, 1), 1, dest%dff_pack(1, 1), 1) 935 else 936 call blas_copy(np*this%pack_size(1), this%zff_pack(1, 1), 1, dest%zff_pack(1, 1), 1) 937 end if 938 939 case(BATCH_NOT_PACKED) 940 !$omp parallel do private(ist) 941 do ist = 1, dest%nst_linear 942 if(dest%type() == TYPE_CMPLX) then 943 call blas_copy(np, this%zff_linear(1, ist), 1, dest%zff_linear(1, ist), 1) 944 else 945 call blas_copy(np, this%dff_linear(1, ist), 1, dest%dff_linear(1, ist), 1) 946 end if 947 end do 948 949 end select 950 951 call profiling_out(prof) 952 POP_SUB(batch_copy_data_to) 953end subroutine batch_copy_data_to 954 955! -------------------------------------------------------------- 956 957subroutine batch_check_compatibility_with(this, target, only_check_dim) 958 class(batch_t), intent(in) :: this 959 class(batch_t), intent(in) :: target 960 logical, optional, intent(in) :: only_check_dim 961 962 PUSH_SUB(batch_check_compatibility_with) 963 964 ASSERT(this%type() == target%type()) 965 if(.not. optional_default(only_check_dim, .false.)) then 966 ASSERT(this%nst_linear == target%nst_linear) 967 end if 968 ASSERT(this%status() == target%status()) 969 ASSERT(this%dim == target%dim) 970 971 POP_SUB(batch_check_compatibility_with) 972 973end subroutine batch_check_compatibility_with 974 975!-------------------------------------------------------------- 976subroutine batch_build_indices(this, st_start, st_end) 977 class(batch_t), intent(inout) :: this 978 integer, intent(in) :: st_start 979 integer, intent(in) :: st_end 980 981 integer :: idim, ii, ist 982 983 PUSH_SUB(batch_build_indices) 984 985 do ist = st_start, st_end 986 ! now we also populate the linear array 987 do idim = 1, this%dim 988 ii = this%dim*(ist - st_start) + idim 989 this%ist_idim_index(ii, 1) = ist 990 this%ist_idim_index(ii, 2) = idim 991 end do 992 this%ist(ist - st_start + 1) = ist 993 end do 994 995 ! compute packed sizes 996 this%pack_size(1) = pad_pow2(this%nst_linear) 997 this%pack_size(2) = this%np 998 if(accel_is_enabled()) this%pack_size(2) = accel_padded_size(this%pack_size(2)) 999 1000 this%pack_size_real = this%pack_size 1001 if(type_is_complex(this%type())) this%pack_size_real(1) = 2*this%pack_size_real(1) 1002 1003 POP_SUB(batch_build_indices) 1004end subroutine batch_build_indices 1005 1006 1007#include "real.F90" 1008#include "batch_inc.F90" 1009#include "undef.F90" 1010 1011#include "complex.F90" 1012#include "batch_inc.F90" 1013#include "undef.F90" 1014 1015end module batch_oct_m 1016 1017!! Local Variables: 1018!! mode: f90 1019!! coding: utf-8 1020!! End: 1021