1!! Copyright (C) 2008 X. Andrade, 2020 S. Ohlmann
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17!!
18
19#include "global.h"
20
21module batch_oct_m
22  use accel_oct_m
23  use allocate_hardware_aware_oct_m
24  use blas_oct_m
25  use global_oct_m
26  use hardware_oct_m
27  use iso_c_binding
28  use math_oct_m
29  use messages_oct_m
30  use mpi_oct_m
31  use profiling_oct_m
32  use types_oct_m
33
34  implicit none
35
36  private
37  public ::                         &
38    batch_t,                        &
39    batch_init,                     &
40    dbatch_init,                    &
41    zbatch_init
42
43  type batch_t
44    private
45    integer,                        public :: nst
46    integer,                        public :: dim
47    integer                                :: np
48
49    integer                                :: ndims
50    integer,               pointer         :: ist_idim_index(:, :)
51    integer,           allocatable, public :: ist(:)
52
53    logical                                :: is_allocated
54    logical                                :: own_memory !< does the batch own the memory or is it foreign memory?
55    !> We also need a linear array with the states in order to calculate derivatives, etc.
56    integer,                        public :: nst_linear
57
58    integer                                :: status_of
59    integer                                :: status_host
60    type(type_t)                           :: type_of !< either TYPE_FLOAT or TYPE_COMPLEX
61    integer                                :: device_buffer_count !< whether there is a copy in the opencl buffer
62    integer                                :: host_buffer_count !< whether the batch was packed on the cpu
63    logical                                :: special_memory
64    logical                                :: needs_finish_unpack
65
66
67    !> unpacked variables; linear variables are pointers with different shapes
68    FLOAT, pointer, contiguous,     public :: dff(:, :, :)
69    CMPLX, pointer, contiguous,     public :: zff(:, :, :)
70    FLOAT, pointer, contiguous,     public :: dff_linear(:, :)
71    CMPLX, pointer, contiguous,     public :: zff_linear(:, :)
72    !> packed variables; only rank-2 arrays due to padding to powers of 2
73    FLOAT, pointer, contiguous,     public :: dff_pack(:, :)
74    CMPLX, pointer, contiguous,     public :: zff_pack(:, :)
75
76    integer,                        public :: pack_size(1:2)
77    integer,                        public :: pack_size_real(1:2)
78
79    type(accel_mem_t),              public :: ff_device
80
81  contains
82    procedure :: check_compatibility_with => batch_check_compatibility_with
83    procedure :: clone_to => batch_clone_to
84    procedure :: clone_to_array => batch_clone_to_array
85    procedure :: copy_to => batch_copy_to
86    procedure :: copy_data_to => batch_copy_data_to
87    procedure :: do_pack => batch_do_pack
88    procedure :: do_unpack => batch_do_unpack
89    procedure :: finish_unpack => batch_finish_unpack
90    procedure :: end => batch_end
91    procedure :: inv_index => batch_inv_index
92    procedure :: is_packed => batch_is_packed
93    procedure :: ist_idim_to_linear => batch_ist_idim_to_linear
94    procedure :: linear_to_idim => batch_linear_to_idim
95    procedure :: linear_to_ist => batch_linear_to_ist
96    procedure :: pack_total_size => batch_pack_total_size
97    procedure :: remote_access_start => batch_remote_access_start
98    procedure :: remote_access_stop => batch_remote_access_stop
99    procedure :: status => batch_status
100    procedure :: type => batch_type
101    procedure :: type_as_int => batch_type_as_integer
102    procedure, private :: dallocate_unpacked_host => dbatch_allocate_unpacked_host
103    procedure, private :: zallocate_unpacked_host => zbatch_allocate_unpacked_host
104    procedure, private :: allocate_unpacked_host => batch_allocate_unpacked_host
105    procedure, private :: dallocate_packed_host => dbatch_allocate_packed_host
106    procedure, private :: zallocate_packed_host => zbatch_allocate_packed_host
107    procedure, private :: allocate_packed_host => batch_allocate_packed_host
108    procedure, private :: allocate_packed_device => batch_allocate_packed_device
109    procedure, private :: deallocate_unpacked_host => batch_deallocate_unpacked_host
110    procedure, private :: deallocate_packed_host => batch_deallocate_packed_host
111    procedure, private :: deallocate_packed_device => batch_deallocate_packed_device
112  end type batch_t
113
114  !--------------------------------------------------------------
115  interface batch_init
116    module procedure dbatch_init_with_memory_3
117    module procedure zbatch_init_with_memory_3
118    module procedure dbatch_init_with_memory_2
119    module procedure zbatch_init_with_memory_2
120    module procedure dbatch_init_with_memory_1
121    module procedure zbatch_init_with_memory_1
122  end interface batch_init
123
124  integer, public, parameter :: &
125    BATCH_NOT_PACKED     = 0,   &
126    BATCH_PACKED         = 1,   &
127    BATCH_DEVICE_PACKED  = 2
128
129  integer, parameter :: CL_PACK_MAX_BUFFER_SIZE = 4 !< this value controls the size (in number of wave-functions)
130                                                    !! of the buffer used to copy states to the opencl device.
131
132contains
133
134  !--------------------------------------------------------------
135  subroutine batch_end(this, copy)
136    class(batch_t),          intent(inout) :: this
137    logical,       optional, intent(in)    :: copy
138
139    PUSH_SUB(batch_end)
140
141    if(this%own_memory .and. this%is_packed()) then
142      !deallocate directly to avoid unnecessary copies
143      if(this%status() == BATCH_DEVICE_PACKED) then
144        call this%deallocate_packed_device()
145      end if
146      if(this%status() == BATCH_PACKED .or. this%status_host == BATCH_PACKED) then
147        call this%deallocate_packed_host()
148      end if
149      this%status_of = BATCH_NOT_PACKED
150      this%status_host = BATCH_NOT_PACKED
151      this%host_buffer_count = 0
152      this%device_buffer_count = 0
153    end if
154    if(this%status() == BATCH_DEVICE_PACKED) call this%do_unpack(copy, force = .true.)
155    if(this%status() == BATCH_PACKED) call this%do_unpack(copy, force = .true.)
156
157    if(this%is_allocated) then
158      call this%deallocate_unpacked_host()
159    end if
160
161    SAFE_DEALLOCATE_P(this%ist_idim_index)
162    SAFE_DEALLOCATE_A(this%ist)
163
164    POP_SUB(batch_end)
165  end subroutine batch_end
166
167  !--------------------------------------------------------------
168  subroutine batch_deallocate_unpacked_host(this)
169    class(batch_t),  intent(inout) :: this
170
171    PUSH_SUB(batch_deallocate_unpacked_host)
172
173    this%is_allocated = .false.
174
175    if(this%special_memory) then
176      if(associated(this%dff)) then
177        call deallocate_hardware_aware(c_loc(this%dff(1,1,1)))
178      end if
179      if(associated(this%zff)) then
180        call deallocate_hardware_aware(c_loc(this%zff(1,1,1)))
181      end if
182    else
183      SAFE_DEALLOCATE_P(this%dff)
184      SAFE_DEALLOCATE_P(this%zff)
185    end if
186    nullify(this%dff)
187    nullify(this%dff_linear)
188    nullify(this%zff)
189    nullify(this%zff_linear)
190
191    POP_SUB(batch_deallocate_unpacked_host)
192  end subroutine batch_deallocate_unpacked_host
193
194  !--------------------------------------------------------------
195  subroutine batch_deallocate_packed_host(this)
196    class(batch_t),  intent(inout) :: this
197
198    PUSH_SUB(batch_deallocate_packed_host)
199
200    if(this%special_memory) then
201      if(associated(this%dff_pack)) then
202        call deallocate_hardware_aware(c_loc(this%dff_pack(1,1)))
203      end if
204      if(associated(this%zff_pack)) then
205        call deallocate_hardware_aware(c_loc(this%zff_pack(1,1)))
206      end if
207    else
208      SAFE_DEALLOCATE_P(this%dff_pack)
209      SAFE_DEALLOCATE_P(this%zff_pack)
210    end if
211    nullify(this%dff_pack)
212    nullify(this%zff_pack)
213
214    POP_SUB(batch_deallocate_packed_host)
215  end subroutine batch_deallocate_packed_host
216
217  !--------------------------------------------------------------
218  subroutine batch_deallocate_packed_device(this)
219    class(batch_t),  intent(inout) :: this
220
221    PUSH_SUB(batch_deallocate_packed_device)
222
223    call accel_release_buffer(this%ff_device)
224
225    POP_SUB(batch_deallocate_packed_device)
226  end subroutine batch_deallocate_packed_device
227
228  !--------------------------------------------------------------
229  subroutine batch_allocate_unpacked_host(this)
230    class(batch_t),  intent(inout) :: this
231
232    PUSH_SUB(batch_allocate_unpacked_host)
233
234    if(this%type() == TYPE_FLOAT) then
235      call this%dallocate_unpacked_host()
236    else if(this%type() == TYPE_CMPLX) then
237      call this%zallocate_unpacked_host()
238    end if
239
240    POP_SUB(batch_allocate_unpacked_host)
241  end subroutine batch_allocate_unpacked_host
242
243  !--------------------------------------------------------------
244  subroutine batch_allocate_packed_host(this)
245    class(batch_t),  intent(inout) :: this
246
247    PUSH_SUB(batch_allocate_packed_host)
248
249    if(this%type() == TYPE_FLOAT) then
250      call this%dallocate_packed_host()
251    else if(this%type() == TYPE_CMPLX) then
252      call this%zallocate_packed_host()
253    end if
254
255    POP_SUB(batch_allocate_packed_host)
256  end subroutine batch_allocate_packed_host
257
258  !--------------------------------------------------------------
259  subroutine batch_allocate_packed_device(this)
260    class(batch_t),  intent(inout) :: this
261
262    PUSH_SUB(batch_allocate_packed_device)
263
264    call accel_create_buffer(this%ff_device, ACCEL_MEM_READ_WRITE, this%type(), product(this%pack_size))
265
266    POP_SUB(batch_allocate_packed_device)
267  end subroutine batch_allocate_packed_device
268
269  !--------------------------------------------------------------
270  subroutine batch_init_empty (this, dim, nst, np)
271    type(batch_t), intent(out)   :: this
272    integer,       intent(in)    :: dim
273    integer,       intent(in)    :: nst
274    integer,       intent(in)    :: np
275
276    PUSH_SUB(batch_init_empty)
277
278    this%is_allocated = .false.
279    this%own_memory = .false.
280    this%special_memory = .false.
281    this%needs_finish_unpack = .false.
282    this%nst = nst
283    this%dim = dim
284    this%type_of = TYPE_NONE
285
286    this%nst_linear = nst*dim
287
288    this%np = np
289    this%device_buffer_count = 0
290    this%host_buffer_count = 0
291    this%status_of = BATCH_NOT_PACKED
292    this%status_host = BATCH_NOT_PACKED
293
294    this%ndims = 2
295    SAFE_ALLOCATE(this%ist_idim_index(1:this%nst_linear, 1:this%ndims))
296    SAFE_ALLOCATE(this%ist(1:this%nst))
297
298    nullify(this%dff, this%zff, this%dff_linear, this%zff_linear)
299    nullify(this%dff_pack, this%zff_pack)
300
301    POP_SUB(batch_init_empty)
302  end subroutine batch_init_empty
303
304  !--------------------------------------------------------------
305
306  subroutine batch_clone_to(this, dest, pack, copy_data)
307    class(batch_t),              intent(in)    :: this
308    class(batch_t), allocatable, intent(out)   :: dest
309    logical,        optional,    intent(in)    :: pack       !< If .false. the new batch will not be packed. Default: batch_is_packed(this)
310    logical,        optional,    intent(in)    :: copy_data  !< If .true. the batch data will be copied to the destination batch. Default: .false.
311
312    PUSH_SUB(batch_clone_to)
313
314    if (.not. allocated(dest)) then
315      SAFE_ALLOCATE_TYPE(batch_t, dest)
316    else
317      message(1) = "Internal error: destination batch in batch_clone_to has been previously allocated."
318      call messages_fatal(1)
319    end if
320
321    call this%copy_to(dest, pack, copy_data)
322
323    POP_SUB(batch_clone_to)
324  end subroutine batch_clone_to
325
326  !--------------------------------------------------------------
327
328  subroutine batch_clone_to_array(this, dest, n_batches, pack, copy_data)
329    class(batch_t),              intent(in)    :: this
330    class(batch_t), allocatable, intent(out)   :: dest(:)
331    integer,                     intent(in)    :: n_batches
332    logical,        optional,    intent(in)    :: pack       !< If .false. the new batch will not be packed. Default: batch_is_packed(this)
333    logical,        optional,    intent(in)    :: copy_data  !< If .true. the batch data will be copied to the destination batch. Default: .false.
334
335    integer :: ib
336
337    PUSH_SUB(batch_clone_to_array)
338
339    if (.not. allocated(dest)) then
340      SAFE_ALLOCATE_TYPE_ARRAY(batch_t, dest, (1:n_batches))
341    else
342      message(1) = "Internal error: destination batch in batch_clone_to_array has been previously allocated."
343      call messages_fatal(1)
344    end if
345
346    do ib = 1, n_batches
347      call this%copy_to(dest(ib), pack, copy_data)
348    end do
349
350    POP_SUB(batch_clone_to_array)
351  end subroutine batch_clone_to_array
352
353  !--------------------------------------------------------------
354
355  subroutine batch_copy_to(this, dest, pack, copy_data)
356    class(batch_t),          intent(in)    :: this
357    class(batch_t),          intent(out)   :: dest
358    logical,       optional, intent(in)    :: pack       !< If .false. the new batch will not be packed. Default: batch_is_packed(this)
359    logical,       optional, intent(in)    :: copy_data  !< If .true. the batch data will be copied to the destination batch. Default: .false.
360
361    logical :: host_packed
362
363    PUSH_SUB(batch_copy_to)
364
365    host_packed = this%host_buffer_count > 0
366    if(this%type() == TYPE_FLOAT) then
367      call dbatch_init(dest, this%dim, 1, this%nst, this%np, packed=host_packed)
368    else if(this%type() == TYPE_CMPLX) then
369      call zbatch_init(dest, this%dim, 1, this%nst, this%np, packed=host_packed)
370    else
371      message(1) = "Internal error: unknown batch type in batch_copy_to."
372      call messages_fatal(1)
373    end if
374
375    if(this%status() /= dest%status() .and. optional_default(pack, this%is_packed())) call dest%do_pack(copy = .false.)
376
377    dest%ist_idim_index(1:this%nst_linear, 1:this%ndims) = this%ist_idim_index(1:this%nst_linear, 1:this%ndims)
378    dest%ist(1:this%nst) = this%ist(1:this%nst)
379
380    if(optional_default(copy_data, .false.)) call this%copy_data_to(this%np, dest)
381
382    POP_SUB(batch_copy_to)
383  end subroutine batch_copy_to
384
385  ! ----------------------------------------------------
386  !> THREADSAFE
387  type(type_t) pure function batch_type(this) result(btype)
388    class(batch_t),      intent(in)    :: this
389
390    btype = this%type_of
391
392  end function batch_type
393
394  ! ----------------------------------------------------
395  !> For debuging purpose only
396  integer pure function batch_type_as_integer(this) result(itype)
397    class(batch_t),      intent(in)    :: this
398
399    type(type_t) :: btype
400
401    itype = 0
402    btype = this%type()
403    if( btype == TYPE_FLOAT ) itype = 1
404    if( btype == TYPE_CMPLX ) itype = 2
405
406  end function batch_type_as_integer
407
408  ! ----------------------------------------------------
409  !> THREADSAFE
410  integer pure function batch_status(this) result(bstatus)
411    class(batch_t),      intent(in)    :: this
412
413    bstatus = this%status_of
414  end function batch_status
415
416  ! ----------------------------------------------------
417
418  logical pure function batch_is_packed(this) result(in_buffer)
419    class(batch_t),      intent(in)    :: this
420
421    in_buffer = (this%device_buffer_count > 0) .or. (this%host_buffer_count > 0)
422  end function batch_is_packed
423
424  ! ----------------------------------------------------
425
426  integer function batch_pack_total_size(this) result(size)
427    class(batch_t),      intent(inout) :: this
428
429    size = this%np
430    if(accel_is_enabled()) size = accel_padded_size(size)
431    size = size*pad_pow2(this%nst_linear)*types_get_size(this%type())
432
433  end function batch_pack_total_size
434
435  ! ----------------------------------------------------
436
437  subroutine batch_do_pack(this, copy, async)
438    class(batch_t),      intent(inout) :: this
439    logical,   optional, intent(in)    :: copy
440    logical,   optional, intent(in)    :: async
441
442    logical               :: copy_
443    logical               :: async_
444    type(profile_t), save :: prof
445    integer               :: source, target
446
447    ! no push_sub, called too frequently
448
449    call profiling_in(prof, "BATCH_DO_PACK")
450
451    copy_ = optional_default(copy, .true.)
452
453    async_ = optional_default(async, .false.)
454
455    ! get source and target states for this batch
456    source = this%status()
457    select case(source)
458    case(BATCH_NOT_PACKED, BATCH_PACKED)
459      if(accel_is_enabled()) then
460        target = BATCH_DEVICE_PACKED
461      else
462        target = BATCH_PACKED
463      end if
464    case(BATCH_DEVICE_PACKED)
465      target = BATCH_DEVICE_PACKED
466    end select
467
468    ! only do something if target is different from source
469    if(source /= target) then
470      select case(target)
471      case(BATCH_DEVICE_PACKED)
472        call this%allocate_packed_device()
473        this%status_of = BATCH_DEVICE_PACKED
474
475        if(copy_) then
476          select case(source)
477          case(BATCH_NOT_PACKED)
478            ! copy from unpacked host array to device
479            call batch_write_unpacked_to_device(this)
480          case(BATCH_PACKED)
481            ! copy from packed host array to device
482            call batch_write_packed_to_device(this, async_)
483          end select
484        end if
485      case(BATCH_PACKED)
486        call this%allocate_packed_host()
487        this%status_of = BATCH_PACKED
488        this%status_host = BATCH_PACKED
489
490        if(copy_) then
491          if(this%type() == TYPE_FLOAT) then
492            call dbatch_pack_copy(this)
493          else if(this%type() == TYPE_CMPLX) then
494            call zbatch_pack_copy(this)
495          end if
496        end if
497        if(this%own_memory) call this%deallocate_unpacked_host()
498      end select
499    end if
500
501    select case(target)
502    case(BATCH_DEVICE_PACKED)
503      INCR(this%device_buffer_count, 1)
504    case(BATCH_PACKED)
505      INCR(this%host_buffer_count, 1)
506    end select
507
508    call profiling_out(prof)
509  end subroutine batch_do_pack
510
511  ! ----------------------------------------------------
512
513  subroutine batch_do_unpack(this, copy, force, async)
514    class(batch_t),     intent(inout) :: this
515    logical, optional,  intent(in)    :: copy
516    logical, optional,  intent(in)    :: force  !< if force = .true., unpack independently of the counter
517    logical, optional,  intent(in)    :: async
518
519    logical :: copy_, force_, async_
520    type(profile_t), save :: prof
521    integer               :: source, target
522
523    PUSH_SUB(batch_do_unpack)
524
525    call profiling_in(prof, "BATCH_DO_UNPACK")
526
527    copy_ = optional_default(copy, .true.)
528
529    force_ = optional_default(force, .false.)
530
531    async_ = optional_default(async, .false.)
532
533    ! get source and target states for this batch
534    source = this%status()
535    select case(source)
536    case(BATCH_NOT_PACKED)
537      target = source
538    case(BATCH_PACKED)
539      target = BATCH_NOT_PACKED
540    case(BATCH_DEVICE_PACKED)
541      target = this%status_host
542    end select
543
544    ! only do something if target is different from source
545    if(source /= target) then
546      select case(source)
547      case(BATCH_PACKED)
548        if(this%host_buffer_count == 1 .or. force_) then
549          if(this%own_memory) call this%allocate_unpacked_host()
550          ! unpack from packed_host to unpacked_host
551          if(copy_ .or. this%own_memory) then
552            if(this%type() == TYPE_FLOAT) then
553              call dbatch_unpack_copy(this)
554            else if(this%type() == TYPE_CMPLX) then
555              call zbatch_unpack_copy(this)
556            end if
557          end if
558          call this%deallocate_packed_host()
559          this%status_host = target
560          this%status_of = target
561          this%host_buffer_count = 1
562        end if
563        INCR(this%host_buffer_count, -1)
564      case(BATCH_DEVICE_PACKED)
565        if(this%device_buffer_count == 1 .or. force_) then
566          if(copy_) then
567            select case(target)
568            ! unpack from packed_device to unpacked_host
569            case(BATCH_NOT_PACKED)
570              call batch_read_device_to_unpacked(this)
571            ! unpack from packed_device to packed_host
572            case(BATCH_PACKED)
573              call batch_read_device_to_packed(this, async_)
574            end select
575          end if
576          if(async_) then
577            this%needs_finish_unpack = .true.
578          else
579            call this%deallocate_packed_device()
580          end if
581          this%status_of = target
582          this%device_buffer_count = 1
583        end if
584        INCR(this%device_buffer_count, -1)
585      end select
586    end if
587
588    call profiling_out(prof)
589
590    POP_SUB(batch_do_unpack)
591  end subroutine batch_do_unpack
592
593  ! ----------------------------------------------------
594  subroutine batch_finish_unpack(this)
595    class(batch_t),      intent(inout)  :: this
596
597    PUSH_SUB(batch_finish_unpack)
598    if(this%needs_finish_unpack) then
599      call accel_finish()
600      call this%deallocate_packed_device()
601      this%needs_finish_unpack = .false.
602    end if
603    POP_SUB(batch_finish_unpack)
604  end subroutine batch_finish_unpack
605
606  ! ----------------------------------------------------
607
608  subroutine batch_write_unpacked_to_device(this)
609    class(batch_t),      intent(inout)  :: this
610
611    integer :: ist, ist2, unroll
612    type(accel_mem_t) :: tmp
613    type(profile_t), save :: prof, prof_pack
614    type(accel_kernel_t), pointer :: kernel
615
616    PUSH_SUB(batch_write_unpacked_to_device)
617
618    call profiling_in(prof, "BATCH_PACK_COPY_CL")
619    if(this%nst_linear == 1) then
620      ! we can copy directly
621      if(this%type() == TYPE_FLOAT) then
622        call accel_write_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1))
623      else if(this%type() == TYPE_CMPLX) then
624        call accel_write_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1))
625      else
626        ASSERT(.false.)
627      end if
628
629    else
630      ! we copy to a temporary array and then we re-arrange data
631
632      if(this%type() == TYPE_FLOAT) then
633        kernel => dpack
634      else
635        kernel => zpack
636      end if
637
638      unroll = min(CL_PACK_MAX_BUFFER_SIZE, this%pack_size(1))
639
640      call accel_create_buffer(tmp, ACCEL_MEM_READ_ONLY, this%type(), unroll*this%pack_size(2))
641
642      do ist = 1, this%nst_linear, unroll
643
644        ! copy a number 'unroll' of states to the buffer
645        do ist2 = ist, min(ist + unroll - 1, this%nst_linear)
646
647          if(this%type() == TYPE_FLOAT) then
648            call accel_write_buffer(tmp, ubound(this%dff_linear, dim=1), this%dff_linear(:, ist2), &
649              offset = (ist2 - ist)*this%pack_size(2))
650          else
651            call accel_write_buffer(tmp, ubound(this%zff_linear, dim=1), this%zff_linear(:, ist2), &
652              offset = (ist2 - ist)*this%pack_size(2))
653          end if
654        end do
655
656        ! now call an opencl kernel to rearrange the data
657        call accel_set_kernel_arg(kernel, 0, this%pack_size(1))
658        call accel_set_kernel_arg(kernel, 1, this%pack_size(2))
659        call accel_set_kernel_arg(kernel, 2, ist - 1)
660        call accel_set_kernel_arg(kernel, 3, tmp)
661        call accel_set_kernel_arg(kernel, 4, this%ff_device)
662
663        call profiling_in(prof_pack, "CL_PACK")
664        call accel_kernel_run(kernel, (/this%pack_size(2), unroll/), (/accel_max_workgroup_size()/unroll, unroll/))
665
666        if(this%type() == TYPE_FLOAT) then
667          call profiling_count_transfers(unroll*this%pack_size(2), M_ONE)
668        else
669          call profiling_count_transfers(unroll*this%pack_size(2), M_ZI)
670        end if
671
672        call accel_finish()
673        call profiling_out(prof_pack)
674
675      end do
676
677      call accel_release_buffer(tmp)
678
679    end if
680
681    call profiling_out(prof)
682    POP_SUB(batch_write_unpacked_to_device)
683  end subroutine batch_write_unpacked_to_device
684
685  ! ------------------------------------------------------------------
686
687  subroutine batch_read_device_to_unpacked(this)
688    class(batch_t),      intent(inout) :: this
689
690    integer :: ist, ist2, unroll
691    type(accel_mem_t) :: tmp
692    type(accel_kernel_t), pointer :: kernel
693    type(profile_t), save :: prof, prof_unpack
694
695    PUSH_SUB(batch_read_device_to_unpacked)
696    call profiling_in(prof, "BATCH_UNPACK_COPY_CL")
697
698    if(this%nst_linear == 1) then
699      ! we can copy directly
700      if(this%type() == TYPE_FLOAT) then
701        call accel_read_buffer(this%ff_device, ubound(this%dff_linear, dim=1), this%dff_linear(:, 1))
702      else
703        call accel_read_buffer(this%ff_device, ubound(this%zff_linear, dim=1), this%zff_linear(:, 1))
704      end if
705    else
706
707      unroll = min(CL_PACK_MAX_BUFFER_SIZE, this%pack_size(1))
708
709      ! we use a kernel to move to a temporary array and then we read
710      call accel_create_buffer(tmp, ACCEL_MEM_WRITE_ONLY, this%type(), unroll*this%pack_size(2))
711
712      if(this%type() == TYPE_FLOAT) then
713        kernel => dunpack
714      else
715        kernel => zunpack
716      end if
717
718      do ist = 1, this%nst_linear, unroll
719        call accel_set_kernel_arg(kernel, 0, this%pack_size(1))
720        call accel_set_kernel_arg(kernel, 1, this%pack_size(2))
721        call accel_set_kernel_arg(kernel, 2, ist - 1)
722        call accel_set_kernel_arg(kernel, 3, this%ff_device)
723        call accel_set_kernel_arg(kernel, 4, tmp)
724
725        call profiling_in(prof_unpack, "CL_UNPACK")
726        call accel_kernel_run(kernel, (/unroll, this%pack_size(2)/), (/unroll, accel_max_workgroup_size()/unroll/))
727
728        if(this%type() == TYPE_FLOAT) then
729          call profiling_count_transfers(unroll*this%pack_size(2), M_ONE)
730        else
731          call profiling_count_transfers(unroll*this%pack_size(2), M_ZI)
732        end if
733
734        call accel_finish()
735        call profiling_out(prof_unpack)
736
737        ! copy a number 'unroll' of states from the buffer
738        do ist2 = ist, min(ist + unroll - 1, this%nst_linear)
739
740          if(this%type() == TYPE_FLOAT) then
741            call accel_read_buffer(tmp, ubound(this%dff_linear, dim=1), this%dff_linear(:, ist2), &
742              offset = (ist2 - ist)*this%pack_size(2))
743          else
744            call accel_read_buffer(tmp, ubound(this%zff_linear, dim=1), this%zff_linear(:, ist2), &
745              offset = (ist2 - ist)*this%pack_size(2))
746          end if
747        end do
748
749      end do
750
751      call accel_release_buffer(tmp)
752    end if
753
754    call profiling_out(prof)
755    POP_SUB(batch_read_device_to_unpacked)
756  end subroutine batch_read_device_to_unpacked
757
758  ! ------------------------------------------------------------------
759  subroutine batch_write_packed_to_device(this, async)
760    class(batch_t),      intent(inout)  :: this
761    logical,   optional, intent(in)     :: async
762
763    type(profile_t), save :: prof_pack
764
765    PUSH_SUB(batch_write_packed_to_device)
766
767    call profiling_in(prof_pack, "BATCH_PACK_COPY_CL")
768    if(this%type() == TYPE_FLOAT) then
769      call accel_write_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async)
770    else
771      call accel_write_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async)
772    end if
773    call profiling_out(prof_pack)
774
775    POP_SUB(batch_write_packed_to_device)
776  end subroutine batch_write_packed_to_device
777
778  ! ------------------------------------------------------------------
779  subroutine batch_read_device_to_packed(this, async)
780    class(batch_t),      intent(inout) :: this
781    logical,   optional, intent(in)    :: async
782
783    type(profile_t), save :: prof_unpack
784
785    PUSH_SUB(batch_read_device_to_packed)
786
787    call profiling_in(prof_unpack, "BATCH_UNPACK_COPY_CL")
788    if(this%type() == TYPE_FLOAT) then
789      call accel_read_buffer(this%ff_device, product(this%pack_size), this%dff_pack, async=async)
790    else
791      call accel_read_buffer(this%ff_device, product(this%pack_size), this%zff_pack, async=async)
792    end if
793    call profiling_out(prof_unpack)
794
795    POP_SUB(batch_read_device_to_packed)
796  end subroutine batch_read_device_to_packed
797
798! ------------------------------------------------------
799integer function batch_inv_index(this, cind) result(index)
800  class(batch_t),     intent(in)    :: this
801  integer,            intent(in)    :: cind(:)
802
803  do index = 1, this%nst_linear
804    if(all(cind(1:this%ndims) == this%ist_idim_index(index, 1:this%ndims))) exit
805  end do
806
807  ASSERT(index <= this%nst_linear)
808
809end function batch_inv_index
810
811! ------------------------------------------------------
812
813integer pure function batch_ist_idim_to_linear(this, cind) result(index)
814  class(batch_t),     intent(in)    :: this
815  integer,            intent(in)    :: cind(:)
816
817  if(ubound(cind, dim = 1) == 1) then
818    index = cind(1)
819  else
820    index = (cind(1) - 1)*this%dim + cind(2)
821  end if
822
823end function batch_ist_idim_to_linear
824
825! ------------------------------------------------------
826
827integer pure function batch_linear_to_ist(this, linear_index) result(ist)
828  class(batch_t),     intent(in)    :: this
829  integer,            intent(in)    :: linear_index
830
831  ist = this%ist_idim_index(linear_index, 1)
832
833end function batch_linear_to_ist
834
835! ------------------------------------------------------
836
837integer pure function batch_linear_to_idim(this, linear_index) result(idim)
838  class(batch_t),     intent(in)    :: this
839  integer,            intent(in)    :: linear_index
840
841  idim = this%ist_idim_index(linear_index, 2)
842
843end function batch_linear_to_idim
844
845! ------------------------------------------------------
846
847subroutine batch_remote_access_start(this, mpi_grp, rma_win)
848  class(batch_t),  intent(inout) :: this
849  type(mpi_grp_t), intent(in)    :: mpi_grp
850  integer,         intent(out)   :: rma_win
851
852  PUSH_SUB(batch_remote_access_start)
853
854  ASSERT(.not. accel_is_enabled())
855
856  if(mpi_grp%size > 1) then
857    call this%do_pack()
858
859    if(this%type() == TYPE_CMPLX) then
860#ifdef HAVE_MPI2
861      call MPI_Win_create(this%zff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), MPI_ADDRESS_KIND), &
862        types_get_size(this%type()), MPI_INFO_NULL, mpi_grp%comm, rma_win, mpi_err)
863#endif
864    else if (this%type() == TYPE_FLOAT) then
865#ifdef HAVE_MPI2
866      call MPI_Win_create(this%dff_pack(1, 1), int(product(this%pack_size)*types_get_size(this%type()), MPI_ADDRESS_KIND), &
867        types_get_size(this%type()), MPI_INFO_NULL, mpi_grp%comm, rma_win, mpi_err)
868#endif
869    else
870      message(1) = "Internal error: unknown batch type in batch_remote_access_start."
871      call messages_fatal(1)
872    end if
873
874  else
875    rma_win = -1
876  end if
877
878  POP_SUB(batch_remote_access_start)
879end subroutine batch_remote_access_start
880
881! ------------------------------------------------------
882
883subroutine batch_remote_access_stop(this, rma_win)
884  class(batch_t),     intent(inout) :: this
885  integer,            intent(inout) :: rma_win
886
887  PUSH_SUB(batch_remote_access_stop)
888
889  if(rma_win /= -1) then
890#ifdef HAVE_MPI2
891    call MPI_Win_free(rma_win, mpi_err)
892#endif
893    call this%do_unpack()
894  end if
895
896  POP_SUB(batch_remote_access_stop)
897end subroutine batch_remote_access_stop
898
899! --------------------------------------------------------------
900
901subroutine batch_copy_data_to(this, np, dest)
902  class(batch_t),    intent(in)    :: this
903  integer,           intent(in)    :: np
904  class(batch_t),    intent(inout) :: dest
905
906  integer :: ist, dim2, dim3
907  type(profile_t), save :: prof
908  integer :: localsize
909
910  PUSH_SUB(batch_copy_data_to)
911  call profiling_in(prof, "BATCH_COPY_DATA_TO")
912
913  call this%check_compatibility_with(dest)
914
915  select case(this%status())
916  case(BATCH_DEVICE_PACKED)
917    call accel_set_kernel_arg(kernel_copy, 0, np)
918    call accel_set_kernel_arg(kernel_copy, 1, this%ff_device)
919    call accel_set_kernel_arg(kernel_copy, 2, log2(this%pack_size_real(1)))
920    call accel_set_kernel_arg(kernel_copy, 3, dest%ff_device)
921    call accel_set_kernel_arg(kernel_copy, 4, log2(dest%pack_size_real(1)))
922
923    localsize = accel_kernel_workgroup_size(kernel_copy)/dest%pack_size_real(1)
924
925    dim3 = np/(accel_max_size_per_dim(2)*localsize) + 1
926    dim2 = min(accel_max_size_per_dim(2)*localsize, pad(np, localsize))
927
928    call accel_kernel_run(kernel_copy, (/dest%pack_size_real(1), dim2, dim3/), (/dest%pack_size_real(1), localsize, 1/))
929
930    call accel_finish()
931
932  case(BATCH_PACKED)
933    if(dest%type() == TYPE_FLOAT) then
934      call blas_copy(np*this%pack_size(1), this%dff_pack(1, 1), 1, dest%dff_pack(1, 1), 1)
935    else
936      call blas_copy(np*this%pack_size(1), this%zff_pack(1, 1), 1, dest%zff_pack(1, 1), 1)
937    end if
938
939  case(BATCH_NOT_PACKED)
940    !$omp parallel do private(ist)
941    do ist = 1, dest%nst_linear
942      if(dest%type() == TYPE_CMPLX) then
943        call blas_copy(np, this%zff_linear(1, ist), 1, dest%zff_linear(1, ist), 1)
944      else
945        call blas_copy(np, this%dff_linear(1, ist), 1, dest%dff_linear(1, ist), 1)
946      end if
947    end do
948
949  end select
950
951  call profiling_out(prof)
952  POP_SUB(batch_copy_data_to)
953end subroutine batch_copy_data_to
954
955! --------------------------------------------------------------
956
957subroutine batch_check_compatibility_with(this, target, only_check_dim)
958  class(batch_t),    intent(in) :: this
959  class(batch_t),    intent(in) :: target
960  logical, optional, intent(in) :: only_check_dim
961
962  PUSH_SUB(batch_check_compatibility_with)
963
964  ASSERT(this%type() == target%type())
965  if(.not. optional_default(only_check_dim, .false.)) then
966    ASSERT(this%nst_linear == target%nst_linear)
967  end if
968  ASSERT(this%status() == target%status())
969  ASSERT(this%dim == target%dim)
970
971  POP_SUB(batch_check_compatibility_with)
972
973end subroutine batch_check_compatibility_with
974
975!--------------------------------------------------------------
976subroutine batch_build_indices(this, st_start, st_end)
977  class(batch_t), intent(inout) :: this
978  integer,        intent(in)    :: st_start
979  integer,        intent(in)    :: st_end
980
981  integer :: idim, ii, ist
982
983  PUSH_SUB(batch_build_indices)
984
985  do ist = st_start, st_end
986    ! now we also populate the linear array
987    do idim = 1, this%dim
988      ii = this%dim*(ist - st_start) + idim
989      this%ist_idim_index(ii, 1) = ist
990      this%ist_idim_index(ii, 2) = idim
991    end do
992    this%ist(ist - st_start + 1) = ist
993  end do
994
995  ! compute packed sizes
996  this%pack_size(1) = pad_pow2(this%nst_linear)
997  this%pack_size(2) = this%np
998  if(accel_is_enabled()) this%pack_size(2) = accel_padded_size(this%pack_size(2))
999
1000  this%pack_size_real = this%pack_size
1001  if(type_is_complex(this%type())) this%pack_size_real(1) = 2*this%pack_size_real(1)
1002
1003  POP_SUB(batch_build_indices)
1004end subroutine batch_build_indices
1005
1006
1007#include "real.F90"
1008#include "batch_inc.F90"
1009#include "undef.F90"
1010
1011#include "complex.F90"
1012#include "batch_inc.F90"
1013#include "undef.F90"
1014
1015end module batch_oct_m
1016
1017!! Local Variables:
1018!! mode: f90
1019!! coding: utf-8
1020!! End:
1021