1!
2! Copyright (C) 2002-2013 Quantum ESPRESSO group
3! This file is distributed under the terms of the
4! GNU General Public License. See the file `License'
5! in the root directory of the present distribution,
6! or http://www.gnu.org/copyleft/gpl.txt .
7!
8! This module contains interfaces to most low-level MPI operations:
9! initialization and stopping, broadcast, parallel sum, etc.
10!
11!------------------------------------------------------------------------------!
12MODULE mp
13!------------------------------------------------------------------------------!
14  USE util_param,     ONLY : DP, stdout, i8b
15  USE parallel_include
16#if defined(__CUDA)
17  USE cudafor,        ONLY : cudamemcpy, cudamemcpy2d, &
18                            & cudaMemcpyDeviceToDevice, &
19                            & cudaDeviceSynchronize
20#endif
21  !
22  IMPLICIT NONE
23  PRIVATE
24  !
25  PUBLIC :: mp_start, mp_abort, mp_stop, mp_end, &
26    mp_bcast, mp_sum, mp_max, mp_min, mp_rank, mp_size, &
27    mp_gather, mp_alltoall, mp_get, mp_put, &
28    mp_barrier, mp_report, mp_group_free, &
29    mp_root_sum, mp_comm_free, mp_comm_create, mp_comm_group, &
30    mp_group_create, mp_comm_split, mp_set_displs, &
31    mp_circular_shift_left, mp_circular_shift_left_start, &
32    mp_get_comm_null, mp_get_comm_self, mp_count_nodes, &
33    mp_type_create_column_section, mp_type_create_row_section, mp_type_free, &
34    mp_allgather, mp_waitall, mp_testall
35  !
36  INTERFACE mp_bcast
37    MODULE PROCEDURE mp_bcast_i1, mp_bcast_r1, mp_bcast_c1, &
38      mp_bcast_z, mp_bcast_zv, &
39      mp_bcast_iv, mp_bcast_i8v, mp_bcast_rv, mp_bcast_cv, mp_bcast_l, mp_bcast_rm, &
40      mp_bcast_cm, mp_bcast_im, mp_bcast_it, mp_bcast_i4d, mp_bcast_rt, mp_bcast_lv, &
41      mp_bcast_lm, mp_bcast_r4d, mp_bcast_r5d, mp_bcast_ct,  mp_bcast_c4d,&
42      mp_bcast_c5d, mp_bcast_c6d
43#if defined(__CUDA)
44    MODULE PROCEDURE mp_bcast_i1_gpu, mp_bcast_r1_gpu, mp_bcast_c1_gpu, &
45      !mp_bcast_z_gpu, mp_bcast_zv_gpu, &
46      mp_bcast_iv_gpu, mp_bcast_rv_gpu, mp_bcast_cv_gpu, mp_bcast_l_gpu, mp_bcast_rm_gpu, &
47      mp_bcast_cm_gpu, mp_bcast_im_gpu, mp_bcast_it_gpu, mp_bcast_i4d_gpu, mp_bcast_rt_gpu, mp_bcast_lv_gpu, &
48      mp_bcast_lm_gpu, mp_bcast_r4d_gpu, mp_bcast_r5d_gpu, mp_bcast_ct_gpu,  mp_bcast_c4d_gpu,&
49      mp_bcast_c5d_gpu, mp_bcast_c6d_gpu
50#endif
51  END INTERFACE
52  !
53  INTERFACE mp_sum
54    MODULE PROCEDURE mp_sum_i1, mp_sum_iv, mp_sum_i8v, mp_sum_im, mp_sum_it, mp_sum_i4, mp_sum_i5, &
55      mp_sum_r1, mp_sum_rv, mp_sum_rm, mp_sum_rt, mp_sum_r4d, &
56      mp_sum_c1, mp_sum_cv, mp_sum_cm, mp_sum_ct, mp_sum_c4d, &
57      mp_sum_c5d, mp_sum_c6d, mp_sum_rmm, mp_sum_cmm, mp_sum_r5d, &
58      mp_sum_r6d
59#if defined(__CUDA)
60    MODULE PROCEDURE  mp_sum_i1_gpu, mp_sum_iv_gpu, mp_sum_im_gpu, mp_sum_it_gpu, &
61      mp_sum_r1_gpu, mp_sum_rv_gpu, mp_sum_rm_gpu, mp_sum_rt_gpu, mp_sum_r4d_gpu, &
62      mp_sum_c1_gpu, mp_sum_cv_gpu, mp_sum_cm_gpu, mp_sum_ct_gpu, mp_sum_c4d_gpu, &
63      mp_sum_c5d_gpu, mp_sum_c6d_gpu, mp_sum_rmm_gpu, mp_sum_cmm_gpu, mp_sum_r5d_gpu, &
64      mp_sum_r6d_gpu
65#endif
66  END INTERFACE
67  !
68  INTERFACE mp_root_sum
69    MODULE PROCEDURE mp_root_sum_rm, mp_root_sum_cm
70#if defined(__CUDA)
71    MODULE PROCEDURE mp_root_sum_rm_gpu, mp_root_sum_cm_gpu
72#endif
73  END INTERFACE
74  !
75  INTERFACE mp_get
76    MODULE PROCEDURE mp_get_r1, mp_get_rv, mp_get_cv, mp_get_i1, mp_get_iv, mp_get_rm, mp_get_cm
77#if defined(__CUDA)
78    MODULE PROCEDURE mp_get_r1_gpu, mp_get_rv_gpu, mp_get_cv_gpu, mp_get_i1_gpu, mp_get_iv_gpu, &
79      mp_get_rm_gpu, mp_get_cm_gpu
80#endif
81   END INTERFACE
82   !
83   INTERFACE mp_put
84     MODULE PROCEDURE mp_put_rv, mp_put_cv, mp_put_i1, mp_put_iv, &
85       mp_put_rm
86#if defined(__CUDA)
87   MODULE PROCEDURE mp_put_rv_gpu, mp_put_cv_gpu, mp_put_i1_gpu, mp_put_iv_gpu, &
88     mp_put_rm_gpu
89#endif
90   END INTERFACE
91   !
92   INTERFACE mp_max
93     MODULE PROCEDURE mp_max_i, mp_max_r, mp_max_rv, mp_max_iv
94#if defined(__CUDA)
95     MODULE PROCEDURE mp_max_i_gpu, mp_max_r_gpu, mp_max_rv_gpu, mp_max_iv_gpu
96#endif
97   END INTERFACE
98   !
99   INTERFACE mp_min
100     MODULE PROCEDURE mp_min_i, mp_min_r, mp_min_rv, mp_min_iv
101#if defined(__CUDA)
102     MODULE PROCEDURE mp_min_i_gpu, mp_min_r_gpu, mp_min_rv_gpu, mp_min_iv_gpu
103#endif
104   END INTERFACE
105   !
106   INTERFACE mp_gather
107     MODULE PROCEDURE mp_gather_i1, mp_gather_iv, mp_gatherv_rv, mp_gatherv_iv, &
108                      mp_gatherv_rm, mp_gatherv_im, mp_gatherv_cv, &
109                      mp_gatherv_inplace_cplx_array
110#if defined(__CUDA)
111     MODULE PROCEDURE mp_gather_i1_gpu, mp_gather_iv_gpu, mp_gatherv_rv_gpu, mp_gatherv_iv_gpu, &
112       mp_gatherv_rm_gpu, mp_gatherv_im_gpu, mp_gatherv_cv_gpu, mp_gatherv_inplace_cplx_array_gpu
113#endif
114   END INTERFACE
115   !
116   INTERFACE mp_allgather
117     MODULE PROCEDURE mp_allgatherv_inplace_cplx_array
118     MODULE PROCEDURE mp_allgatherv_inplace_real_array
119#if defined(__CUDA)
120     MODULE PROCEDURE mp_allgatherv_inplace_cplx_array_gpu
121#endif
122   END INTERFACE
123   !
124   INTERFACE mp_alltoall
125     MODULE PROCEDURE mp_alltoall_c3d, mp_alltoall_i3d
126#if defined(__CUDA)
127     MODULE PROCEDURE mp_alltoall_c3d_gpu, mp_alltoall_i3d_gpu
128#endif
129   END INTERFACE
130   !
131   INTERFACE mp_circular_shift_left
132     MODULE PROCEDURE mp_circular_shift_left_i0, &
133       mp_circular_shift_left_i1, &
134       mp_circular_shift_left_i2, &
135       mp_circular_shift_left_r2d, &
136       mp_circular_shift_left_c2d
137#if defined(__CUDA)
138     MODULE PROCEDURE mp_circular_shift_left_i0_gpu, &
139       mp_circular_shift_left_i1_gpu, &
140       mp_circular_shift_left_i2_gpu, &
141       mp_circular_shift_left_r2d_gpu, &
142       mp_circular_shift_left_c2d_gpu
143#endif
144   END INTERFACE
145   !
146   INTERFACE mp_circular_shift_left_start
147     MODULE PROCEDURE mp_circular_shift_left_start_i0, &
148       mp_circular_shift_left_start_i1, &
149       mp_circular_shift_left_start_i2, &
150       mp_circular_shift_left_start_r2d, &
151       mp_circular_shift_left_start_c2d
152   END INTERFACE
153   !
154   INTERFACE mp_type_create_column_section
155     MODULE PROCEDURE mp_type_create_cplx_column_section
156     MODULE PROCEDURE mp_type_create_real_column_section
157#if defined(__CUDA)
158     MODULE PROCEDURE mp_type_create_cplx_column_section_gpu
159     MODULE PROCEDURE mp_type_create_real_column_section_gpu
160#endif
161   END INTERFACE
162
163   INTERFACE mp_type_create_row_section
164     MODULE PROCEDURE mp_type_create_cplx_row_section
165     MODULE PROCEDURE mp_type_create_real_row_section
166#if defined(__CUDA)
167     MODULE PROCEDURE mp_type_create_cplx_row_section_gpu
168     MODULE PROCEDURE mp_type_create_real_row_section_gpu
169#endif
170   END INTERFACE
171!------------------------------------------------------------------------------!
172!
173   CONTAINS
174!
175!------------------------------------------------------------------------------!
176!
177!------------------------------------------------------------------------------!
178!..mp_gather_i1
179      SUBROUTINE mp_gather_i1(mydata, alldata, root, gid)
180        IMPLICIT NONE
181        INTEGER, INTENT(IN) :: mydata, root
182        INTEGER, INTENT(IN) :: gid
183        INTEGER :: group
184        INTEGER, INTENT(OUT) :: alldata(:)
185        INTEGER :: ierr
186
187
188#if defined (__MPI)
189        group = gid
190        CALL MPI_GATHER(mydata, 1, MPI_INTEGER, alldata, 1, MPI_INTEGER, root, group, IERR)
191        IF (ierr/=0) CALL mp_stop( 8013 )
192#else
193        alldata(1) = mydata
194#endif
195        RETURN
196      END SUBROUTINE mp_gather_i1
197
198!------------------------------------------------------------------------------!
199!..mp_gather_iv
200!..Carlo Cavazzoni
201      SUBROUTINE mp_gather_iv(mydata, alldata, root, gid)
202        IMPLICIT NONE
203        INTEGER, INTENT(IN) :: mydata(:), root
204        INTEGER, INTENT(IN) :: gid
205        INTEGER :: group
206        INTEGER, INTENT(OUT) :: alldata(:,:)
207        INTEGER :: msglen, ierr
208
209
210#if defined (__MPI)
211        msglen = SIZE(mydata)
212        IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 )
213        group = gid
214        CALL MPI_GATHER(mydata, msglen, MPI_INTEGER, alldata, msglen, MPI_INTEGER, root, group, IERR)
215        IF (ierr/=0) CALL mp_stop( 8014 )
216#else
217        msglen = SIZE(mydata)
218        IF( msglen .NE. SIZE(alldata, 1) ) CALL mp_stop( 8014 )
219        alldata(:,1) = mydata(:)
220#endif
221        RETURN
222      END SUBROUTINE mp_gather_iv
223
224!
225!------------------------------------------------------------------------------!
226!..mp_start
227      SUBROUTINE mp_start(numtask, taskid, group)
228
229! ...
230        IMPLICIT NONE
231        INTEGER, INTENT (OUT) :: numtask, taskid
232        INTEGER, INTENT (IN)  :: group
233        INTEGER :: ierr
234! ...
235        ierr = 0
236        numtask = 1
237        taskid = 0
238
239#  if defined(__MPI)
240        IF (ierr/=0) CALL mp_stop( 8004 )
241        CALL mpi_comm_rank(group,taskid,ierr)
242        IF (ierr/=0) CALL mp_stop( 8005 )
243        CALL mpi_comm_size(group,numtask,ierr)
244        IF (ierr/=0) CALL mp_stop( 8006 )
245! ...
246        CALL allocate_buffers()
247#if defined(__CUDA)
248        CALL allocate_buffers_gpu()
249#endif
250#endif
251        RETURN
252      END SUBROUTINE mp_start
253!
254!------------------------------------------------------------------------------!
255!..mp_abort
256
257      SUBROUTINE mp_abort(errorcode,gid)
258        IMPLICIT NONE
259        INTEGER :: ierr
260        INTEGER, INTENT(IN):: errorcode, gid
261#if defined(__MPI)
262        CALL deallocate_buffers()
263#if defined(__CUDA)
264        CALL deallocate_buffers_gpu()
265#endif
266        CALL mpi_abort(gid, errorcode, ierr)
267#endif
268      END SUBROUTINE mp_abort
269!
270!------------------------------------------------------------------------------!
271!..mp_end
272
273      SUBROUTINE mp_end(groupid, cleanup)
274        IMPLICIT NONE
275        INTEGER, INTENT(IN) :: groupid
276        LOGICAL, OPTIONAL, INTENT(IN) :: cleanup
277        INTEGER :: ierr, taskid
278        LOGICAL :: cleanup_
279
280        ierr = 0
281        taskid = 0
282
283#if defined(__MPI)
284        CALL mpi_comm_rank( groupid, taskid, ierr)
285        cleanup_ = .FALSE.
286        IF (PRESENT(cleanup)) cleanup_ = cleanup
287        IF(cleanup_) THEN
288           CALL deallocate_buffers()
289#if defined(__CUDA)
290           CALL deallocate_buffers_gpu()
291#endif
292        END IF
293#endif
294        RETURN
295      END SUBROUTINE mp_end
296
297!------------------------------------------------------------------------------!
298!..mp_group
299
300      SUBROUTINE mp_comm_group( comm, group )
301         IMPLICIT NONE
302         INTEGER, INTENT (IN) :: comm
303         INTEGER, INTENT (OUT) :: group
304         INTEGER :: ierr
305         ierr = 0
306#if defined(__MPI)
307         CALL mpi_comm_group( comm, group, ierr )
308         IF (ierr/=0) CALL mp_stop( 8007 )
309#else
310         group = 0
311#endif
312      END SUBROUTINE  mp_comm_group
313
314      SUBROUTINE mp_comm_split( old_comm, color, key, new_comm )
315         IMPLICIT NONE
316         INTEGER, INTENT (IN) :: old_comm
317         INTEGER, INTENT (IN) :: color, key
318         INTEGER, INTENT (OUT) :: new_comm
319         INTEGER :: ierr
320         ierr = 0
321#if defined(__MPI)
322         CALL MPI_COMM_SPLIT( old_comm, color, key, new_comm, ierr )
323         IF (ierr/=0) CALL mp_stop( 8008 )
324#else
325         new_comm = old_comm
326#endif
327      END SUBROUTINE  mp_comm_split
328
329
330      SUBROUTINE mp_group_create( group_list, group_size, old_grp, new_grp )
331        IMPLICIT NONE
332        INTEGER, INTENT (IN) :: group_list(:), group_size, old_grp
333        INTEGER, INTENT (OUT) :: new_grp
334        INTEGER :: ierr
335
336        ierr = 0
337        new_grp = old_grp
338#if defined(__MPI)
339        CALL mpi_group_incl( old_grp, group_size, group_list, new_grp, ierr )
340        IF (ierr/=0) CALL mp_stop( 8009 )
341#endif
342      END SUBROUTINE mp_group_create
343
344!------------------------------------------------------------------------------!
345      SUBROUTINE mp_comm_create( old_comm, new_grp, new_comm )
346        IMPLICIT NONE
347        INTEGER, INTENT (IN) :: old_comm
348        INTEGER, INTENT (IN) :: new_grp
349        INTEGER, INTENT (OUT) :: new_comm
350        INTEGER :: ierr
351
352        ierr = 0
353        new_comm = old_comm
354#if defined(__MPI)
355        CALL mpi_comm_create( old_comm, new_grp, new_comm, ierr )
356        IF (ierr/=0) CALL mp_stop( 8010 )
357#endif
358      END SUBROUTINE mp_comm_create
359
360!------------------------------------------------------------------------------!
361!..mp_group_free
362      SUBROUTINE mp_group_free( group )
363        IMPLICIT NONE
364        INTEGER, INTENT (INOUT) :: group
365        INTEGER :: ierr
366        ierr = 0
367#if defined(__MPI)
368        CALL mpi_group_free( group, ierr )
369        IF (ierr/=0) CALL mp_stop( 8011 )
370#endif
371      END SUBROUTINE mp_group_free
372!------------------------------------------------------------------------------!
373
374      SUBROUTINE mp_comm_free( comm )
375         IMPLICIT NONE
376         INTEGER, INTENT (INOUT) :: comm
377         INTEGER :: ierr
378         ierr = 0
379#if defined(__MPI)
380         IF( comm /= MPI_COMM_NULL ) THEN
381            CALL mpi_comm_free( comm, ierr )
382            IF (ierr/=0) CALL mp_stop( 8012 )
383         END IF
384#endif
385         RETURN
386      END SUBROUTINE mp_comm_free
387
388!------------------------------------------------------------------------------!
389! non-blocking helpers
390! waits till all request are completed
391      SUBROUTINE mp_waitall(requests)
392! ...
393        IMPLICIT NONE
394        INTEGER, INTENT (INOUT) :: requests(:)
395        INTEGER :: ierr
396#if defined(__MPI)
397        INTEGER :: istatus(MPI_STATUS_SIZE, size(requests))
398#endif
399        ierr = 0
400#if defined(__MPI)
401        call MPI_Waitall(size(requests), requests, istatus, ierr)
402        IF (ierr/=0) CALL mp_stop( 8004 )
403#endif
404        RETURN
405      END SUBROUTINE mp_waitall
406
407!tests all requests
408      SUBROUTINE mp_testall(requests, flag)
409      ! ...
410         IMPLICIT NONE
411          INTEGER, INTENT (INOUT) :: requests(:)
412          INTEGER :: ierr
413#if defined(__MPI)
414          INTEGER :: istatus(MPI_STATUS_SIZE, size(requests))
415#endif
416          LOGICAL, INTENT(OUT):: flag
417          !
418          ierr = 0
419          flag = .FALSE.
420#if defined(__MPI)
421          call MPI_Testall(size(requests), requests, flag, istatus, ierr)
422          IF (ierr/=0) CALL mp_stop( 8004 )
423#else
424          flag = .TRUE.
425#endif
426          RETURN
427       END SUBROUTINE mp_testall
428
429!------------------------------------------------------------------------------!
430!..mp_bcast
431      SUBROUTINE mp_bcast_i1(msg,source,gid)
432        IMPLICIT NONE
433        INTEGER :: msg
434        INTEGER :: source
435        INTEGER, INTENT(IN) :: gid
436        INTEGER :: group
437        INTEGER :: msglen
438
439#if defined(__MPI)
440        msglen = 1
441        group = gid
442        CALL bcast_integer( msg, msglen, source, group )
443#endif
444      END SUBROUTINE mp_bcast_i1
445      !
446      !------------------------------------------------------------------------------!
447      SUBROUTINE mp_bcast_iv(msg, source, gid)
448      !------------------------------------------------------------------------------!
449      !!
450      !! Bcast an integer vector
451      !!
452      IMPLICIT NONE
453      !
454      INTEGER :: msg(:)
455      INTEGER, INTENT(in) :: source
456      INTEGER, INTENT(in) :: gid
457#if defined(__MPI)
458      INTEGER :: msglen
459      msglen = SIZE(msg)
460      CALL bcast_integer(msg, msglen, source, gid)
461#endif
462      !------------------------------------------------------------------------------!
463      END SUBROUTINE mp_bcast_iv
464      !------------------------------------------------------------------------------!
465      !
466      !------------------------------------------------------------------------------!
467      SUBROUTINE mp_bcast_i8v(msg, source, gid)
468      !------------------------------------------------------------------------------!
469      !!
470      !! Bcast an integer vector of kind i8b.
471      !!
472      IMPLICIT NONE
473      !
474      INTEGER(KIND = i8b) :: msg(:)
475      INTEGER, INTENT(in) :: source
476      INTEGER, INTENT(in) :: gid
477#if defined(__MPI)
478      INTEGER :: msglen
479      msglen = SIZE(msg)
480      CALL bcast_integer8(msg, msglen, source, gid)
481#endif
482      !------------------------------------------------------------------------------!
483      END SUBROUTINE mp_bcast_i8v
484      !------------------------------------------------------------------------------!
485      !
486      !------------------------------------------------------------------------------!
487      SUBROUTINE mp_bcast_im(msg, source, gid)
488      !------------------------------------------------------------------------------!
489        IMPLICIT NONE
490        INTEGER :: msg(:,:)
491        INTEGER, INTENT(IN) :: source
492        INTEGER, INTENT(IN) :: gid
493#if defined(__MPI)
494        INTEGER :: msglen
495        msglen = size(msg)
496        CALL bcast_integer(msg, msglen, source, gid)
497#endif
498      END SUBROUTINE mp_bcast_im
499!
500!------------------------------------------------------------------------------!
501!
502! Carlo Cavazzoni
503!
504      SUBROUTINE mp_bcast_it( msg, source, gid )
505        IMPLICIT NONE
506        INTEGER :: msg(:,:,:)
507        INTEGER, INTENT(IN) :: source
508        INTEGER, INTENT(IN) :: gid
509#if defined(__MPI)
510        INTEGER :: msglen
511        msglen = size(msg)
512        CALL bcast_integer( msg, msglen, source, gid )
513#endif
514      END SUBROUTINE mp_bcast_it
515!
516!------------------------------------------------------------------------------!
517!
518! Samuel Ponce
519!
520      SUBROUTINE mp_bcast_i4d(msg, source, gid)
521        IMPLICIT NONE
522        INTEGER :: msg(:,:,:,:)
523        INTEGER, INTENT(IN) :: source
524        INTEGER, INTENT(IN) :: gid
525#if defined(__MPI)
526        INTEGER :: msglen
527        msglen = size(msg)
528        CALL bcast_integer( msg, msglen, source, gid )
529#endif
530      END SUBROUTINE mp_bcast_i4d
531!
532!------------------------------------------------------------------------------!
533!
534      SUBROUTINE mp_bcast_r1( msg, source, gid )
535        IMPLICIT NONE
536        REAL (DP) :: msg
537        INTEGER, INTENT(IN) :: source
538        INTEGER, INTENT(IN) :: gid
539#if defined(__MPI)
540        INTEGER :: msglen
541        msglen = 1
542        CALL bcast_real( msg, msglen, source, gid )
543#endif
544      END SUBROUTINE mp_bcast_r1
545!
546!------------------------------------------------------------------------------!
547!
548      SUBROUTINE mp_bcast_rv(msg,source,gid)
549        IMPLICIT NONE
550        REAL (DP) :: msg(:)
551        INTEGER, INTENT(IN) :: source
552        INTEGER, INTENT(IN) :: gid
553#if defined(__MPI)
554        INTEGER :: msglen
555        msglen = size(msg)
556        CALL bcast_real( msg, msglen, source, gid )
557#endif
558      END SUBROUTINE mp_bcast_rv
559!
560!------------------------------------------------------------------------------!
561!
562      SUBROUTINE mp_bcast_rm(msg,source,gid)
563        IMPLICIT NONE
564        REAL (DP) :: msg(:,:)
565        INTEGER, INTENT(IN) :: source
566        INTEGER, INTENT(IN) :: gid
567#if defined(__MPI)
568        INTEGER :: msglen
569        msglen = size(msg)
570        CALL bcast_real( msg, msglen, source, gid )
571#endif
572      END SUBROUTINE mp_bcast_rm
573!
574!------------------------------------------------------------------------------!
575!
576! Carlo Cavazzoni
577!
578      SUBROUTINE mp_bcast_rt(msg,source,gid)
579        IMPLICIT NONE
580        REAL (DP) :: msg(:,:,:)
581        INTEGER, INTENT(IN) :: source
582        INTEGER, INTENT(IN) :: gid
583#if defined(__MPI)
584        INTEGER :: msglen
585        msglen = size(msg)
586        CALL bcast_real( msg, msglen, source, gid )
587#endif
588      END SUBROUTINE mp_bcast_rt
589!
590!------------------------------------------------------------------------------!
591!
592! Carlo Cavazzoni
593!
594      SUBROUTINE mp_bcast_r4d(msg, source, gid)
595        IMPLICIT NONE
596        REAL (DP) :: msg(:,:,:,:)
597        INTEGER, INTENT(IN) :: source
598        INTEGER, INTENT(IN) :: gid
599#if defined(__MPI)
600        INTEGER :: msglen
601        msglen = size(msg)
602        CALL bcast_real( msg, msglen, source, gid )
603#endif
604      END SUBROUTINE mp_bcast_r4d
605
606!
607!------------------------------------------------------------------------------!
608!
609! Carlo Cavazzoni
610!
611      SUBROUTINE mp_bcast_r5d(msg, source, gid)
612        IMPLICIT NONE
613        REAL (DP) :: msg(:,:,:,:,:)
614        INTEGER, INTENT(IN) :: source
615        INTEGER, INTENT(IN) :: gid
616#if defined(__MPI)
617        INTEGER :: msglen
618        msglen = size(msg)
619        CALL bcast_real( msg, msglen, source, gid )
620#endif
621      END SUBROUTINE mp_bcast_r5d
622
623!------------------------------------------------------------------------------!
624!
625      SUBROUTINE mp_bcast_c1(msg,source,gid)
626        IMPLICIT NONE
627        COMPLEX (DP) :: msg
628        INTEGER, INTENT(IN) :: source
629        INTEGER, INTENT(IN) :: gid
630#if defined(__MPI)
631        INTEGER :: msglen
632        msglen = 1
633        CALL bcast_real( msg, 2 * msglen, source, gid )
634#endif
635      END SUBROUTINE mp_bcast_c1
636!
637!------------------------------------------------------------------------------!
638      SUBROUTINE mp_bcast_cv(msg,source,gid)
639        IMPLICIT NONE
640        COMPLEX (DP) :: msg(:)
641        INTEGER, INTENT(IN) :: source
642        INTEGER, INTENT(IN) :: gid
643#if defined(__MPI)
644        INTEGER :: msglen
645        msglen = size(msg)
646        CALL bcast_real( msg, 2 * msglen, source, gid )
647#endif
648      END SUBROUTINE mp_bcast_cv
649!
650!------------------------------------------------------------------------------!
651      SUBROUTINE mp_bcast_cm(msg,source,gid)
652        IMPLICIT NONE
653        COMPLEX (DP) :: msg(:,:)
654        INTEGER, INTENT(IN) :: source
655        INTEGER, INTENT(IN) :: gid
656#if defined(__MPI)
657        INTEGER :: msglen
658        msglen = size(msg)
659        CALL bcast_real( msg, 2 * msglen, source, gid )
660#endif
661      END SUBROUTINE mp_bcast_cm
662!
663!------------------------------------------------------------------------------!
664      SUBROUTINE mp_bcast_ct(msg,source,gid)
665        IMPLICIT NONE
666        COMPLEX (DP) :: msg(:,:,:)
667        INTEGER, INTENT(IN) :: source
668        INTEGER, INTENT(IN) :: gid
669#if defined(__MPI)
670        INTEGER :: msglen
671        msglen = size(msg)
672        CALL bcast_real( msg, 2 * msglen, source, gid )
673#endif
674      END SUBROUTINE mp_bcast_ct
675
676!
677!------------------------------------------------------------------------------!
678      SUBROUTINE mp_bcast_c4d(msg,source,gid)
679        IMPLICIT NONE
680        COMPLEX (DP) :: msg(:,:,:,:)
681        INTEGER, INTENT(IN) :: source
682        INTEGER, INTENT(IN) :: gid
683#if defined(__MPI)
684        INTEGER :: msglen
685        msglen = size(msg)
686        CALL bcast_real( msg, 2 * msglen, source, gid )
687#endif
688      END SUBROUTINE mp_bcast_c4d
689
690      SUBROUTINE mp_bcast_c5d(msg,source,gid)
691        IMPLICIT NONE
692        COMPLEX (DP) :: msg(:,:,:,:,:)
693        INTEGER, INTENT(IN) :: source
694        INTEGER, INTENT(IN) :: gid
695#if defined(__MPI)
696        INTEGER :: msglen
697        msglen = size(msg)
698        CALL bcast_real( msg, 2 * msglen, source, gid )
699#endif
700      END SUBROUTINE mp_bcast_c5d
701
702      SUBROUTINE mp_bcast_c6d(msg,source,gid)
703        IMPLICIT NONE
704        COMPLEX (DP) :: msg(:,:,:,:,:,:)
705        INTEGER, INTENT(IN) :: source
706        INTEGER, INTENT(IN) :: gid
707#if defined(__MPI)
708        INTEGER :: msglen
709        msglen = size(msg)
710        CALL bcast_real( msg, 2 * msglen, source, gid )
711#endif
712      END SUBROUTINE mp_bcast_c6d
713
714!
715!------------------------------------------------------------------------------!
716
717      SUBROUTINE mp_bcast_l(msg,source,gid)
718        IMPLICIT NONE
719        LOGICAL :: msg
720        INTEGER, INTENT(IN) :: source
721        INTEGER, INTENT(IN) :: gid
722#if defined(__MPI)
723        INTEGER :: msglen
724        msglen = 1
725        CALL bcast_logical( msg, msglen, source, gid )
726#endif
727      END SUBROUTINE mp_bcast_l
728!
729!------------------------------------------------------------------------------!
730!
731! Carlo Cavazzoni
732!
733      SUBROUTINE mp_bcast_lv(msg,source,gid)
734        IMPLICIT NONE
735        LOGICAL :: msg(:)
736        INTEGER, INTENT(IN) :: source
737        INTEGER, INTENT(IN) :: gid
738#if defined(__MPI)
739        INTEGER :: msglen
740        msglen = size(msg)
741        CALL bcast_logical( msg, msglen, source, gid )
742#endif
743      END SUBROUTINE mp_bcast_lv
744
745!------------------------------------------------------------------------------!
746!
747! Carlo Cavazzoni
748!
749      SUBROUTINE mp_bcast_lm(msg,source,gid)
750        IMPLICIT NONE
751        LOGICAL :: msg(:,:)
752        INTEGER, INTENT(IN) :: source
753        INTEGER, INTENT(IN) :: gid
754#if defined(__MPI)
755        INTEGER :: msglen
756        msglen = size(msg)
757        CALL bcast_logical( msg, msglen, source, gid )
758#endif
759      END SUBROUTINE mp_bcast_lm
760
761
762!
763!------------------------------------------------------------------------------!
764!
765      SUBROUTINE mp_bcast_z(msg,source,gid)
766        IMPLICIT NONE
767        CHARACTER (len=*) :: msg
768        INTEGER, INTENT(IN) :: source
769        INTEGER, INTENT(IN) :: gid
770        INTEGER :: group
771        INTEGER :: msglen, ierr, i
772        INTEGER, ALLOCATABLE :: imsg(:)
773#if defined(__MPI)
774        ierr = 0
775        msglen = len(msg)
776        group = gid
777        ALLOCATE (imsg(1:msglen), STAT=ierr)
778        IF (ierr/=0) CALL mp_stop( 8015 )
779        DO i = 1, msglen
780          imsg(i) = ichar(msg(i:i))
781        END DO
782        CALL bcast_integer( imsg, msglen, source, group )
783        DO i = 1, msglen
784          msg(i:i) = char(imsg(i))
785        END DO
786        DEALLOCATE (imsg, STAT=ierr)
787        IF (ierr/=0) CALL mp_stop( 8016 )
788#endif
789      END SUBROUTINE mp_bcast_z
790!
791!------------------------------------------------------------------------------!
792!
793!------------------------------------------------------------------------------!
794!
795      SUBROUTINE mp_bcast_zv(msg,source,gid)
796        IMPLICIT NONE
797        CHARACTER (len=*) :: msg(:)
798        INTEGER, INTENT(IN) :: source
799        INTEGER, INTENT(IN) :: gid
800        INTEGER :: group
801        INTEGER :: msglen, m1, m2, ierr, i, j
802        INTEGER, ALLOCATABLE :: imsg(:,:)
803#if defined(__MPI)
804        ierr = 0
805        m1 = LEN(msg)
806        m2 = SIZE(msg)
807        msglen = LEN(msg)*SIZE(msg)
808        group = gid
809        ALLOCATE (imsg(1:m1,1:m2), STAT=ierr)
810        IF (ierr/=0) CALL mp_stop( 8017 )
811        DO j = 1, m2
812          DO i = 1, m1
813            imsg(i,j) = ichar(msg(j)(i:i))
814          END DO
815        END DO
816        CALL bcast_integer( imsg, msglen, source, group )
817        DO j = 1, m2
818          DO i = 1, m1
819            msg(j)(i:i) = char(imsg(i,j))
820          END DO
821        END DO
822        DEALLOCATE (imsg, STAT=ierr)
823        IF (ierr/=0) CALL mp_stop( 8018 )
824#endif
825      END SUBROUTINE mp_bcast_zv
826!
827!------------------------------------------------------------------------------!
828!
829! Carlo Cavazzoni
830!
831      SUBROUTINE mp_get_i1(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
832        INTEGER :: msg_dest
833        INTEGER, INTENT(IN) :: msg_sour
834        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
835        INTEGER, INTENT(IN) :: gid
836        INTEGER :: group
837#if defined(__MPI)
838        INTEGER :: istatus(MPI_STATUS_SIZE)
839#endif
840        INTEGER :: ierr, nrcv
841        INTEGER :: msglen = 1
842
843#if defined(__MPI)
844        group = gid
845#endif
846
847        ! processors not taking part in the communication have 0 length message
848
849        msglen = 0
850
851        IF(dest .NE. sour) THEN
852#if defined(__MPI)
853           IF(mpime .EQ. sour) THEN
854             msglen=1
855             CALL MPI_SEND( msg_sour, msglen, MPI_INTEGER, dest, ip, group, ierr)
856             IF (ierr/=0) CALL mp_stop( 8019 )
857           ELSE IF(mpime .EQ. dest) THEN
858             msglen=1
859             CALL MPI_RECV( msg_dest, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR )
860             IF (ierr/=0) CALL mp_stop( 8020 )
861             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
862             IF (ierr/=0) CALL mp_stop( 8021 )
863             msglen = nrcv
864           END IF
865#endif
866        ELSEIF(mpime .EQ. sour)THEN
867          msg_dest = msg_sour
868          msglen = 1
869        END IF
870
871#if defined(__MPI)
872        CALL MPI_BARRIER(group, IERR)
873        IF (ierr/=0) CALL mp_stop( 8022 )
874#endif
875
876
877        RETURN
878      END SUBROUTINE mp_get_i1
879
880!------------------------------------------------------------------------------!
881!
882! Carlo Cavazzoni
883!
884      SUBROUTINE mp_get_iv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
885        INTEGER :: msg_dest(:)
886        INTEGER, INTENT(IN) :: msg_sour(:)
887        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
888        INTEGER, INTENT(IN) :: gid
889        INTEGER :: group
890#if defined(__MPI)
891        INTEGER :: istatus(MPI_STATUS_SIZE)
892#endif
893        INTEGER :: ierr, nrcv
894        INTEGER :: msglen
895
896#if defined(__MPI)
897        group = gid
898#endif
899
900        ! processors not taking part in the communication have 0 length message
901
902        msglen = 0
903
904        IF(sour .NE. dest) THEN
905#if defined(__MPI)
906           IF(mpime .EQ. sour) THEN
907             msglen = SIZE(msg_sour)
908             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr)
909             IF (ierr/=0) CALL mp_stop( 8023 )
910           ELSE IF(mpime .EQ. dest) THEN
911             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR )
912             IF (ierr/=0) CALL mp_stop( 8024 )
913             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
914             IF (ierr/=0) CALL mp_stop( 8025 )
915             msglen = nrcv
916           END IF
917#endif
918        ELSEIF(mpime .EQ. sour)THEN
919          msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
920          msglen = SIZE(msg_sour)
921        END IF
922#if defined(__MPI)
923        CALL MPI_BARRIER(group, IERR)
924        IF (ierr/=0) CALL mp_stop( 8026 )
925#endif
926        RETURN
927      END SUBROUTINE mp_get_iv
928
929!------------------------------------------------------------------------------!
930
931      SUBROUTINE mp_get_r1(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
932        REAL (DP)             :: msg_dest
933        REAL (DP), INTENT(IN) :: msg_sour
934        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
935        INTEGER, INTENT(IN) :: gid
936        INTEGER :: group
937#if defined(__MPI)
938        INTEGER :: istatus(MPI_STATUS_SIZE)
939#endif
940        INTEGER :: ierr, nrcv
941        INTEGER :: msglen
942
943#if defined(__MPI)
944        group = gid
945#endif
946
947        ! processors not taking part in the communication have 0 length message
948
949        msglen = 0
950
951        IF(sour .NE. dest) THEN
952#if defined(__MPI)
953           IF(mpime .EQ. sour) THEN
954             msglen = 1
955             CALL MPI_SEND( msg_sour, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
956             IF (ierr/=0) CALL mp_stop( 8027 )
957           ELSE IF(mpime .EQ. dest) THEN
958             CALL MPI_RECV( msg_dest, 1, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
959             IF (ierr/=0) CALL mp_stop( 8028 )
960             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
961             IF (ierr/=0) CALL mp_stop( 8029 )
962             msglen = nrcv
963           END IF
964#endif
965        ELSEIF(mpime .EQ. sour)THEN
966          msg_dest = msg_sour
967          msglen = 1
968        END IF
969#if defined(__MPI)
970        CALL MPI_BARRIER(group, IERR)
971        IF (ierr/=0) CALL mp_stop( 8030 )
972#endif
973        RETURN
974      END SUBROUTINE mp_get_r1
975
976!------------------------------------------------------------------------------!
977!
978! Carlo Cavazzoni
979!
980      SUBROUTINE mp_get_rv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
981        REAL (DP)             :: msg_dest(:)
982        REAL (DP), INTENT(IN) :: msg_sour(:)
983        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
984        INTEGER, INTENT(IN) :: gid
985        INTEGER :: group
986#if defined(__MPI)
987        INTEGER :: istatus(MPI_STATUS_SIZE)
988#endif
989        INTEGER :: ierr, nrcv
990        INTEGER :: msglen
991
992#if defined(__MPI)
993        group = gid
994#endif
995
996        ! processors not taking part in the communication have 0 length message
997
998        msglen = 0
999
1000        IF(sour .NE. dest) THEN
1001#if defined(__MPI)
1002           IF(mpime .EQ. sour) THEN
1003             msglen = SIZE(msg_sour)
1004             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
1005             IF (ierr/=0) CALL mp_stop( 8027 )
1006           ELSE IF(mpime .EQ. dest) THEN
1007             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
1008             IF (ierr/=0) CALL mp_stop( 8028 )
1009             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
1010             IF (ierr/=0) CALL mp_stop( 8029 )
1011             msglen = nrcv
1012           END IF
1013#endif
1014        ELSEIF(mpime .EQ. sour)THEN
1015          msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
1016          msglen = SIZE(msg_sour)
1017        END IF
1018#if defined(__MPI)
1019        CALL MPI_BARRIER(group, IERR)
1020        IF (ierr/=0) CALL mp_stop( 8030 )
1021#endif
1022        RETURN
1023      END SUBROUTINE mp_get_rv
1024
1025!------------------------------------------------------------------------------!
1026!
1027! Carlo Cavazzoni
1028!
1029      SUBROUTINE mp_get_rm(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
1030        REAL (DP) :: msg_dest(:,:)
1031        REAL (DP), INTENT(IN) :: msg_sour(:,:)
1032        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1033        INTEGER, INTENT(IN) :: gid
1034        INTEGER :: group
1035#if defined(__MPI)
1036        INTEGER :: istatus(MPI_STATUS_SIZE)
1037#endif
1038        INTEGER :: ierr, nrcv
1039        INTEGER :: msglen
1040
1041#if defined(__MPI)
1042        group = gid
1043#endif
1044
1045        ! processors not taking part in the communication have 0 length message
1046
1047        msglen = 0
1048
1049        IF(sour .NE. dest) THEN
1050#if defined(__MPI)
1051           IF(mpime .EQ. sour) THEN
1052             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
1053             IF (ierr/=0) CALL mp_stop( 8031 )
1054             msglen = SIZE(msg_sour)
1055           ELSE IF(mpime .EQ. dest) THEN
1056             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
1057             IF (ierr/=0) CALL mp_stop( 8032 )
1058             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
1059             IF (ierr/=0) CALL mp_stop( 8033 )
1060             msglen = nrcv
1061           END IF
1062#endif
1063        ELSEIF(mpime .EQ. sour)THEN
1064          msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:)
1065          msglen = SIZE( msg_sour )
1066        END IF
1067#if defined(__MPI)
1068        CALL MPI_BARRIER(group, IERR)
1069        IF (ierr/=0) CALL mp_stop( 8034 )
1070#endif
1071        RETURN
1072      END SUBROUTINE mp_get_rm
1073
1074
1075!------------------------------------------------------------------------------!
1076!
1077! Carlo Cavazzoni
1078!
1079      SUBROUTINE mp_get_cv(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
1080        COMPLEX (DP)             :: msg_dest(:)
1081        COMPLEX (DP), INTENT(IN) :: msg_sour(:)
1082        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1083        INTEGER, INTENT(IN) :: gid
1084        INTEGER :: group
1085#if defined(__MPI)
1086        INTEGER :: istatus(MPI_STATUS_SIZE)
1087#endif
1088        INTEGER :: ierr, nrcv
1089        INTEGER :: msglen
1090
1091#if defined(__MPI)
1092        group = gid
1093#endif
1094
1095        ! processors not taking part in the communication have 0 length message
1096
1097        msglen = 0
1098
1099        IF( dest .NE. sour ) THEN
1100#if defined(__MPI)
1101           IF(mpime .EQ. sour) THEN
1102             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
1103             IF (ierr/=0) CALL mp_stop( 8035 )
1104             msglen = SIZE(msg_sour)
1105           ELSE IF(mpime .EQ. dest) THEN
1106             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
1107             IF (ierr/=0) CALL mp_stop( 8036 )
1108             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
1109             IF (ierr/=0) CALL mp_stop( 8037 )
1110             msglen = nrcv
1111           END IF
1112#endif
1113        ELSEIF(mpime .EQ. sour)THEN
1114          msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
1115          msglen = SIZE(msg_sour)
1116        END IF
1117#if defined(__MPI)
1118        CALL MPI_BARRIER(group, IERR)
1119        IF (ierr/=0) CALL mp_stop( 8038 )
1120#endif
1121        RETURN
1122      END SUBROUTINE mp_get_cv
1123
1124
1125
1126!------------------------------------------------------------------------------!
1127!
1128! Marco Govoni
1129!
1130      SUBROUTINE mp_get_cm(msg_dest, msg_sour, mpime, dest, sour, ip, gid)
1131        COMPLEX (DP)              :: msg_dest(:,:)
1132        COMPLEX (DP), INTENT(IN)  :: msg_sour(:,:)
1133        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1134        INTEGER, INTENT(IN) :: gid
1135        INTEGER :: group
1136#if defined(__MPI)
1137        INTEGER :: istatus(MPI_STATUS_SIZE)
1138#endif
1139        INTEGER :: ierr, nrcv
1140        INTEGER :: msglen
1141
1142#if defined(__MPI)
1143        group = gid
1144#endif
1145
1146        ! processors not taking part in the communication have 0 length message
1147
1148        msglen = 0
1149
1150        IF(sour .NE. dest) THEN
1151#if defined(__MPI)
1152           IF(mpime .EQ. sour) THEN
1153             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
1154             IF (ierr/=0) CALL mp_stop( 8031 )
1155             msglen = SIZE(msg_sour)
1156           ELSE IF(mpime .EQ. dest) THEN
1157             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
1158             IF (ierr/=0) CALL mp_stop( 8032 )
1159             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
1160             IF (ierr/=0) CALL mp_stop( 8033 )
1161             msglen = nrcv
1162           END IF
1163#endif
1164        ELSEIF(mpime .EQ. sour)THEN
1165          msg_dest(1:SIZE(msg_sour,1), 1:SIZE(msg_sour,2)) = msg_sour(:,:)
1166          msglen = SIZE( msg_sour )
1167        END IF
1168#if defined(__MPI)
1169        CALL MPI_BARRIER(group, IERR)
1170        IF (ierr/=0) CALL mp_stop( 8034 )
1171#endif
1172        RETURN
1173      END SUBROUTINE mp_get_cm
1174!------------------------------------------------------------------------------!
1175!
1176!
1177!------------------------------------------------------------------------------!
1178
1179
1180      SUBROUTINE mp_put_i1(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
1181        INTEGER :: msg_dest
1182        INTEGER, INTENT(IN) :: msg_sour
1183        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1184        INTEGER, INTENT(IN) :: gid
1185        INTEGER :: group
1186#if defined(__MPI)
1187        INTEGER :: istatus(MPI_STATUS_SIZE)
1188#endif
1189        INTEGER :: ierr, nrcv
1190        INTEGER :: msglen
1191
1192#if defined(__MPI)
1193        group = gid
1194#endif
1195
1196        ! processors not taking part in the communication have 0 length message
1197
1198        msglen = 0
1199
1200        IF(dest .NE. sour) THEN
1201#if defined(__MPI)
1202           IF(mpime .EQ. sour) THEN
1203             CALL MPI_SEND( msg_sour, 1, MPI_INTEGER, dest, ip, group, ierr)
1204             IF (ierr/=0) CALL mp_stop( 8039 )
1205             msglen = 1
1206           ELSE IF(mpime .EQ. dest) THEN
1207             CALL MPI_RECV( msg_dest, 1, MPI_INTEGER, sour, ip, group, istatus, IERR )
1208             IF (ierr/=0) CALL mp_stop( 8040 )
1209             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
1210             IF (ierr/=0) CALL mp_stop( 8041 )
1211             msglen = 1
1212           END IF
1213#endif
1214        ELSEIF(mpime .EQ. sour)THEN
1215          msg_dest = msg_sour
1216          msglen = 1
1217        END IF
1218#if defined(__MPI)
1219        CALL MPI_BARRIER(group, IERR)
1220        IF (ierr/=0) CALL mp_stop( 8042 )
1221#endif
1222        RETURN
1223      END SUBROUTINE mp_put_i1
1224
1225!------------------------------------------------------------------------------!
1226!
1227!
1228      SUBROUTINE mp_put_iv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
1229        INTEGER             :: msg_dest(:)
1230        INTEGER, INTENT(IN) :: msg_sour(:)
1231        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1232        INTEGER, INTENT(IN) :: gid
1233        INTEGER :: group
1234#if defined(__MPI)
1235        INTEGER :: istatus(MPI_STATUS_SIZE)
1236#endif
1237        INTEGER :: ierr, nrcv
1238        INTEGER :: msglen
1239#if defined(__MPI)
1240        group = gid
1241#endif
1242        ! processors not taking part in the communication have 0 length message
1243
1244        msglen = 0
1245
1246        IF(sour .NE. dest) THEN
1247#if defined(__MPI)
1248           IF(mpime .EQ. sour) THEN
1249             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_INTEGER, dest, ip, group, ierr)
1250             IF (ierr/=0) CALL mp_stop( 8043 )
1251             msglen = SIZE(msg_sour)
1252           ELSE IF(mpime .EQ. dest) THEN
1253             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_INTEGER, sour, ip, group, istatus, IERR )
1254             IF (ierr/=0) CALL mp_stop( 8044 )
1255             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
1256             IF (ierr/=0) CALL mp_stop( 8045 )
1257             msglen = nrcv
1258           END IF
1259#endif
1260        ELSEIF(mpime .EQ. sour)THEN
1261          msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
1262          msglen = SIZE(msg_sour)
1263        END IF
1264#if defined(__MPI)
1265        CALL MPI_BARRIER(group, IERR)
1266        IF (ierr/=0) CALL mp_stop( 8046 )
1267#endif
1268        RETURN
1269      END SUBROUTINE mp_put_iv
1270
1271!------------------------------------------------------------------------------!
1272!
1273!
1274      SUBROUTINE mp_put_rv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
1275        REAL (DP)             :: msg_dest(:)
1276        REAL (DP), INTENT(IN) :: msg_sour(:)
1277        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1278        INTEGER, INTENT(IN) :: gid
1279        INTEGER :: group
1280#if defined(__MPI)
1281        INTEGER :: istatus(MPI_STATUS_SIZE)
1282#endif
1283        INTEGER :: ierr, nrcv
1284        INTEGER :: msglen
1285#if defined(__MPI)
1286        group = gid
1287#endif
1288        ! processors not taking part in the communication have 0 length message
1289
1290        msglen = 0
1291
1292        IF(sour .NE. dest) THEN
1293#if defined(__MPI)
1294           IF(mpime .EQ. sour) THEN
1295             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
1296             IF (ierr/=0) CALL mp_stop( 8047 )
1297             msglen = SIZE(msg_sour)
1298           ELSE IF(mpime .EQ. dest) THEN
1299             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
1300             IF (ierr/=0) CALL mp_stop( 8048 )
1301             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
1302             IF (ierr/=0) CALL mp_stop( 8049 )
1303             msglen = nrcv
1304           END IF
1305#endif
1306        ELSEIF(mpime .EQ. sour)THEN
1307          msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
1308          msglen = SIZE(msg_sour)
1309        END IF
1310#if defined(__MPI)
1311        CALL MPI_BARRIER(group, IERR)
1312        IF (ierr/=0) CALL mp_stop( 8050 )
1313#endif
1314        RETURN
1315      END SUBROUTINE mp_put_rv
1316
1317!------------------------------------------------------------------------------!
1318!
1319!
1320      SUBROUTINE mp_put_rm(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
1321        REAL (DP)             :: msg_dest(:,:)
1322        REAL (DP), INTENT(IN) :: msg_sour(:,:)
1323        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1324        INTEGER, INTENT(IN) :: gid
1325        INTEGER :: group
1326#if defined(__MPI)
1327        INTEGER :: istatus(MPI_STATUS_SIZE)
1328#endif
1329        INTEGER :: ierr, nrcv
1330        INTEGER :: msglen
1331#if defined(__MPI)
1332        group = gid
1333#endif
1334        ! processors not taking part in the communication have 0 length message
1335
1336        msglen = 0
1337
1338        IF(sour .NE. dest) THEN
1339#if defined(__MPI)
1340           IF(mpime .EQ. sour) THEN
1341             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
1342             IF (ierr/=0) CALL mp_stop( 8051 )
1343             msglen = SIZE(msg_sour)
1344           ELSE IF(mpime .EQ. dest) THEN
1345             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
1346             IF (ierr/=0) CALL mp_stop( 8052 )
1347             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
1348             IF (ierr/=0) CALL mp_stop( 8053 )
1349             msglen = nrcv
1350           END IF
1351#endif
1352        ELSEIF(mpime .EQ. sour)THEN
1353          msg_dest(1:SIZE(msg_sour,1),1:SIZE(msg_sour,2)) = msg_sour(:,:)
1354          msglen = SIZE(msg_sour)
1355        END IF
1356#if defined(__MPI)
1357        CALL MPI_BARRIER(group, IERR)
1358        IF (ierr/=0) CALL mp_stop( 8054 )
1359#endif
1360        RETURN
1361      END SUBROUTINE mp_put_rm
1362
1363
1364!------------------------------------------------------------------------------!
1365!
1366!
1367      SUBROUTINE mp_put_cv(msg_dest, msg_sour, mpime, sour, dest, ip, gid)
1368        COMPLEX (DP)             :: msg_dest(:)
1369        COMPLEX (DP), INTENT(IN) :: msg_sour(:)
1370        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
1371        INTEGER, INTENT(IN) :: gid
1372        INTEGER :: group
1373#if defined(__MPI)
1374        INTEGER :: istatus(MPI_STATUS_SIZE)
1375#endif
1376        INTEGER :: ierr, nrcv
1377        INTEGER :: msglen
1378#if defined(__MPI)
1379        group = gid
1380#endif
1381        ! processors not taking part in the communication have 0 length message
1382
1383        msglen = 0
1384
1385        IF( dest .NE. sour ) THEN
1386#if defined(__MPI)
1387           IF(mpime .EQ. sour) THEN
1388             CALL MPI_SEND( msg_sour, SIZE(msg_sour), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
1389             IF (ierr/=0) CALL mp_stop( 8055 )
1390             msglen = SIZE(msg_sour)
1391           ELSE IF(mpime .EQ. dest) THEN
1392             CALL MPI_RECV( msg_dest, SIZE(msg_dest), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
1393             IF (ierr/=0) CALL mp_stop( 8056 )
1394             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
1395             IF (ierr/=0) CALL mp_stop( 8057 )
1396             msglen = nrcv
1397           END IF
1398#endif
1399        ELSEIF(mpime .EQ. sour)THEN
1400          msg_dest(1:SIZE(msg_sour)) = msg_sour(:)
1401          msglen = SIZE(msg_sour)
1402        END IF
1403#if defined(__MPI)
1404        CALL MPI_BARRIER(group, IERR)
1405        IF (ierr/=0) CALL mp_stop( 8058 )
1406#endif
1407        RETURN
1408      END SUBROUTINE mp_put_cv
1409
1410!
1411!------------------------------------------------------------------------------!
1412!
1413!..mp_stop
1414!
1415      SUBROUTINE mp_stop(code)
1416        IMPLICIT NONE
1417        INTEGER, INTENT (IN) :: code
1418        INTEGER :: ierr
1419        WRITE( stdout, fmt='( "*** error in Message Passing (mp) module ***")' )
1420        WRITE( stdout, fmt='( "*** error code: ",I5)' ) code
1421#if defined(__MPI)
1422        ! abort with extreme prejudice across the entire MPI set of tasks
1423        CALL mpi_abort(MPI_COMM_WORLD,code,ierr)
1424#endif
1425        STOP
1426      END SUBROUTINE mp_stop
1427!------------------------------------------------------------------------------!
1428!
1429!..mp_sum
1430      SUBROUTINE mp_sum_i1(msg,gid)
1431        IMPLICIT NONE
1432        INTEGER, INTENT (INOUT) :: msg
1433        INTEGER, INTENT(IN) :: gid
1434#if defined(__MPI)
1435        INTEGER :: msglen
1436        msglen = 1
1437        CALL reduce_base_integer( msglen, msg, gid, -1 )
1438#endif
1439      END SUBROUTINE mp_sum_i1
1440      !
1441      !------------------------------------------------------------------------------!
1442      SUBROUTINE mp_sum_iv(msg, gid)
1443      !------------------------------------------------------------------------------!
1444      !!
1445      !! MPI sum an integer vector from all cores and bcast the result to all.
1446      !!
1447      IMPLICIT NONE
1448      !
1449      INTEGER, INTENT(inout) :: msg(:)
1450      INTEGER, INTENT(in) :: gid
1451#if defined(__MPI)
1452      INTEGER :: msglen
1453      msglen = SIZE(msg)
1454      CALL reduce_base_integer(msglen, msg, gid, -1)
1455#endif
1456      !------------------------------------------------------------------------------!
1457      END SUBROUTINE mp_sum_iv
1458      !------------------------------------------------------------------------------!
1459      !
1460      !------------------------------------------------------------------------------!
1461      SUBROUTINE mp_sum_i8v(msg, gid)
1462      !------------------------------------------------------------------------------!
1463      !!
1464      !! MPI sum an integer vector from all cores and bcast the result to all.
1465      !!
1466      IMPLICIT NONE
1467      !
1468      INTEGER(KIND = i8b), INTENT(inout) :: msg(:)
1469      INTEGER, INTENT(in) :: gid
1470#if defined(__MPI)
1471      INTEGER :: msglen
1472      msglen = SIZE(msg)
1473      CALL reduce_base_integer8(msglen, msg, gid, -1)
1474#endif
1475      !------------------------------------------------------------------------------!
1476      END SUBROUTINE mp_sum_i8v
1477      !------------------------------------------------------------------------------!
1478      !
1479      !------------------------------------------------------------------------------!
1480      SUBROUTINE mp_sum_im(msg,gid)
1481      !------------------------------------------------------------------------------!
1482        IMPLICIT NONE
1483        INTEGER, INTENT (INOUT) :: msg(:,:)
1484        INTEGER, INTENT(IN) :: gid
1485#if defined(__MPI)
1486        INTEGER :: msglen
1487        msglen = size(msg)
1488        CALL reduce_base_integer( msglen, msg, gid, -1 )
1489#endif
1490      END SUBROUTINE mp_sum_im
1491!
1492!------------------------------------------------------------------------------!
1493
1494      SUBROUTINE mp_sum_it(msg,gid)
1495        IMPLICIT NONE
1496        INTEGER, INTENT (INOUT) :: msg(:,:,:)
1497        INTEGER, INTENT (IN) :: gid
1498#if defined(__MPI)
1499        INTEGER :: msglen
1500        msglen = size(msg)
1501        CALL reduce_base_integer( msglen, msg, gid, -1 )
1502#endif
1503      END SUBROUTINE mp_sum_it
1504
1505!------------------------------------------------------------------------------!
1506
1507      SUBROUTINE mp_sum_i4(msg,gid)
1508        IMPLICIT NONE
1509        INTEGER, INTENT (INOUT) :: msg(:,:,:,:)
1510        INTEGER, INTENT (IN) :: gid
1511#if defined(__MPI)
1512        INTEGER :: msglen
1513        msglen = size(msg)
1514        CALL reduce_base_integer( msglen, msg, gid, -1 )
1515#endif
1516      END SUBROUTINE mp_sum_i4
1517
1518!------------------------------------------------------------------------------!
1519
1520      SUBROUTINE mp_sum_i5(msg,gid)
1521        IMPLICIT NONE
1522        INTEGER, INTENT (INOUT) :: msg(:,:,:,:,:)
1523        INTEGER, INTENT (IN) :: gid
1524#if defined(__MPI)
1525        INTEGER :: msglen
1526        msglen = size(msg)
1527        CALL reduce_base_integer( msglen, msg, gid, -1 )
1528#endif
1529      END SUBROUTINE mp_sum_i5
1530
1531
1532!------------------------------------------------------------------------------!
1533
1534      SUBROUTINE mp_sum_r1(msg,gid)
1535        IMPLICIT NONE
1536        REAL (DP), INTENT (INOUT) :: msg
1537        INTEGER, INTENT (IN) :: gid
1538#if defined(__MPI)
1539        INTEGER :: msglen
1540        msglen = 1
1541        CALL reduce_base_real( msglen, msg, gid, -1 )
1542#endif
1543      END SUBROUTINE mp_sum_r1
1544
1545!
1546!------------------------------------------------------------------------------!
1547
1548      SUBROUTINE mp_sum_rv(msg,gid)
1549        IMPLICIT NONE
1550        REAL (DP), INTENT (INOUT) :: msg(:)
1551        INTEGER, INTENT (IN) :: gid
1552#if defined(__MPI)
1553        INTEGER :: msglen
1554        msglen = size(msg)
1555        CALL reduce_base_real( msglen, msg, gid, -1 )
1556#endif
1557      END SUBROUTINE mp_sum_rv
1558!
1559!------------------------------------------------------------------------------!
1560
1561
1562      SUBROUTINE mp_sum_rm(msg, gid)
1563        IMPLICIT NONE
1564        REAL (DP), INTENT (INOUT) :: msg(:,:)
1565        INTEGER, INTENT (IN) :: gid
1566#if defined(__MPI)
1567        INTEGER :: msglen
1568        msglen = size(msg)
1569        CALL reduce_base_real( msglen, msg, gid, -1 )
1570#endif
1571      END SUBROUTINE mp_sum_rm
1572
1573
1574      SUBROUTINE mp_root_sum_rm( msg, res, root, gid )
1575        IMPLICIT NONE
1576        REAL (DP), INTENT (IN)  :: msg(:,:)
1577        REAL (DP), INTENT (OUT) :: res(:,:)
1578        INTEGER,   INTENT (IN)  :: root
1579        INTEGER,   INTENT (IN) :: gid
1580#if defined(__MPI)
1581        INTEGER :: msglen, ierr, taskid
1582
1583        msglen = size(msg)
1584
1585        CALL mpi_comm_rank( gid, taskid, ierr)
1586        IF( ierr /= 0 ) CALL mp_stop( 8059 )
1587        !
1588        IF( taskid == root ) THEN
1589           IF( msglen > size(res) ) CALL mp_stop( 8060 )
1590        END IF
1591
1592        CALL reduce_base_real_to( msglen, msg, res, gid, root )
1593
1594#else
1595
1596        res = msg
1597
1598#endif
1599
1600      END SUBROUTINE mp_root_sum_rm
1601
1602
1603      SUBROUTINE mp_root_sum_cm( msg, res, root, gid )
1604        IMPLICIT NONE
1605        COMPLEX (DP), INTENT (IN)  :: msg(:,:)
1606        COMPLEX (DP), INTENT (OUT) :: res(:,:)
1607        INTEGER,   INTENT (IN)  :: root
1608        INTEGER,  INTENT (IN) :: gid
1609#if defined(__MPI)
1610        INTEGER :: msglen, ierr, taskid
1611
1612        msglen = size(msg)
1613
1614        CALL mpi_comm_rank( gid, taskid, ierr)
1615        IF( ierr /= 0 ) CALL mp_stop( 8061 )
1616
1617        IF( taskid == root ) THEN
1618           IF( msglen > size(res) ) CALL mp_stop( 8062 )
1619        END IF
1620
1621        CALL reduce_base_real_to( 2 * msglen, msg, res, gid, root )
1622
1623#else
1624
1625        res = msg
1626
1627#endif
1628
1629      END SUBROUTINE mp_root_sum_cm
1630
1631!
1632!------------------------------------------------------------------------------!
1633
1634
1635!------------------------------------------------------------------------------!
1636!
1637
1638      SUBROUTINE mp_sum_rmm( msg, res, root, gid )
1639        IMPLICIT NONE
1640        REAL (DP), INTENT (IN) :: msg(:,:)
1641        REAL (DP), INTENT (OUT) :: res(:,:)
1642        INTEGER, INTENT (IN) :: root
1643        INTEGER, INTENT (IN) :: gid
1644        INTEGER :: group
1645        INTEGER :: msglen
1646        INTEGER :: taskid, ierr
1647
1648        msglen = size(msg)
1649
1650#if defined(__MPI)
1651
1652        group = gid
1653        !
1654        CALL mpi_comm_rank( group, taskid, ierr)
1655        IF( ierr /= 0 ) CALL mp_stop( 8063 )
1656
1657        IF( taskid == root ) THEN
1658           IF( msglen > size(res) ) CALL mp_stop( 8064 )
1659        END IF
1660        !
1661        CALL reduce_base_real_to( msglen, msg, res, group, root )
1662        !
1663
1664#else
1665        res = msg
1666#endif
1667
1668      END SUBROUTINE mp_sum_rmm
1669
1670
1671!
1672!------------------------------------------------------------------------------!
1673
1674
1675      SUBROUTINE mp_sum_rt( msg, gid )
1676        IMPLICIT NONE
1677        REAL (DP), INTENT (INOUT) :: msg(:,:,:)
1678        INTEGER, INTENT(IN) :: gid
1679#if defined(__MPI)
1680        INTEGER :: msglen
1681        msglen = size(msg)
1682        CALL reduce_base_real( msglen, msg, gid, -1 )
1683#endif
1684      END SUBROUTINE mp_sum_rt
1685
1686!
1687!------------------------------------------------------------------------------!
1688!
1689! Carlo Cavazzoni
1690!
1691      SUBROUTINE mp_sum_r4d(msg,gid)
1692        IMPLICIT NONE
1693        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:)
1694        INTEGER, INTENT(IN) :: gid
1695#if defined(__MPI)
1696        INTEGER :: msglen
1697        msglen = size(msg)
1698        CALL reduce_base_real( msglen, msg, gid, -1 )
1699#endif
1700      END SUBROUTINE mp_sum_r4d
1701
1702
1703
1704!------------------------------------------------------------------------------!
1705
1706      SUBROUTINE mp_sum_c1(msg,gid)
1707        IMPLICIT NONE
1708        COMPLEX (DP), INTENT (INOUT) :: msg
1709        INTEGER, INTENT(IN) :: gid
1710#if defined(__MPI)
1711        INTEGER :: msglen
1712        msglen = 1
1713        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1714#endif
1715      END SUBROUTINE mp_sum_c1
1716!
1717!------------------------------------------------------------------------------!
1718
1719      SUBROUTINE mp_sum_cv(msg,gid)
1720        IMPLICIT NONE
1721        COMPLEX (DP), INTENT (INOUT) :: msg(:)
1722        INTEGER, INTENT(IN) :: gid
1723#if defined(__MPI)
1724        INTEGER :: msglen
1725        msglen = size(msg)
1726        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1727#endif
1728      END SUBROUTINE mp_sum_cv
1729!
1730!------------------------------------------------------------------------------!
1731
1732      SUBROUTINE mp_sum_cm(msg, gid)
1733        IMPLICIT NONE
1734        COMPLEX (DP), INTENT (INOUT) :: msg(:,:)
1735        INTEGER, INTENT (IN) :: gid
1736#if defined(__MPI)
1737        INTEGER :: msglen
1738        msglen = size(msg)
1739        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1740#endif
1741      END SUBROUTINE mp_sum_cm
1742!
1743!------------------------------------------------------------------------------!
1744
1745
1746      SUBROUTINE mp_sum_cmm(msg, res, gid)
1747        IMPLICIT NONE
1748        COMPLEX (DP), INTENT (IN) :: msg(:,:)
1749        COMPLEX (DP), INTENT (OUT) :: res(:,:)
1750        INTEGER, INTENT (IN) :: gid
1751#if defined(__MPI)
1752        INTEGER :: msglen
1753        msglen = size(msg)
1754        CALL reduce_base_real_to( 2 * msglen, msg, res, gid, -1 )
1755#else
1756        res = msg
1757#endif
1758      END SUBROUTINE mp_sum_cmm
1759
1760
1761!
1762!------------------------------------------------------------------------------!
1763!
1764! Carlo Cavazzoni
1765!
1766      SUBROUTINE mp_sum_ct(msg,gid)
1767        IMPLICIT NONE
1768        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:)
1769        INTEGER, INTENT(IN) :: gid
1770#if defined(__MPI)
1771        INTEGER :: msglen
1772        msglen = SIZE(msg)
1773        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1774#endif
1775      END SUBROUTINE mp_sum_ct
1776
1777!
1778!------------------------------------------------------------------------------!
1779!
1780! Carlo Cavazzoni
1781!
1782      SUBROUTINE mp_sum_c4d(msg,gid)
1783        IMPLICIT NONE
1784        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:)
1785        INTEGER, INTENT(IN) :: gid
1786#if defined(__MPI)
1787        INTEGER :: msglen
1788        msglen = size(msg)
1789        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1790#endif
1791      END SUBROUTINE mp_sum_c4d
1792!
1793!------------------------------------------------------------------------------!
1794!
1795! Carlo Cavazzoni
1796!
1797      SUBROUTINE mp_sum_c5d(msg,gid)
1798        IMPLICIT NONE
1799        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:)
1800        INTEGER, INTENT(IN) :: gid
1801#if defined(__MPI)
1802        INTEGER :: msglen
1803        msglen = size(msg)
1804        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1805#endif
1806      END SUBROUTINE mp_sum_c5d
1807
1808!------------------------------------------------------------------------------!
1809!
1810! Carlo Cavazzoni
1811!
1812      SUBROUTINE mp_sum_r5d(msg,gid)
1813        IMPLICIT NONE
1814        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:)
1815        INTEGER, INTENT(IN) :: gid
1816#if defined(__MPI)
1817        INTEGER :: msglen
1818        msglen = size(msg)
1819        CALL reduce_base_real( msglen, msg, gid, -1 )
1820#endif
1821      END SUBROUTINE mp_sum_r5d
1822
1823
1824      SUBROUTINE mp_sum_r6d(msg,gid)
1825        IMPLICIT NONE
1826        REAL (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:)
1827        INTEGER, INTENT(IN) :: gid
1828#if defined(__MPI)
1829        INTEGER :: msglen
1830        msglen = size(msg)
1831        CALL reduce_base_real( msglen, msg, gid, -1 )
1832#endif
1833      END SUBROUTINE mp_sum_r6d
1834
1835!
1836!------------------------------------------------------------------------------!
1837!
1838! Carlo Cavazzoni
1839!
1840      SUBROUTINE mp_sum_c6d(msg,gid)
1841        IMPLICIT NONE
1842        COMPLEX (DP), INTENT (INOUT) :: msg(:,:,:,:,:,:)
1843        INTEGER, INTENT(IN) :: gid
1844#if defined(__MPI)
1845        INTEGER :: msglen
1846        msglen = size(msg)
1847        CALL reduce_base_real( 2 * msglen, msg, gid, -1 )
1848#endif
1849      END SUBROUTINE mp_sum_c6d
1850
1851
1852
1853!------------------------------------------------------------------------------!
1854      SUBROUTINE mp_max_i(msg,gid)
1855        IMPLICIT NONE
1856        INTEGER, INTENT (INOUT) :: msg
1857        INTEGER, INTENT(IN) :: gid
1858#if defined(__MPI)
1859        INTEGER :: msglen
1860        msglen = 1
1861        CALL parallel_max_integer( msglen, msg, gid, -1 )
1862#endif
1863      END SUBROUTINE mp_max_i
1864!
1865!------------------------------------------------------------------------------!
1866!
1867!..mp_max_iv
1868!..Carlo Cavazzoni
1869!
1870      SUBROUTINE mp_max_iv(msg,gid)
1871        IMPLICIT NONE
1872        INTEGER, INTENT (INOUT) :: msg(:)
1873        INTEGER, INTENT(IN) :: gid
1874#if defined(__MPI)
1875        INTEGER :: msglen
1876        msglen = size(msg)
1877        CALL parallel_max_integer( msglen, msg, gid, -1 )
1878#endif
1879      END SUBROUTINE mp_max_iv
1880!
1881!----------------------------------------------------------------------
1882
1883      SUBROUTINE mp_max_r(msg,gid)
1884        IMPLICIT NONE
1885        REAL (DP), INTENT (INOUT) :: msg
1886        INTEGER, INTENT(IN) :: gid
1887#if defined(__MPI)
1888        INTEGER :: msglen
1889        msglen = 1
1890        CALL parallel_max_real( msglen, msg, gid, -1 )
1891#endif
1892      END SUBROUTINE mp_max_r
1893!
1894!------------------------------------------------------------------------------!
1895      SUBROUTINE mp_max_rv(msg,gid)
1896        IMPLICIT NONE
1897        REAL (DP), INTENT (INOUT) :: msg(:)
1898        INTEGER, INTENT(IN) :: gid
1899#if defined(__MPI)
1900        INTEGER :: msglen
1901        msglen = size(msg)
1902        CALL parallel_max_real( msglen, msg, gid, -1 )
1903#endif
1904      END SUBROUTINE mp_max_rv
1905!------------------------------------------------------------------------------!
1906      SUBROUTINE mp_min_i(msg,gid)
1907        IMPLICIT NONE
1908        INTEGER, INTENT (INOUT) :: msg
1909        INTEGER, INTENT(IN) :: gid
1910#if defined(__MPI)
1911        INTEGER :: msglen
1912        msglen = 1
1913        CALL parallel_min_integer( msglen, msg, gid, -1 )
1914#endif
1915      END SUBROUTINE mp_min_i
1916!------------------------------------------------------------------------------!
1917      SUBROUTINE mp_min_iv(msg,gid)
1918        IMPLICIT NONE
1919        INTEGER, INTENT (INOUT) :: msg(:)
1920        INTEGER, INTENT(IN) :: gid
1921#if defined(__MPI)
1922        INTEGER :: msglen
1923        msglen = SIZE(msg)
1924        CALL parallel_min_integer( msglen, msg, gid, -1 )
1925#endif
1926      END SUBROUTINE mp_min_iv
1927!------------------------------------------------------------------------------!
1928      SUBROUTINE mp_min_r(msg,gid)
1929        IMPLICIT NONE
1930        REAL (DP), INTENT (INOUT) :: msg
1931        INTEGER, INTENT(IN) :: gid
1932#if defined(__MPI)
1933        INTEGER :: msglen
1934        msglen = 1
1935        CALL parallel_min_real( msglen, msg, gid, -1 )
1936#endif
1937      END SUBROUTINE mp_min_r
1938!
1939!------------------------------------------------------------------------------!
1940      SUBROUTINE mp_min_rv(msg,gid)
1941        IMPLICIT NONE
1942        REAL (DP), INTENT (INOUT) :: msg(:)
1943        INTEGER, INTENT(IN) :: gid
1944#if defined(__MPI)
1945        INTEGER :: msglen
1946        msglen = size(msg)
1947        CALL parallel_min_real( msglen, msg, gid, -1 )
1948#endif
1949      END SUBROUTINE mp_min_rv
1950
1951!------------------------------------------------------------------------------!
1952
1953      SUBROUTINE mp_barrier(gid)
1954        IMPLICIT NONE
1955        INTEGER, INTENT(IN) :: gid
1956        INTEGER :: ierr
1957#if defined(__MPI)
1958        CALL MPI_BARRIER(gid,IERR)
1959        IF (ierr/=0) CALL mp_stop( 8066 )
1960#endif
1961      END SUBROUTINE mp_barrier
1962
1963!------------------------------------------------------------------------------!
1964!.. Carlo Cavazzoni
1965!..mp_rank
1966      FUNCTION mp_rank( comm )
1967        IMPLICIT NONE
1968        INTEGER :: mp_rank
1969        INTEGER, INTENT(IN) :: comm
1970        INTEGER :: ierr, taskid
1971
1972        ierr = 0
1973        taskid = 0
1974#if defined(__MPI)
1975        CALL mpi_comm_rank(comm,taskid,ierr)
1976        IF (ierr/=0) CALL mp_stop( 8067 )
1977#endif
1978        mp_rank = taskid
1979      END FUNCTION mp_rank
1980
1981!------------------------------------------------------------------------------!
1982!.. Carlo Cavazzoni
1983!..mp_size
1984      FUNCTION mp_size( comm )
1985        IMPLICIT NONE
1986        INTEGER :: mp_size
1987        INTEGER, INTENT(IN) :: comm
1988        INTEGER :: ierr, numtask
1989
1990        ierr = 0
1991        numtask = 1
1992#if defined(__MPI)
1993        CALL mpi_comm_size(comm,numtask,ierr)
1994        IF (ierr/=0) CALL mp_stop( 8068 )
1995#endif
1996        mp_size = numtask
1997      END FUNCTION mp_size
1998
1999      SUBROUTINE mp_report
2000        INTEGER :: i
2001        WRITE( stdout, *)
2002#if defined(__MPI)
2003#  if defined (__MP_STAT)
2004        WRITE( stdout, 20 )
200520      FORMAT(3X,'please use an MPI profiler to analyze communications ')
2006#  endif
2007#else
2008        WRITE( stdout, *)
2009#endif
2010        RETURN
2011      END SUBROUTINE mp_report
2012
2013
2014!------------------------------------------------------------------------------!
2015!..mp_gatherv_rv
2016!..Carlo Cavazzoni
2017
2018      SUBROUTINE mp_gatherv_rv( mydata, alldata, recvcount, displs, root, gid)
2019        IMPLICIT NONE
2020        REAL(DP) :: mydata(:)
2021        REAL(DP) :: alldata(:)
2022        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
2023        INTEGER, INTENT(IN) :: gid
2024        INTEGER :: group
2025        INTEGER :: ierr, npe, myid
2026
2027#if defined (__MPI)
2028        group = gid
2029        CALL mpi_comm_size( group, npe, ierr )
2030        IF (ierr/=0) CALL mp_stop( 8069 )
2031        CALL mpi_comm_rank( group, myid, ierr )
2032        IF (ierr/=0) CALL mp_stop( 8070 )
2033        !
2034        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2035        IF ( myid == root ) THEN
2036           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
2037        END IF
2038        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
2039        !
2040        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, &
2041                         alldata, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr )
2042        IF (ierr/=0) CALL mp_stop( 8074 )
2043#else
2044        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
2045        IF ( SIZE( mydata  ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
2046        !
2047        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
2048#endif
2049        RETURN
2050      END SUBROUTINE mp_gatherv_rv
2051
2052!------------------------------------------------------------------------------!
2053!..mp_gatherv_cv
2054!..Carlo Cavazzoni
2055
2056      SUBROUTINE mp_gatherv_cv( mydata, alldata, recvcount, displs, root, gid)
2057        IMPLICIT NONE
2058        COMPLEX(DP) :: mydata(:)
2059        COMPLEX(DP) :: alldata(:)
2060        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
2061        INTEGER, INTENT(IN) :: gid
2062        INTEGER :: group
2063        INTEGER :: ierr, npe, myid
2064
2065#if defined (__MPI)
2066        group = gid
2067        CALL mpi_comm_size( group, npe, ierr )
2068        IF (ierr/=0) CALL mp_stop( 8069 )
2069        CALL mpi_comm_rank( group, myid, ierr )
2070        IF (ierr/=0) CALL mp_stop( 8070 )
2071        !
2072        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2073        IF ( myid == root ) THEN
2074           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
2075        END IF
2076        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
2077        !
2078        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, &
2079                         alldata, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr )
2080        IF (ierr/=0) CALL mp_stop( 8074 )
2081#else
2082        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
2083        IF ( SIZE( mydata  ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
2084        !
2085        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
2086#endif
2087        RETURN
2088      END SUBROUTINE mp_gatherv_cv
2089
2090!------------------------------------------------------------------------------!
2091!..mp_gatherv_rv
2092!..Carlo Cavazzoni
2093
2094      SUBROUTINE mp_gatherv_iv( mydata, alldata, recvcount, displs, root, gid)
2095        IMPLICIT NONE
2096        INTEGER :: mydata(:)
2097        INTEGER :: alldata(:)
2098        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
2099        INTEGER, INTENT(IN) :: gid
2100        INTEGER :: group
2101        INTEGER :: ierr, npe, myid
2102
2103#if defined (__MPI)
2104        group = gid
2105        CALL mpi_comm_size( group, npe, ierr )
2106        IF (ierr/=0) CALL mp_stop( 8069 )
2107        CALL mpi_comm_rank( group, myid, ierr )
2108        IF (ierr/=0) CALL mp_stop( 8070 )
2109        !
2110        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2111        IF ( myid == root ) THEN
2112           IF ( SIZE( alldata ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
2113        END IF
2114        IF ( SIZE( mydata ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
2115        !
2116        CALL MPI_GATHERV( mydata, recvcount( myid + 1 ), MPI_INTEGER, &
2117                         alldata, recvcount, displs, MPI_INTEGER, root, group, ierr )
2118        IF (ierr/=0) CALL mp_stop( 8074 )
2119#else
2120        IF ( SIZE( alldata ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
2121        IF ( SIZE( mydata  ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
2122        !
2123        alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
2124#endif
2125        RETURN
2126      END SUBROUTINE mp_gatherv_iv
2127
2128
2129!------------------------------------------------------------------------------!
2130!..mp_gatherv_rm
2131!..Carlo Cavazzoni
2132
2133      SUBROUTINE mp_gatherv_rm( mydata, alldata, recvcount, displs, root, gid)
2134        IMPLICIT NONE
2135        REAL(DP) :: mydata(:,:)  ! Warning first dimension is supposed constant!
2136        REAL(DP) :: alldata(:,:)
2137        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
2138        INTEGER, INTENT(IN) :: gid
2139        INTEGER :: group
2140        INTEGER :: ierr, npe, myid, nsiz
2141        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
2142
2143
2144#if defined (__MPI)
2145        group = gid
2146        CALL mpi_comm_size( group, npe, ierr )
2147        IF (ierr/=0) CALL mp_stop( 8069 )
2148        CALL mpi_comm_rank( group, myid, ierr )
2149        IF (ierr/=0) CALL mp_stop( 8070 )
2150        !
2151        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2152        IF ( myid == root ) THEN
2153           IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
2154           IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 )
2155        END IF
2156        IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
2157        !
2158        ALLOCATE( nrecv( npe ), ndisp( npe ) )
2159        !
2160        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 )
2161        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 )
2162        !
2163        CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, &
2164                         alldata, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr )
2165        IF (ierr/=0) CALL mp_stop( 8074 )
2166        !
2167        DEALLOCATE( nrecv, ndisp )
2168        !
2169#else
2170        IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 )
2171        IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
2172        IF ( SIZE( mydata, 2  ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
2173        !
2174        alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
2175#endif
2176        RETURN
2177      END SUBROUTINE mp_gatherv_rm
2178
2179!------------------------------------------------------------------------------!
2180!..mp_gatherv_im
2181!..Carlo Cavazzoni
2182
2183      SUBROUTINE mp_gatherv_im( mydata, alldata, recvcount, displs, root, gid)
2184        IMPLICIT NONE
2185        INTEGER :: mydata(:,:)  ! Warning first dimension is supposed constant!
2186        INTEGER :: alldata(:,:)
2187        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
2188        INTEGER, INTENT(IN) :: gid
2189        INTEGER :: group
2190        INTEGER :: ierr, npe, myid, nsiz
2191        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
2192
2193
2194#if defined (__MPI)
2195        group = gid
2196        CALL mpi_comm_size( group, npe, ierr )
2197        IF (ierr/=0) CALL mp_stop( 8069 )
2198        CALL mpi_comm_rank( group, myid, ierr )
2199        IF (ierr/=0) CALL mp_stop( 8070 )
2200        !
2201        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2202        IF ( myid == root ) THEN
2203           IF ( SIZE( alldata, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 8072 )
2204           IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8072 )
2205        END IF
2206        IF ( SIZE( mydata, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 8073 )
2207        !
2208        ALLOCATE( nrecv( npe ), ndisp( npe ) )
2209        !
2210        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata, 1 )
2211        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata, 1 )
2212        !
2213        CALL MPI_GATHERV( mydata, nrecv( myid + 1 ), MPI_INTEGER, &
2214                         alldata, nrecv, ndisp, MPI_INTEGER, root, group, ierr )
2215        IF (ierr/=0) CALL mp_stop( 8074 )
2216        !
2217        DEALLOCATE( nrecv, ndisp )
2218        !
2219#else
2220        IF ( SIZE( alldata, 1 ) /= SIZE( mydata, 1 ) ) CALL mp_stop( 8075 )
2221        IF ( SIZE( alldata, 2 ) < recvcount( 1 ) ) CALL mp_stop( 8075 )
2222        IF ( SIZE( mydata, 2  ) < recvcount( 1 ) ) CALL mp_stop( 8076 )
2223        !
2224        alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
2225#endif
2226        RETURN
2227      END SUBROUTINE mp_gatherv_im
2228
2229
2230!------------------------------------------------------------------------------!
2231!..mp_gatherv_inplace_cplx_array
2232!..Ye Luo
2233
2234      SUBROUTINE mp_gatherv_inplace_cplx_array(alldata, my_column_type, recvcount, displs, root, gid)
2235        IMPLICIT NONE
2236        COMPLEX(DP) :: alldata(:,:)
2237        INTEGER, INTENT(IN) :: my_column_type
2238        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
2239        INTEGER, INTENT(IN) :: root, gid
2240        INTEGER :: ierr, npe, myid
2241
2242#if defined (__MPI)
2243        CALL mpi_comm_size( gid, npe, ierr )
2244        IF (ierr/=0) CALL mp_stop( 8069 )
2245        CALL mpi_comm_rank( gid, myid, ierr )
2246        IF (ierr/=0) CALL mp_stop( 8070 )
2247        !
2248        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2249        !
2250        IF (myid==root) THEN
2251           CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
2252                             alldata, recvcount, displs, my_column_type, root, gid, ierr )
2253        ELSE
2254           CALL MPI_GATHERV( alldata(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, &
2255                             MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr )
2256        ENDIF
2257        IF (ierr/=0) CALL mp_stop( 8074 )
2258#endif
2259        RETURN
2260      END SUBROUTINE mp_gatherv_inplace_cplx_array
2261
2262!------------------------------------------------------------------------------!
2263!..mp_allgatherv_inplace_cplx_array
2264!..Ye Luo
2265
2266      SUBROUTINE mp_allgatherv_inplace_cplx_array(alldata, my_element_type, recvcount, displs, gid)
2267        IMPLICIT NONE
2268        COMPLEX(DP) :: alldata(:,:)
2269        INTEGER, INTENT(IN) :: my_element_type
2270        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
2271        INTEGER, INTENT(IN) :: gid
2272        INTEGER :: ierr, npe, myid
2273
2274#if defined (__MPI)
2275        CALL mpi_comm_size( gid, npe, ierr )
2276        IF (ierr/=0) CALL mp_stop( 8069 )
2277        CALL mpi_comm_rank( gid, myid, ierr )
2278        IF (ierr/=0) CALL mp_stop( 8070 )
2279        !
2280        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2281        !
2282        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
2283                             alldata, recvcount, displs, my_element_type, gid, ierr )
2284        IF (ierr/=0) CALL mp_stop( 8074 )
2285#endif
2286        RETURN
2287      END SUBROUTINE mp_allgatherv_inplace_cplx_array
2288
2289!.. SdG added 16/08/19
2290      SUBROUTINE mp_allgatherv_inplace_real_array(alldata, my_element_type, recvcount, displs, gid)
2291        IMPLICIT NONE
2292        REAL(DP) :: alldata(:,:)
2293        INTEGER, INTENT(IN) :: my_element_type
2294        INTEGER, INTENT(IN) :: recvcount(:), displs(:)
2295        INTEGER, INTENT(IN) :: gid
2296        INTEGER :: ierr, npe, myid
2297
2298#if defined (__MPI)
2299        CALL mpi_comm_size( gid, npe, ierr )
2300        IF (ierr/=0) CALL mp_stop( 8069 )
2301        CALL mpi_comm_rank( gid, myid, ierr )
2302        IF (ierr/=0) CALL mp_stop( 8070 )
2303        !
2304        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 8071 )
2305        !
2306        CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
2307                             alldata, recvcount, displs, my_element_type, gid, ierr )
2308        IF (ierr/=0) CALL mp_stop( 8074 )
2309#endif
2310        RETURN
2311      END SUBROUTINE mp_allgatherv_inplace_real_array
2312
2313!------------------------------------------------------------------------------!
2314
2315      SUBROUTINE mp_set_displs( recvcount, displs, ntot, nproc )
2316        !  Given the number of elements on each processor (recvcount), this subroutine
2317        !  sets the correct offsets (displs) to collect them on a single
2318        !  array with contiguous elemets
2319        IMPLICIT NONE
2320        INTEGER, INTENT(IN) :: recvcount(:) ! number of elements on each processor
2321        INTEGER, INTENT(OUT) :: displs(:)   ! offsets/displacements
2322        INTEGER, INTENT(OUT) :: ntot
2323        INTEGER, INTENT(IN) :: nproc
2324        INTEGER :: i
2325
2326        displs( 1 ) = 0
2327        !
2328#if defined (__MPI)
2329        IF( nproc < 1 ) CALL mp_stop( 8090 )
2330        DO i = 2, nproc
2331           displs( i ) = displs( i - 1 ) + recvcount( i - 1 )
2332        END DO
2333        ntot = displs( nproc ) + recvcount( nproc )
2334#else
2335        ntot = recvcount( 1 )
2336#endif
2337        RETURN
2338      END SUBROUTINE mp_set_displs
2339
2340!------------------------------------------------------------------------------!
2341
2342
2343SUBROUTINE mp_alltoall_c3d( sndbuf, rcvbuf, gid )
2344   IMPLICIT NONE
2345   COMPLEX(DP) :: sndbuf( :, :, : )
2346   COMPLEX(DP) :: rcvbuf( :, :, : )
2347   INTEGER, INTENT(IN) :: gid
2348   INTEGER :: nsiz, group, ierr, npe
2349
2350#if defined (__MPI)
2351
2352   group = gid
2353
2354   CALL mpi_comm_size( group, npe, ierr )
2355   IF (ierr/=0) CALL mp_stop( 8069 )
2356
2357   IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 )
2358   IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 )
2359
2360   nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 )
2361
2362   CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_DOUBLE_COMPLEX, &
2363                      rcvbuf, nsiz, MPI_DOUBLE_COMPLEX, group, ierr )
2364
2365   IF (ierr/=0) CALL mp_stop( 8074 )
2366
2367#else
2368
2369   rcvbuf = sndbuf
2370
2371#endif
2372
2373   RETURN
2374END SUBROUTINE mp_alltoall_c3d
2375
2376
2377!------------------------------------------------------------------------------!
2378
2379SUBROUTINE mp_alltoall_i3d( sndbuf, rcvbuf, gid )
2380   IMPLICIT NONE
2381   INTEGER :: sndbuf( :, :, : )
2382   INTEGER :: rcvbuf( :, :, : )
2383   INTEGER, INTENT(IN) :: gid
2384   INTEGER :: nsiz, group, ierr, npe
2385
2386#if defined (__MPI)
2387
2388   group = gid
2389
2390   CALL mpi_comm_size( group, npe, ierr )
2391   IF (ierr/=0) CALL mp_stop( 8069 )
2392
2393   IF ( SIZE( sndbuf, 3 ) < npe ) CALL mp_stop( 8069 )
2394   IF ( SIZE( rcvbuf, 3 ) < npe ) CALL mp_stop( 8069 )
2395
2396   nsiz = SIZE( sndbuf, 1 ) * SIZE( sndbuf, 2 )
2397
2398   CALL MPI_ALLTOALL( sndbuf, nsiz, MPI_INTEGER, &
2399                      rcvbuf, nsiz, MPI_INTEGER, group, ierr )
2400
2401   IF (ierr/=0) CALL mp_stop( 8074 )
2402
2403#else
2404
2405   rcvbuf = sndbuf
2406
2407#endif
2408
2409   RETURN
2410END SUBROUTINE mp_alltoall_i3d
2411
2412SUBROUTINE mp_circular_shift_left_i0( buf, itag, gid )
2413   IMPLICIT NONE
2414   INTEGER :: buf
2415   INTEGER, INTENT(IN) :: itag
2416   INTEGER, INTENT(IN) :: gid
2417   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2418
2419#if defined (__MPI)
2420
2421   INTEGER :: istatus( mpi_status_size )
2422   !
2423   group = gid
2424   !
2425   CALL mpi_comm_size( group, npe, ierr )
2426   IF (ierr/=0) CALL mp_stop( 8100 )
2427   CALL mpi_comm_rank( group, mype, ierr )
2428   IF (ierr/=0) CALL mp_stop( 8101 )
2429   !
2430   sour = mype + 1
2431   IF( sour == npe ) sour = 0
2432   dest = mype - 1
2433   IF( dest == -1 ) dest = npe - 1
2434   !
2435   CALL MPI_Sendrecv_replace( buf, 1, MPI_INTEGER, &
2436        dest, itag, sour, itag, group, istatus, ierr)
2437   !
2438   IF (ierr/=0) CALL mp_stop( 8102 )
2439   !
2440#else
2441   ! do nothing
2442#endif
2443   RETURN
2444END SUBROUTINE mp_circular_shift_left_i0
2445
2446
2447SUBROUTINE mp_circular_shift_left_i1( buf, itag, gid )
2448   IMPLICIT NONE
2449   INTEGER :: buf(:)
2450   INTEGER, INTENT(IN) :: itag
2451   INTEGER, INTENT(IN) :: gid
2452   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2453
2454#if defined (__MPI)
2455
2456   INTEGER :: istatus( mpi_status_size )
2457   !
2458   group = gid
2459   !
2460   CALL mpi_comm_size( group, npe, ierr )
2461   IF (ierr/=0) CALL mp_stop( 8100 )
2462   CALL mpi_comm_rank( group, mype, ierr )
2463   IF (ierr/=0) CALL mp_stop( 8101 )
2464   !
2465   sour = mype + 1
2466   IF( sour == npe ) sour = 0
2467   dest = mype - 1
2468   IF( dest == -1 ) dest = npe - 1
2469   !
2470   CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, &
2471        dest, itag, sour, itag, group, istatus, ierr)
2472   !
2473   IF (ierr/=0) CALL mp_stop( 8102 )
2474   !
2475#else
2476   ! do nothing
2477#endif
2478   RETURN
2479END SUBROUTINE mp_circular_shift_left_i1
2480
2481
2482SUBROUTINE mp_circular_shift_left_i2( buf, itag, gid )
2483   IMPLICIT NONE
2484   INTEGER :: buf(:,:)
2485   INTEGER, INTENT(IN) :: itag
2486   INTEGER, INTENT(IN) :: gid
2487   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2488
2489#if defined (__MPI)
2490
2491   INTEGER :: istatus( mpi_status_size )
2492   !
2493   group = gid
2494   !
2495   CALL mpi_comm_size( group, npe, ierr )
2496   IF (ierr/=0) CALL mp_stop( 8100 )
2497   CALL mpi_comm_rank( group, mype, ierr )
2498   IF (ierr/=0) CALL mp_stop( 8101 )
2499   !
2500   sour = mype + 1
2501   IF( sour == npe ) sour = 0
2502   dest = mype - 1
2503   IF( dest == -1 ) dest = npe - 1
2504   !
2505   CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_INTEGER, &
2506        dest, itag, sour, itag, group, istatus, ierr)
2507   !
2508   IF (ierr/=0) CALL mp_stop( 8102 )
2509   !
2510#else
2511   ! do nothing
2512#endif
2513   RETURN
2514END SUBROUTINE mp_circular_shift_left_i2
2515
2516
2517SUBROUTINE mp_circular_shift_left_r2d( buf, itag, gid )
2518   IMPLICIT NONE
2519   REAL(DP) :: buf( :, : )
2520   INTEGER, INTENT(IN) :: itag
2521   INTEGER, INTENT(IN) :: gid
2522   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2523
2524#if defined (__MPI)
2525
2526   INTEGER :: istatus( mpi_status_size )
2527   !
2528   group = gid
2529   !
2530   CALL mpi_comm_size( group, npe, ierr )
2531   IF (ierr/=0) CALL mp_stop( 8100 )
2532   CALL mpi_comm_rank( group, mype, ierr )
2533   IF (ierr/=0) CALL mp_stop( 8101 )
2534   !
2535   sour = mype + 1
2536   IF( sour == npe ) sour = 0
2537   dest = mype - 1
2538   IF( dest == -1 ) dest = npe - 1
2539   !
2540   CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_PRECISION, &
2541        dest, itag, sour, itag, group, istatus, ierr)
2542   !
2543   IF (ierr/=0) CALL mp_stop( 8102 )
2544   !
2545#else
2546   ! do nothing
2547#endif
2548   RETURN
2549END SUBROUTINE mp_circular_shift_left_r2d
2550
2551SUBROUTINE mp_circular_shift_left_c2d( buf, itag, gid )
2552   IMPLICIT NONE
2553   COMPLEX(DP) :: buf( :, : )
2554   INTEGER, INTENT(IN) :: itag
2555   INTEGER, INTENT(IN) :: gid
2556   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2557
2558#if defined (__MPI)
2559
2560   INTEGER :: istatus( mpi_status_size )
2561   !
2562   group = gid
2563   !
2564   CALL mpi_comm_size( group, npe, ierr )
2565   IF (ierr/=0) CALL mp_stop( 8100 )
2566   CALL mpi_comm_rank( group, mype, ierr )
2567   IF (ierr/=0) CALL mp_stop( 8101 )
2568   !
2569   sour = mype + 1
2570   IF( sour == npe ) sour = 0
2571   dest = mype - 1
2572   IF( dest == -1 ) dest = npe - 1
2573   !
2574   CALL MPI_Sendrecv_replace( buf, SIZE(buf), MPI_DOUBLE_COMPLEX, &
2575        dest, itag, sour, itag, group, istatus, ierr)
2576   !
2577   IF (ierr/=0) CALL mp_stop( 8102 )
2578   !
2579#else
2580   ! do nothing
2581#endif
2582   RETURN
2583END SUBROUTINE mp_circular_shift_left_c2d
2584
2585
2586!------------------------------------------------------------------------------!
2587!..mp_circular_shift_left_start
2588SUBROUTINE mp_circular_shift_left_start_i0( sendbuf, recvbuf, itag, gid, requests)
2589   IMPLICIT NONE
2590   INTEGER  :: sendbuf, recvbuf
2591   INTEGER, INTENT(IN) :: itag
2592   INTEGER, INTENT(IN) :: gid
2593   INTEGER, INTENT(INOUT) :: requests(2)
2594   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2595
2596#if defined (__MPI)
2597
2598   !set null requests
2599   requests = mpi_request_null
2600
2601   !communicator
2602   group = gid
2603   !
2604   CALL mpi_comm_size( group, npe, ierr )
2605   IF (ierr/=0) CALL mp_stop( 8100 )
2606   CALL mpi_comm_rank( group, mype, ierr )
2607   IF (ierr/=0) CALL mp_stop( 8101 )
2608   !
2609   sour = modulo(mype + 1, npe)
2610   dest = modulo(mype + npe - 1, npe)
2611   !
2612   CALL MPI_Irecv( recvbuf, 1, MPI_INTEGER, &
2613        sour, itag, group, requests(1), ierr)
2614   !
2615   IF (ierr/=0) CALL mp_stop( 8102 )
2616   !
2617   CALL MPI_Isend( sendbuf, 1, MPI_INTEGER, &
2618        dest, itag, group, requests(2), ierr)
2619   !
2620   IF (ierr/=0) CALL mp_stop( 8103 )
2621   !
2622#else
2623
2624   recvbuf = sendbuf
2625
2626#endif
2627   RETURN
2628END SUBROUTINE mp_circular_shift_left_start_i0
2629
2630
2631SUBROUTINE mp_circular_shift_left_start_i1( sendbuf, recvbuf, itag, gid, requests)
2632   IMPLICIT NONE
2633   INTEGER  :: sendbuf( : ), recvbuf( : )
2634   INTEGER, INTENT(IN) :: itag
2635   INTEGER, INTENT(IN) :: gid
2636   INTEGER, INTENT(INOUT) :: requests(2)
2637   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2638
2639#if defined (__MPI)
2640
2641   !set null requests
2642   requests = mpi_request_null
2643
2644   !communicator
2645   group = gid
2646   !
2647   IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
2648   CALL mpi_comm_size( group, npe, ierr )
2649   IF (ierr/=0) CALL mp_stop( 8100 )
2650   CALL mpi_comm_rank( group, mype, ierr )
2651   IF (ierr/=0) CALL mp_stop( 8101 )
2652   !
2653   sour = modulo(mype + 1, npe)
2654   dest = modulo(mype + npe - 1, npe)
2655   !
2656   CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, &
2657        sour, itag, group, requests(1), ierr)
2658   !
2659   IF (ierr/=0) CALL mp_stop( 8102 )
2660   !
2661   CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, &
2662        dest, itag, group, requests(2), ierr)
2663   !
2664   IF (ierr/=0) CALL mp_stop( 8103 )
2665   !
2666#else
2667
2668   recvbuf = sendbuf
2669
2670#endif
2671   RETURN
2672END SUBROUTINE mp_circular_shift_left_start_i1
2673
2674
2675SUBROUTINE mp_circular_shift_left_start_i2( sendbuf, recvbuf, itag, gid, requests)
2676   IMPLICIT NONE
2677   INTEGER  :: sendbuf( :, : ), recvbuf( :, : )
2678   INTEGER, INTENT(IN) :: itag
2679   INTEGER, INTENT(IN) :: gid
2680   INTEGER, INTENT(INOUT) :: requests(2)
2681   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2682
2683#if defined (__MPI)
2684
2685   !set null requests
2686   requests = mpi_request_null
2687
2688   !communicator
2689   group = gid
2690   !
2691   IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
2692   CALL mpi_comm_size( group, npe, ierr )
2693   IF (ierr/=0) CALL mp_stop( 8100 )
2694   CALL mpi_comm_rank( group, mype, ierr )
2695   IF (ierr/=0) CALL mp_stop( 8101 )
2696   !
2697   sour = modulo(mype + 1, npe)
2698   dest = modulo(mype + npe - 1, npe)
2699   !
2700   CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_INTEGER, &
2701        sour, itag, group, requests(1), ierr)
2702   !
2703   IF (ierr/=0) CALL mp_stop( 8102 )
2704   !
2705   CALL MPI_Isend( sendbuf, size(sendbuf), MPI_INTEGER, &
2706        dest, itag, group, requests(2), ierr)
2707   !
2708   IF (ierr/=0) CALL mp_stop( 8103 )
2709   !
2710#else
2711
2712   recvbuf = sendbuf
2713
2714#endif
2715   RETURN
2716END SUBROUTINE mp_circular_shift_left_start_i2
2717
2718
2719SUBROUTINE mp_circular_shift_left_start_r2d( sendbuf, recvbuf, itag, gid, requests)
2720   IMPLICIT NONE
2721   REAL(DP) :: sendbuf( :, : ), recvbuf( :, : )
2722   INTEGER, INTENT(IN) :: itag
2723   INTEGER, INTENT(IN) :: gid
2724   INTEGER, INTENT(INOUT) :: requests(2)
2725   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2726
2727#if defined (__MPI)
2728
2729   !set null requests
2730   requests = mpi_request_null
2731
2732   !communicator
2733   group = gid
2734   !
2735   IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
2736   CALL mpi_comm_size( group, npe, ierr )
2737   IF (ierr/=0) CALL mp_stop( 8100 )
2738   CALL mpi_comm_rank( group, mype, ierr )
2739   IF (ierr/=0) CALL mp_stop( 8101 )
2740   !
2741   sour = modulo(mype + 1, npe)
2742   dest = modulo(mype + npe - 1, npe)
2743   !
2744   CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_PRECISION, &
2745        sour, itag, group, requests(1), ierr)
2746   !
2747   IF (ierr/=0) CALL mp_stop( 8102 )
2748   !
2749   CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_PRECISION, &
2750        dest, itag, group, requests(2), ierr)
2751   !
2752   IF (ierr/=0) CALL mp_stop( 8103 )
2753   !
2754#else
2755
2756   recvbuf = sendbuf
2757
2758#endif
2759   RETURN
2760END SUBROUTINE mp_circular_shift_left_start_r2d
2761
2762
2763SUBROUTINE mp_circular_shift_left_start_c2d( sendbuf, recvbuf, itag, gid, requests)
2764   IMPLICIT NONE
2765   COMPLEX(DP) :: sendbuf( :, : ), recvbuf( :, : )
2766   INTEGER, INTENT(IN) :: itag
2767   INTEGER, INTENT(IN) :: gid
2768   INTEGER, INTENT(INOUT) :: requests(2)
2769   INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
2770
2771#if defined (__MPI)
2772
2773   !set null requests
2774   requests = mpi_request_null
2775
2776   !communicator
2777   group = gid
2778   !
2779   IF( size(sendbuf)/=size(recvbuf) ) CALL mp_stop(8099)
2780   CALL mpi_comm_size( group, npe, ierr )
2781   IF (ierr/=0) CALL mp_stop( 8100 )
2782   CALL mpi_comm_rank( group, mype, ierr )
2783   IF (ierr/=0) CALL mp_stop( 8101 )
2784   !
2785   sour = modulo(mype + 1, npe)
2786   dest = modulo(mype + npe - 1, npe)
2787   !
2788   CALL MPI_Irecv( recvbuf, size(recvbuf), MPI_DOUBLE_COMPLEX, &
2789        sour, itag, group, requests(1), ierr)
2790   !
2791   IF (ierr/=0) CALL mp_stop( 8102 )
2792   !
2793   CALL MPI_Isend( sendbuf, size(sendbuf), MPI_DOUBLE_COMPLEX, &
2794        dest, itag, group, requests(2), ierr)
2795   !
2796   IF (ierr/=0) CALL mp_stop( 8103 )
2797   !
2798#else
2799
2800   recvbuf = sendbuf
2801
2802#endif
2803   RETURN
2804END SUBROUTINE mp_circular_shift_left_start_c2d
2805!
2806!
2807!------------------------------------------------------------------------------!
2808!..mp_count_nodes
2809SUBROUTINE mp_count_nodes(num_nodes, color, key, group)
2810  !
2811  ! ... This routine counts the number of nodes using
2812  ! ...  MPI_GET_PROCESSOR_NAME in the group specified by `group`.
2813  ! ...  It returns colors and keys to be used in MPI_COMM_SPLIT.
2814  ! ...  When running in parallel, the evaluation of color and key
2815  ! ...  is done by all processors.
2816  ! ...
2817  ! ...
2818  ! ... input:
2819  ! ...    group      Communicator used to count nodes.
2820  !
2821  ! ... output:
2822  ! ...    num_nodes  Number of unique nodes in the communicator
2823  ! ...    color      Integer (positive), same for all processes residing on a node.
2824  ! ...    key        Integer, unique number identifying each process on the same node.
2825  ! ...
2826  IMPLICIT NONE
2827  INTEGER, INTENT (OUT) :: num_nodes
2828  INTEGER, INTENT (OUT) :: color
2829  INTEGER, INTENT (OUT) :: key
2830  INTEGER, INTENT (IN)  :: group
2831#if defined (__MPI)
2832  CHARACTER(len=MPI_MAX_PROCESSOR_NAME) :: hostname
2833  CHARACTER(len=MPI_MAX_PROCESSOR_NAME), ALLOCATABLE :: host_list(:)
2834#endif
2835
2836  LOGICAL, ALLOCATABLE   :: found_list(:)
2837  INTEGER, ALLOCATABLE   :: color_list(:)
2838  INTEGER, ALLOCATABLE   :: key_list(:)
2839  !
2840  INTEGER :: hostname_len, max_hostname_len, numtask, me, ierr
2841  !
2842  ! Loops variables
2843  INTEGER :: i, j, e, s, c, k
2844  ! ...
2845  ierr      = 0
2846  num_nodes = 1
2847  color     = 1
2848  key       = 0
2849  !
2850#if defined(__MPI)
2851  !
2852  CALL MPI_GET_PROCESSOR_NAME(hostname, hostname_len, ierr)
2853  IF (ierr/=0)  CALL mp_stop( 8103 )
2854
2855  ! find total number of ranks and my rank in communicator
2856  CALL MPI_COMM_SIZE(group, numtask, ierr)
2857  IF (ierr/=0) CALL mp_stop( 8104 )
2858  !
2859  CALL MPI_COMM_RANK(group, me, ierr)
2860  IF (ierr/=0) CALL mp_stop( 8105 )
2861  !
2862  ALLOCATE(host_list(0:numtask-1))
2863  !
2864  host_list(me) = hostname(1:hostname_len)
2865  !
2866  ! Each process broadcast its name to the others
2867  DO i=0,numtask-1
2868    CALL MPI_BCAST(host_list(i), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER,&
2869                     i, group, ierr)
2870    IF (ierr/=0) CALL mp_stop( 8106 )
2871  END DO
2872  !
2873  ! Simple algorithm to count unique entries.
2874  !
2875  ALLOCATE(found_list(0:numtask-1),color_list(0:numtask-1))
2876  ALLOCATE(key_list(0:numtask-1))
2877  found_list(:) = .false.
2878  color_list(:) = -1
2879  key_list(:)   = -1
2880  !
2881  ! c is the counter for colors
2882  ! k is the counter for keys
2883  !
2884  c = 0
2885  DO i=0,numtask-1
2886    ! if node_counter == .true., this element has already been found,
2887    ! so skip it.
2888    IF (found_list(i)) CYCLE
2889    ! else increment color counter and reset key counter
2890    c = c + 1; k = 0
2891    color_list(i) = c
2892    key_list(i)   = k
2893    !
2894    DO j=i+1,numtask-1
2895      !
2896      IF ( LLE(host_list(i),host_list(j)) .and. &
2897           LGE(host_list(i),host_list(j))        ) THEN
2898        ! increment the key, key=0 is the one we are comparing to
2899        k = k + 1
2900        ! element should not be already found
2901        IF ( found_list(j) ) CALL mp_stop( 8107 )
2902        found_list(j) = .true.
2903        color_list(j) = c
2904        key_list(j)   = k
2905      END IF
2906    END DO
2907  END DO
2908  ! Sanity checks
2909  IF ( MINVAL(color_list) < 0 ) CALL mp_stop( 8108 )
2910  IF ( MINVAL(key_list)   < 0 ) CALL mp_stop( 8109 )
2911  !
2912  color     = color_list(me)
2913  key       = key_list(me)
2914  num_nodes = MAXVAL(color_list)
2915  DEALLOCATE(host_list,found_list,color_list,key_list)
2916!
2917#endif
2918  RETURN
2919END SUBROUTINE mp_count_nodes
2920!
2921FUNCTION mp_get_comm_null( )
2922  IMPLICIT NONE
2923  INTEGER :: mp_get_comm_null
2924  mp_get_comm_null = MPI_COMM_NULL
2925END FUNCTION mp_get_comm_null
2926
2927FUNCTION mp_get_comm_self( )
2928  IMPLICIT NONE
2929  INTEGER :: mp_get_comm_self
2930  mp_get_comm_self = MPI_COMM_SELF
2931END FUNCTION mp_get_comm_self
2932
2933SUBROUTINE mp_type_create_cplx_column_section(dummy, start, length, stride, mytype)
2934  IMPLICIT NONE
2935  !
2936  COMPLEX (DP), INTENT(IN) :: dummy
2937  INTEGER, INTENT(IN) :: start, length, stride
2938  INTEGER, INTENT(OUT) :: mytype
2939  !
2940#if defined(__MPI)
2941  INTEGER :: ierr
2942  !
2943  CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,&
2944                                MPI_DOUBLE_COMPLEX, mytype, ierr)
2945  IF (ierr/=0) CALL mp_stop( 8081 )
2946  CALL MPI_Type_commit(mytype, ierr)
2947  IF (ierr/=0) CALL mp_stop( 8082 )
2948#else
2949  mytype = 0;
2950#endif
2951  !
2952  RETURN
2953END SUBROUTINE mp_type_create_cplx_column_section
2954
2955SUBROUTINE mp_type_create_real_column_section(dummy, start, length, stride, mytype)
2956  IMPLICIT NONE
2957  !
2958  REAL (DP), INTENT(IN) :: dummy
2959  INTEGER, INTENT(IN) :: start, length, stride
2960  INTEGER, INTENT(OUT) :: mytype
2961  !
2962#if defined(__MPI)
2963  INTEGER :: ierr
2964  !
2965  CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,&
2966                                MPI_DOUBLE_PRECISION, mytype, ierr)
2967  IF (ierr/=0) CALL mp_stop( 8083 )
2968  CALL MPI_Type_commit(mytype, ierr)
2969  IF (ierr/=0) CALL mp_stop( 8084 )
2970#else
2971  mytype = 0;
2972#endif
2973  !
2974  RETURN
2975END SUBROUTINE mp_type_create_real_column_section
2976
2977SUBROUTINE mp_type_create_cplx_row_section(dummy, column_start, column_stride, row_length, mytype)
2978  IMPLICIT NONE
2979  !
2980  COMPLEX (DP), INTENT(IN) :: dummy
2981  INTEGER, INTENT(IN) :: column_start, column_stride, row_length
2982  INTEGER, INTENT(OUT) :: mytype
2983  !
2984#if defined(__MPI)
2985  INTEGER :: ierr, temporary
2986  INTEGER :: strides(2), lengths(2), starts(2)
2987  INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
2988  !
2989  strides(1) = column_stride ; strides(2) = row_length
2990  lengths(1) = 1             ; lengths(2) = row_length
2991  starts(1)  = column_start  ; starts(2)  = 0
2992  CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
2993                                MPI_DOUBLE_COMPLEX, temporary, ierr)
2994  IF (ierr/=0) CALL mp_stop( 8085 )
2995  CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr)
2996  IF (ierr/=0) CALL mp_stop( 8085 )
2997  CALL MPI_TYPE_COMMIT(temporary, ierr)
2998  IF (ierr/=0) CALL mp_stop( 8085 )
2999  CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
3000  IF (ierr/=0) CALL mp_stop( 8086 )
3001  CALL MPI_Type_commit(mytype, ierr)
3002  IF (ierr/=0) CALL mp_stop( 8086 )
3003#else
3004  mytype = 0;
3005#endif
3006  !
3007  RETURN
3008END SUBROUTINE mp_type_create_cplx_row_section
3009
3010SUBROUTINE mp_type_create_real_row_section(dummy, column_start, column_stride, row_length, mytype)
3011  IMPLICIT NONE
3012  !
3013  REAL (DP), INTENT(IN) :: dummy
3014  INTEGER, INTENT(IN) :: column_start, column_stride, row_length
3015  INTEGER, INTENT(OUT) :: mytype
3016  !
3017#if defined(__MPI)
3018  INTEGER :: ierr, temporary
3019  INTEGER :: strides(2), lengths(2), starts(2)
3020  INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
3021  !
3022  strides(1) = column_stride ; strides(2) = row_length
3023  lengths(1) = 1             ; lengths(2) = row_length
3024  starts(1)  = column_start  ; starts(2)  = 0
3025  CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
3026                                MPI_DOUBLE_PRECISION, temporary, ierr)
3027  IF (ierr/=0) CALL mp_stop( 8087 )
3028  CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr)
3029  IF (ierr/=0) CALL mp_stop( 8087 )
3030  CALL MPI_TYPE_COMMIT(temporary, ierr)
3031  IF (ierr/=0) CALL mp_stop( 8087 )
3032  CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
3033  IF (ierr/=0) CALL mp_stop( 8088 )
3034  CALL MPI_Type_commit(mytype, ierr)
3035  IF (ierr/=0) CALL mp_stop( 8088 )
3036#else
3037  mytype = 0;
3038#endif
3039  !
3040  RETURN
3041END SUBROUTINE mp_type_create_real_row_section
3042
3043SUBROUTINE mp_type_free(mytype)
3044  IMPLICIT NONE
3045  INTEGER :: mytype, ierr
3046  !
3047#if defined(__MPI)
3048  CALL MPI_TYPE_FREE(mytype, ierr)
3049  IF (ierr/=0) CALL mp_stop( 8089 )
3050#endif
3051  !
3052  RETURN
3053END SUBROUTINE mp_type_free
3054!------------------------------------------------------------------------------!
3055!  GPU specific subroutines (Pietro Bonfa')
3056!------------------------------------------------------------------------------!
3057! Before hacking on the CUDA part remember that:
3058!
3059! 1. all mp_* interface should be blocking with respect to both MPI and CUDA.
3060!    MPI will only wait for completion on the default stream therefore device
3061!    synchronization must be enforced.
3062! 2. Host -> device memory copies of a memory block of 64 KB or less are
3063!    asynchronous in the sense that they may return before the data is actually
3064!    available on the GPU. However, the user is still free to change the buffer
3065!    as soon as those calls return with no ill effects.
3066!    (https://devtalk.nvidia.com/default/topic/471866/cuda-programming-and-performance/host-device-memory-copies-up-to-64-kb-are-asynchronous/)
3067! 3. For transfers from device to either pageable or pinned host memory,
3068!    the function returns only once the copy has completed.
3069! 4. GPU synchronization is always enforced even if no communication takes place.
3070!------------------------------------------------------------------------------!
3071#ifdef __CUDA
3072
3073!------------------------------------------------------------------------------!
3074!..mp_bcast
3075
3076      SUBROUTINE mp_bcast_i1_gpu(msg_d,source,gid)
3077        IMPLICIT NONE
3078        INTEGER, DEVICE :: msg_d
3079        INTEGER :: msg_h
3080        INTEGER :: source
3081        INTEGER, INTENT(IN) :: gid
3082        INTEGER :: group
3083        INTEGER :: msglen, ierr
3084        !
3085#if defined(__MPI)
3086        msglen = 1
3087        group = gid
3088#if defined(__GPU_MPI)
3089        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI case
3090        CALL bcast_integer_gpu( msg_d, msglen, source, group )
3091        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3092#else
3093        msg_h = msg_d                   ! This syncs __MPI case
3094        CALL bcast_integer( msg_h, msglen, source, group )
3095        msg_d = msg_h
3096#endif
3097#endif
3098        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3099      END SUBROUTINE mp_bcast_i1_gpu
3100!
3101!------------------------------------------------------------------------------!
3102!
3103      SUBROUTINE mp_bcast_iv_gpu(msg_d,source,gid)
3104        IMPLICIT NONE
3105        INTEGER, DEVICE :: msg_d(:)
3106        INTEGER, ALLOCATABLE :: msg_h(:)
3107        INTEGER, INTENT(IN) :: source
3108        INTEGER, INTENT(IN) :: gid
3109        INTEGER :: msglen, ierr
3110        !
3111#if defined(__MPI)
3112#if defined(__GPU_MPI)
3113        msglen = size(msg_d)
3114        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3115        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
3116        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3117#else
3118        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3119        msglen = size(msg_h)
3120        CALL bcast_integer( msg_h, msglen, source, gid )
3121        msg_d = msg_h; DEALLOCATE( msg_h )
3122#endif
3123#endif
3124        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3125      END SUBROUTINE mp_bcast_iv_gpu
3126!
3127!------------------------------------------------------------------------------!
3128!
3129      SUBROUTINE mp_bcast_im_gpu( msg_d, source, gid )
3130        IMPLICIT NONE
3131        INTEGER, DEVICE :: msg_d(:,:)
3132        INTEGER, ALLOCATABLE :: msg_h(:,:)
3133        INTEGER, INTENT(IN) :: source
3134        INTEGER, INTENT(IN) :: gid
3135        INTEGER :: msglen, ierr
3136#if defined(__MPI)
3137#if defined(__GPU_MPI)
3138        msglen = size(msg_d)
3139        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3140        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
3141        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3142#else
3143        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3144        msglen = size(msg_h)
3145        CALL bcast_integer( msg_h, msglen, source, gid )
3146        msg_d = msg_h; DEALLOCATE( msg_h )
3147#endif
3148#endif
3149        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3150      END SUBROUTINE mp_bcast_im_gpu
3151!
3152!------------------------------------------------------------------------------!
3153!
3154      SUBROUTINE mp_bcast_it_gpu( msg_d, source, gid )
3155        IMPLICIT NONE
3156        INTEGER, DEVICE :: msg_d(:,:,:)
3157
3158        INTEGER, INTENT(IN) :: source
3159        INTEGER, INTENT(IN) :: gid
3160        INTEGER :: msglen, ierr
3161#if defined(__MPI)
3162#if defined(__GPU_MPI)
3163        msglen = size(msg_d)
3164        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3165        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
3166        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3167#else
3168        INTEGER, ALLOCATABLE :: msg_h(:,:,:)
3169        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3170        msglen = size(msg_h)
3171        CALL bcast_integer( msg_h, msglen, source, gid )
3172        msg_d = msg_h; DEALLOCATE( msg_h )
3173#endif
3174#endif
3175        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3176      END SUBROUTINE mp_bcast_it_gpu
3177!
3178!------------------------------------------------------------------------------!
3179!
3180      SUBROUTINE mp_bcast_i4d_gpu(msg_d, source, gid)
3181        IMPLICIT NONE
3182        INTEGER, DEVICE :: msg_d(:,:,:,:)
3183        INTEGER, ALLOCATABLE :: msg_h(:,:,:,:)
3184        INTEGER, INTENT(IN) :: source
3185        INTEGER, INTENT(IN) :: gid
3186        INTEGER :: msglen, ierr
3187#if defined(__MPI)
3188#if defined(__GPU_MPI)
3189        msglen = size(msg_d)
3190        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3191        CALL bcast_integer_gpu( msg_d, msglen, source, gid )
3192        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3193#else
3194        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3195        msglen = size(msg_h)
3196        CALL bcast_integer( msg_h, msglen, source, gid )
3197        msg_d = msg_h; DEALLOCATE( msg_h )
3198#endif
3199#endif
3200        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3201      END SUBROUTINE mp_bcast_i4d_gpu
3202!
3203!------------------------------------------------------------------------------!
3204!
3205      SUBROUTINE mp_bcast_r1_gpu( msg_d, source, gid )
3206        IMPLICIT NONE
3207        REAL (DP), DEVICE :: msg_d
3208        REAL (DP) :: msg_h
3209        INTEGER, INTENT(IN) :: source
3210        INTEGER, INTENT(IN) :: gid
3211        INTEGER :: msglen, ierr
3212#if defined(__MPI)
3213        msglen = 1
3214#if defined(__GPU_MPI)
3215        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3216        CALL bcast_real_gpu( msg_d, msglen, source, gid )
3217        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3218#else
3219        msg_h=msg_d                         ! This syncs __MPI case
3220        CALL bcast_real( msg_h, msglen, source, gid )
3221        msg_d = msg_h
3222#endif
3223#endif
3224        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3225      END SUBROUTINE mp_bcast_r1_gpu
3226!
3227!------------------------------------------------------------------------------!
3228!
3229      SUBROUTINE mp_bcast_rv_gpu(msg_d,source,gid)
3230        IMPLICIT NONE
3231        REAL (DP), DEVICE :: msg_d(:)
3232        REAL (DP), ALLOCATABLE :: msg_h(:)
3233        INTEGER, INTENT(IN) :: source
3234        INTEGER, INTENT(IN) :: gid
3235        INTEGER :: msglen, ierr
3236#if defined(__MPI)
3237#if defined(__GPU_MPI)
3238        msglen = size(msg_d)
3239        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3240        CALL bcast_real_gpu( msg_d, msglen, source, gid )
3241        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3242#else
3243        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3244        msglen = size(msg_h)
3245        CALL bcast_real( msg_h, msglen, source, gid )
3246        msg_d = msg_h ; DEALLOCATE(msg_h)
3247#endif
3248#endif
3249        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3250      END SUBROUTINE mp_bcast_rv_gpu
3251!
3252!------------------------------------------------------------------------------!
3253!
3254      SUBROUTINE mp_bcast_rm_gpu(msg_d,source,gid)
3255        IMPLICIT NONE
3256        REAL (DP), DEVICE :: msg_d(:,:)
3257        INTEGER, INTENT(IN) :: source
3258        INTEGER, INTENT(IN) :: gid
3259        INTEGER :: msglen, ierr
3260#if defined(__MPI)
3261#if defined(__GPU_MPI)
3262        msglen = size(msg_d)
3263        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3264        CALL bcast_real_gpu( msg_d, msglen, source, gid )
3265        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3266#else
3267        REAL (DP), ALLOCATABLE :: msg_h(:,:)
3268        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3269        msglen = size(msg_h)
3270        CALL bcast_real( msg_h, msglen, source, gid )
3271        msg_d = msg_h ; DEALLOCATE(msg_h)
3272#endif
3273#endif
3274        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3275      END SUBROUTINE mp_bcast_rm_gpu
3276!
3277!------------------------------------------------------------------------------!
3278!
3279      SUBROUTINE mp_bcast_rt_gpu(msg_d,source,gid)
3280        IMPLICIT NONE
3281        REAL (DP), DEVICE :: msg_d(:,:,:)
3282        REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
3283        INTEGER, INTENT(IN) :: source
3284        INTEGER, INTENT(IN) :: gid
3285        INTEGER :: msglen, ierr
3286#if defined(__MPI)
3287#if defined(__GPU_MPI)
3288        msglen = size(msg_d)
3289        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3290        CALL bcast_real_gpu( msg_d, msglen, source, gid )
3291        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3292#else
3293        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3294        msglen = size(msg_h)
3295        CALL bcast_real( msg_h, msglen, source, gid )
3296        msg_d = msg_h ; DEALLOCATE(msg_h)
3297#endif
3298#endif
3299        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3300      END SUBROUTINE mp_bcast_rt_gpu
3301!
3302!------------------------------------------------------------------------------!
3303!
3304      SUBROUTINE mp_bcast_r4d_gpu(msg_d, source, gid)
3305        IMPLICIT NONE
3306        REAL (DP), DEVICE :: msg_d(:,:,:,:)
3307        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:)
3308        INTEGER, INTENT(IN) :: source
3309        INTEGER, INTENT(IN) :: gid
3310        INTEGER :: msglen, ierr
3311#if defined(__MPI)
3312#if defined(__GPU_MPI)
3313        msglen = size(msg_d)
3314        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3315        CALL bcast_real_gpu( msg_d, msglen, source, gid )
3316        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3317#else
3318        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3319        msglen = size(msg_h)
3320        CALL bcast_real( msg_h, msglen, source, gid )
3321        msg_d = msg_h ; DEALLOCATE(msg_h)
3322#endif
3323#endif
3324        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3325      END SUBROUTINE mp_bcast_r4d_gpu
3326!
3327!------------------------------------------------------------------------------!
3328!
3329      SUBROUTINE mp_bcast_r5d_gpu(msg_d, source, gid)
3330        IMPLICIT NONE
3331        REAL (DP), DEVICE :: msg_d(:,:,:,:,:)
3332        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
3333        INTEGER, INTENT(IN) :: source
3334        INTEGER, INTENT(IN) :: gid
3335        INTEGER :: msglen, ierr
3336#if defined(__MPI)
3337#if defined(__GPU_MPI)
3338        msglen = size(msg_d)
3339        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3340        CALL bcast_real_gpu( msg_d, msglen, source, gid )
3341        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3342#else
3343        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3344        msglen = size(msg_h)
3345        CALL bcast_real( msg_h, msglen, source, gid )
3346        msg_d = msg_h ; DEALLOCATE(msg_h)
3347#endif
3348#endif
3349        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3350      END SUBROUTINE mp_bcast_r5d_gpu
3351!
3352!------------------------------------------------------------------------------!
3353!
3354      SUBROUTINE mp_bcast_c1_gpu(msg_d,source,gid)
3355        IMPLICIT NONE
3356        COMPLEX (DP), DEVICE :: msg_d
3357        COMPLEX (DP) :: msg_h
3358        INTEGER, INTENT(IN) :: source
3359        INTEGER, INTENT(IN) :: gid
3360        INTEGER :: msglen, ierr
3361#if defined(__MPI)
3362        msglen = 1
3363#if defined(__GPU_MPI)
3364        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3365        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3366        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3367#else
3368        msg_h=msg_d                         ! This syncs __MPI case
3369        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3370        msg_d = msg_h
3371#endif
3372#endif
3373        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3374      END SUBROUTINE mp_bcast_c1_gpu
3375!
3376!------------------------------------------------------------------------------!
3377!
3378      SUBROUTINE mp_bcast_cv_gpu(msg_d,source,gid)
3379        IMPLICIT NONE
3380        COMPLEX (DP), DEVICE :: msg_d(:)
3381        COMPLEX (DP), ALLOCATABLE :: msg_h(:)
3382        INTEGER, INTENT(IN) :: source
3383        INTEGER, INTENT(IN) :: gid
3384        INTEGER :: msglen, ierr
3385#if defined(__MPI)
3386#if defined(__GPU_MPI)
3387        msglen = size(msg_d)
3388        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3389        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3390        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3391#else
3392        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3393        msglen = size(msg_h)
3394        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3395        msg_d = msg_h ; DEALLOCATE(msg_h)
3396#endif
3397#endif
3398        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3399      END SUBROUTINE mp_bcast_cv_gpu
3400!
3401!------------------------------------------------------------------------------!
3402!
3403      SUBROUTINE mp_bcast_cm_gpu(msg_d,source,gid)
3404        IMPLICIT NONE
3405        COMPLEX (DP), DEVICE :: msg_d(:,:)
3406        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
3407        INTEGER, INTENT(IN) :: source
3408        INTEGER, INTENT(IN) :: gid
3409        INTEGER :: msglen, ierr
3410#if defined(__MPI)
3411#if defined(__GPU_MPI)
3412        msglen = size(msg_d)
3413        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3414        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3415        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3416#else
3417        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3418        msglen = size(msg_h)
3419        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3420        msg_d = msg_h ; DEALLOCATE(msg_h)
3421#endif
3422#endif
3423        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3424      END SUBROUTINE mp_bcast_cm_gpu
3425!
3426!------------------------------------------------------------------------------!
3427!
3428      SUBROUTINE mp_bcast_ct_gpu(msg_d,source,gid)
3429        IMPLICIT NONE
3430        COMPLEX (DP), DEVICE :: msg_d(:,:,:)
3431        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:)
3432        INTEGER, INTENT(IN) :: source
3433        INTEGER, INTENT(IN) :: gid
3434        INTEGER :: msglen, ierr
3435#if defined(__MPI)
3436#if defined(__GPU_MPI)
3437        msglen = size(msg_d)
3438        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3439        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3440        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3441#else
3442        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3443        msglen = size(msg_h)
3444        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3445        msg_d = msg_h ; DEALLOCATE(msg_h)
3446#endif
3447#endif
3448        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3449      END SUBROUTINE mp_bcast_ct_gpu
3450!
3451!------------------------------------------------------------------------------!
3452!
3453      SUBROUTINE mp_bcast_c4d_gpu(msg_d,source,gid)
3454        IMPLICIT NONE
3455        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:)
3456        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:)
3457        INTEGER, INTENT(IN) :: source
3458        INTEGER, INTENT(IN) :: gid
3459        INTEGER :: msglen, ierr
3460#if defined(__MPI)
3461#if defined(__GPU_MPI)
3462        msglen = size(msg_d)
3463        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3464        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3465        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3466#else
3467        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3468        msglen = size(msg_h)
3469        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3470        msg_d = msg_h ; DEALLOCATE(msg_h)
3471#endif
3472#endif
3473        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3474      END SUBROUTINE mp_bcast_c4d_gpu
3475!
3476!------------------------------------------------------------------------------!
3477!
3478      SUBROUTINE mp_bcast_c5d_gpu(msg_d,source,gid)
3479        IMPLICIT NONE
3480        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:)
3481        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
3482        INTEGER, INTENT(IN) :: source
3483        INTEGER, INTENT(IN) :: gid
3484        INTEGER :: msglen, ierr
3485#if defined(__MPI)
3486#if defined(__GPU_MPI)
3487        msglen = size(msg_d)
3488        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3489        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3490        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3491#else
3492        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3493        msglen = size(msg_h)
3494        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3495        msg_d = msg_h ; DEALLOCATE(msg_h)
3496#endif
3497#endif
3498        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3499      END SUBROUTINE mp_bcast_c5d_gpu
3500!
3501!------------------------------------------------------------------------------!
3502!
3503      SUBROUTINE mp_bcast_c6d_gpu(msg_d,source,gid)
3504        IMPLICIT NONE
3505        COMPLEX (DP), DEVICE :: msg_d(:,:,:,:,:,:)
3506        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
3507        INTEGER, INTENT(IN) :: source
3508        INTEGER, INTENT(IN) :: gid
3509        INTEGER :: msglen, ierr
3510#if defined(__MPI)
3511#if defined(__GPU_MPI)
3512        msglen = size(msg_d)
3513        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3514        CALL bcast_real_gpu( msg_d, 2 * msglen, source, gid )
3515        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3516#else
3517        ALLOCATE( msg_h, source=msg_d )     ! This syncs __MPI case
3518        msglen = size(msg_h)
3519        CALL bcast_real( msg_h, 2 * msglen, source, gid )
3520        msg_d = msg_h ; DEALLOCATE(msg_h)
3521#endif
3522#endif
3523        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3524      END SUBROUTINE mp_bcast_c6d_gpu
3525!
3526!------------------------------------------------------------------------------!
3527!
3528      SUBROUTINE mp_bcast_l_gpu(msg_d,source,gid)
3529        IMPLICIT NONE
3530        LOGICAL, DEVICE :: msg_d
3531        LOGICAL         :: msg_h
3532        INTEGER, INTENT(IN) :: source
3533        INTEGER, INTENT(IN) :: gid
3534        INTEGER :: msglen, ierr
3535#if defined(__MPI)
3536        msglen = 1
3537#if defined(__GPU_MPI)
3538        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3539        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
3540        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3541#else
3542        msg_h = msg_d                       ! This syncs __MPI case
3543        CALL bcast_logical( msg_h, msglen, source, gid )
3544        msg_d = msg_h
3545#endif
3546#endif
3547        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3548      END SUBROUTINE mp_bcast_l_gpu
3549!
3550!------------------------------------------------------------------------------!
3551!
3552      SUBROUTINE mp_bcast_lv_gpu(msg_d,source,gid)
3553        IMPLICIT NONE
3554        LOGICAL, DEVICE :: msg_d(:)
3555        INTEGER, INTENT(IN) :: source
3556        INTEGER, INTENT(IN) :: gid
3557        INTEGER :: msglen, ierr
3558#if defined(__MPI)
3559#if defined(__GPU_MPI)
3560        msglen = size(msg_d)
3561        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3562        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
3563        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3564#else
3565        LOGICAL, ALLOCATABLE :: msg_h(:)
3566        ALLOCATE(msg_h, source=msg_d)       ! This syncs __MPI case
3567        msglen = size(msg_h)
3568        CALL bcast_logical( msg_h, msglen, source, gid )
3569        msg_d = msg_h; DEALLOCATE(msg_h)
3570#endif
3571#endif
3572        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3573      END SUBROUTINE mp_bcast_lv_gpu
3574!
3575!------------------------------------------------------------------------------!
3576!
3577      SUBROUTINE mp_bcast_lm_gpu(msg_d,source,gid)
3578        IMPLICIT NONE
3579        LOGICAL, DEVICE :: msg_d(:,:)
3580        INTEGER, INTENT(IN) :: source
3581        INTEGER, INTENT(IN) :: gid
3582        INTEGER :: msglen, ierr
3583#if defined(__MPI)
3584#if defined(__GPU_MPI)
3585        msglen = size(msg_d)
3586        ierr = cudaDeviceSynchronize()      ! This syncs __GPU_MPI case
3587        CALL bcast_logical_gpu( msg_d, msglen, source, gid )
3588        RETURN ! Sync done by MPI call (or inside bcast_xxx_gpu)
3589#else
3590        LOGICAL, ALLOCATABLE :: msg_h(:,:)
3591        ALLOCATE(msg_h, source=msg_d)       ! This syncs __MPI case
3592        msglen = size(msg_h)
3593        CALL bcast_logical( msg_h, msglen, source, gid )
3594        msg_d = msg_h; DEALLOCATE(msg_h)
3595#endif
3596#endif
3597        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
3598      END SUBROUTINE mp_bcast_lm_gpu
3599!
3600!------------------------------------------------------------------------------!
3601!
3602      SUBROUTINE mp_get_i1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3603        INTEGER, DEVICE             :: msg_dest_d
3604        INTEGER, INTENT(IN), DEVICE :: msg_sour_d
3605        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3606        INTEGER, INTENT(IN) :: gid
3607        INTEGER :: group, ierr
3608#if defined(__MPI)
3609        INTEGER :: istatus(MPI_STATUS_SIZE)
3610#endif
3611        INTEGER :: nrcv
3612        INTEGER :: msglen = 1
3613
3614#if ! defined(__GPU_MPI)
3615        ! Call CPU implementation
3616        INTEGER :: msg_dest_h, msg_sour_h
3617        !
3618        msg_dest_h = msg_dest_d; msg_sour_h = msg_sour_d      ! This syncs __MPI case
3619        CALL mp_get_i1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3620        msg_dest_d = msg_dest_h
3621#else
3622
3623#if defined(__MPI)
3624        group = gid
3625#endif
3626
3627        ! processors not taking part in the communication have 0 length message
3628
3629        msglen = 0
3630        !
3631        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
3632        !
3633        IF(dest .NE. sour) THEN
3634#if defined(__MPI)
3635           IF(mpime .EQ. sour) THEN
3636             msglen=1
3637             CALL MPI_SEND( msg_sour_d, msglen, MPI_INTEGER, dest, ip, group, ierr)
3638             IF (ierr/=0) CALL mp_stop( -8001 )
3639           ELSE IF(mpime .EQ. dest) THEN
3640             msglen=1
3641             CALL MPI_RECV( msg_dest_d, msglen, MPI_INTEGER, sour, ip, group, istatus, IERR )
3642             IF (ierr/=0) CALL mp_stop( -8002 )
3643             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
3644             IF (ierr/=0) CALL mp_stop( -8003 )
3645           END IF
3646#endif
3647        ELSEIF(mpime .EQ. sour)THEN
3648          msg_dest_d = msg_sour_d
3649          msglen = 1
3650        END IF
3651
3652#if defined(__MPI)
3653        CALL MPI_BARRIER(group, IERR)
3654        IF (ierr/=0) CALL mp_stop( -8004 )
3655#endif
3656
3657#endif
3658        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
3659        RETURN
3660      END SUBROUTINE mp_get_i1_gpu
3661!
3662!------------------------------------------------------------------------------!
3663!
3664      SUBROUTINE mp_get_iv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3665        INTEGER, DEVICE             :: msg_dest_d(:)
3666        INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:)
3667        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3668        INTEGER, INTENT(IN) :: gid
3669        INTEGER :: group
3670#if defined(__MPI)
3671        INTEGER :: istatus(MPI_STATUS_SIZE)
3672#endif
3673        INTEGER :: ierr, nrcv
3674        INTEGER :: msglen
3675        !
3676#if ! defined(__GPU_MPI)
3677        INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
3678        !
3679        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );       ! This syncs __MPI case
3680        CALL mp_get_iv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3681        msg_dest_d = msg_dest_h
3682        DEALLOCATE(msg_dest_h, msg_sour_h)
3683#else
3684
3685#if defined(__MPI)
3686        group = gid
3687#endif
3688
3689        ! processors not taking part in the communication have 0 length message
3690
3691        msglen = 0
3692        !
3693        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
3694        !
3695        IF(sour .NE. dest) THEN
3696#if defined(__MPI)
3697           IF(mpime .EQ. sour) THEN
3698             msglen = SIZE(msg_sour_d)
3699             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr)
3700             IF (ierr/=0) CALL mp_stop( 9001 )
3701           ELSE IF(mpime .EQ. dest) THEN
3702             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR )
3703             IF (ierr/=0) CALL mp_stop( 9002 )
3704             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
3705             IF (ierr/=0) CALL mp_stop( 9003 )
3706             msglen = nrcv
3707           END IF
3708#endif
3709        ELSEIF(mpime .EQ. sour)THEN
3710          !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
3711          ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
3712          msglen = SIZE(msg_sour_d)
3713        END IF
3714#if defined(__MPI)
3715        CALL MPI_BARRIER(group, IERR)
3716        IF (ierr/=0) CALL mp_stop( 9004 )
3717#endif
3718#endif
3719        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
3720        RETURN
3721      END SUBROUTINE mp_get_iv_gpu
3722!
3723!------------------------------------------------------------------------------!
3724!
3725      SUBROUTINE mp_get_r1_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3726        REAL (DP), DEVICE             :: msg_dest_d
3727        REAL (DP), INTENT(IN), DEVICE :: msg_sour_d
3728        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3729        INTEGER, INTENT(IN) :: gid
3730        INTEGER :: group
3731#if defined(__MPI)
3732        INTEGER :: istatus(MPI_STATUS_SIZE)
3733#endif
3734        INTEGER :: ierr, nrcv
3735        INTEGER :: msglen
3736#if ! defined(__GPU_MPI)
3737        REAL(DP) :: msg_dest_h, msg_sour_h
3738        !
3739        msg_dest_h=msg_dest_d; msg_sour_h=msg_sour_d      ! This syncs __MPI case
3740        CALL mp_get_r1(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3741        msg_dest_d = msg_dest_h
3742#else
3743#if defined(__MPI)
3744        group = gid
3745#endif
3746
3747        ! processors not taking part in the communication have 0 length message
3748
3749        msglen = 0
3750        !
3751        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
3752        !
3753        IF(sour .NE. dest) THEN
3754#if defined(__MPI)
3755           IF(mpime .EQ. sour) THEN
3756             msglen = 1
3757             CALL MPI_SEND( msg_sour_d, msglen, MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
3758             IF (ierr/=0) CALL mp_stop( 9005 )
3759           ELSE IF(mpime .EQ. dest) THEN
3760             msglen = 1
3761             CALL MPI_RECV( msg_dest_d, msglen, MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
3762             IF (ierr/=0) CALL mp_stop( 9006 )
3763             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
3764             IF (ierr/=0) CALL mp_stop( 9007 )
3765             msglen = nrcv
3766           END IF
3767#endif
3768        ELSEIF(mpime .EQ. sour)THEN
3769          msg_dest_d = msg_sour_d
3770          msglen = 1
3771        END IF
3772#if defined(__MPI)
3773        CALL MPI_BARRIER(group, IERR)
3774        IF (ierr/=0) CALL mp_stop( 9008 )
3775#endif
3776#endif
3777        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
3778        RETURN
3779      END SUBROUTINE mp_get_r1_gpu
3780!
3781!------------------------------------------------------------------------------!
3782!
3783      SUBROUTINE mp_get_rv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3784        REAL (DP), DEVICE             :: msg_dest_d(:)
3785        REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
3786        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3787        INTEGER, INTENT(IN) :: gid
3788        INTEGER :: group
3789#if defined(__MPI)
3790        INTEGER :: istatus(MPI_STATUS_SIZE)
3791#endif
3792        INTEGER :: ierr, nrcv
3793        INTEGER :: msglen
3794        !
3795#if ! defined(__GPU_MPI)
3796        REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
3797        !
3798        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );       ! This syncs __MPI case
3799        CALL mp_get_rv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3800        msg_dest_d = msg_dest_h
3801        DEALLOCATE(msg_dest_h, msg_sour_h)
3802#else
3803        !
3804#if defined(__MPI)
3805        group = gid
3806#endif
3807
3808        ! processors not taking part in the communication have 0 length message
3809
3810        msglen = 0
3811        !
3812        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
3813        !
3814        IF(sour .NE. dest) THEN
3815#if defined(__MPI)
3816           IF(mpime .EQ. sour) THEN
3817             msglen = SIZE(msg_sour_d)
3818             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
3819             IF (ierr/=0) CALL mp_stop( 9009 )
3820           ELSE IF(mpime .EQ. dest) THEN
3821             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
3822             IF (ierr/=0) CALL mp_stop( 9010 )
3823             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
3824             IF (ierr/=0) CALL mp_stop( 9011 )
3825             msglen = nrcv
3826           END IF
3827#endif
3828        ELSEIF(mpime .EQ. sour)THEN
3829          !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
3830          ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
3831          msglen = SIZE(msg_sour_d)
3832        END IF
3833#if defined(__MPI)
3834        CALL MPI_BARRIER(group, IERR)
3835        IF (ierr/=0) CALL mp_stop( 9012 )
3836#endif
3837#endif
3838        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
3839        RETURN
3840      END SUBROUTINE mp_get_rv_gpu
3841!
3842!------------------------------------------------------------------------------!
3843!
3844      SUBROUTINE mp_get_rm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3845        REAL (DP), DEVICE             :: msg_dest_d(:,:)
3846        REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
3847        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3848        INTEGER, INTENT(IN) :: gid
3849        INTEGER :: group
3850#if defined(__MPI)
3851        INTEGER :: istatus(MPI_STATUS_SIZE)
3852#endif
3853        INTEGER :: ierr, nrcv
3854        INTEGER :: msglen
3855        !
3856#if ! defined(__GPU_MPI)
3857        REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
3858        !
3859        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );        ! This syncs __MPI case
3860        CALL mp_get_rm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3861        msg_dest_d = msg_dest_h
3862        DEALLOCATE(msg_dest_h, msg_sour_h)
3863#else
3864
3865#if defined(__MPI)
3866        group = gid
3867#endif
3868
3869        ! processors not taking part in the communication have 0 length message
3870
3871        msglen = 0
3872        !
3873        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
3874        !
3875        IF(sour .NE. dest) THEN
3876#if defined(__MPI)
3877           IF(mpime .EQ. sour) THEN
3878             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
3879             IF (ierr/=0) CALL mp_stop( 9013 )
3880             msglen = SIZE(msg_sour_d)
3881           ELSE IF(mpime .EQ. dest) THEN
3882             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
3883             IF (ierr/=0) CALL mp_stop( 9014 )
3884             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
3885             IF (ierr/=0) CALL mp_stop( 9015 )
3886             msglen = nrcv
3887           END IF
3888#endif
3889        ELSEIF(mpime .EQ. sour)THEN
3890          !msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
3891          ! function cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kdir)
3892          ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
3893                              msg_sour_d, SIZE(msg_sour_d,1),&
3894                              SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
3895                              cudaMemcpyDeviceToDevice )
3896          msglen = SIZE( msg_sour_d )
3897        END IF
3898#if defined(__MPI)
3899        CALL MPI_BARRIER(group, IERR)
3900        IF (ierr/=0) CALL mp_stop( 9016 )
3901#endif
3902#endif
3903        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
3904        RETURN
3905      END SUBROUTINE mp_get_rm_gpu
3906!
3907!------------------------------------------------------------------------------!
3908!
3909      SUBROUTINE mp_get_cv_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3910        COMPLEX (DP), DEVICE             :: msg_dest_d(:)
3911        COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
3912        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3913        INTEGER, INTENT(IN) :: gid
3914        INTEGER :: group
3915#if defined(__MPI)
3916        INTEGER :: istatus(MPI_STATUS_SIZE)
3917#endif
3918        INTEGER :: ierr, nrcv
3919        INTEGER :: msglen
3920        !
3921#if ! defined(__GPU_MPI)
3922        COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
3923        !
3924        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );         ! This syncs __MPI case
3925        CALL mp_get_cv(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3926        msg_dest_d = msg_dest_h;
3927        DEALLOCATE(msg_dest_h, msg_sour_h)
3928#else
3929        !
3930#if defined(__MPI)
3931        group = gid
3932#endif
3933
3934        ! processors not taking part in the communication have 0 length message
3935
3936        msglen = 0
3937        !
3938        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
3939        !
3940        IF( dest .NE. sour ) THEN
3941#if defined(__MPI)
3942           IF(mpime .EQ. sour) THEN
3943             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
3944             IF (ierr/=0) CALL mp_stop( 9017 )
3945             msglen = SIZE(msg_sour_d)
3946           ELSE IF(mpime .EQ. dest) THEN
3947             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
3948             IF (ierr/=0) CALL mp_stop( 9018 )
3949             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
3950             IF (ierr/=0) CALL mp_stop( 9019 )
3951             msglen = nrcv
3952           END IF
3953#endif
3954        ELSEIF(mpime .EQ. sour)THEN
3955          !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
3956          ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
3957          msglen = SIZE(msg_sour_d)
3958        END IF
3959#if defined(__MPI)
3960        CALL MPI_BARRIER(group, IERR)
3961        IF (ierr/=0) CALL mp_stop( 9020 )
3962#endif
3963#endif
3964        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
3965        RETURN
3966      END SUBROUTINE mp_get_cv_gpu
3967!
3968!------------------------------------------------------------------------------!
3969!
3970      SUBROUTINE mp_get_cm_gpu(msg_dest_d, msg_sour_d, mpime, dest, sour, ip, gid)
3971        COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
3972        COMPLEX (DP), DEVICE             :: msg_dest_d(:,:)
3973        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
3974        INTEGER, INTENT(IN) :: gid
3975        INTEGER :: group
3976#if defined(__MPI)
3977        INTEGER :: istatus(MPI_STATUS_SIZE)
3978#endif
3979        INTEGER :: ierr, nrcv
3980        INTEGER :: msglen
3981        !
3982#if ! defined(__GPU_MPI)
3983        COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
3984        !
3985        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );          ! This syncs __MPI case
3986        CALL mp_get_cm(msg_dest_h, msg_sour_h, mpime, dest, sour, ip, gid)
3987        msg_dest_d = msg_dest_h;
3988        DEALLOCATE(msg_dest_h, msg_sour_h)
3989#else
3990        !
3991#if defined(__MPI)
3992        group = gid
3993#endif
3994
3995        ! processors not taking part in the communication have 0 length message
3996
3997        msglen = 0
3998        !
3999        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
4000        !
4001        IF(sour .NE. dest) THEN
4002#if defined(__MPI)
4003           IF(mpime .EQ. sour) THEN
4004             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
4005             IF (ierr/=0) CALL mp_stop( 9021 )
4006             msglen = SIZE(msg_sour_d)
4007           ELSE IF(mpime .EQ. dest) THEN
4008             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
4009             IF (ierr/=0) CALL mp_stop( 9022 )
4010             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
4011             IF (ierr/=0) CALL mp_stop( 9023 )
4012             msglen = nrcv
4013           END IF
4014#endif
4015        ELSEIF(mpime .EQ. sour)THEN
4016          !msg_dest_d(1:SIZE(msg_sour_d,1), 1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
4017          ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
4018                              msg_sour_d, SIZE(msg_sour_d,1),&
4019                              SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
4020                              cudaMemcpyDeviceToDevice )
4021          msglen = SIZE( msg_sour_d )
4022        END IF
4023#if defined(__MPI)
4024        CALL MPI_BARRIER(group, IERR)
4025        IF (ierr/=0) CALL mp_stop( 9024 )
4026#endif
4027#endif
4028        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
4029        RETURN
4030      END SUBROUTINE mp_get_cm_gpu
4031!
4032!------------------------------------------------------------------------------!
4033!
4034      SUBROUTINE mp_put_i1_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
4035        INTEGER, DEVICE             :: msg_dest_d
4036        INTEGER, INTENT(IN), DEVICE :: msg_sour_d
4037        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
4038        INTEGER, INTENT(IN) :: gid
4039        INTEGER :: group
4040#if defined(__MPI)
4041        INTEGER :: istatus(MPI_STATUS_SIZE)
4042#endif
4043        INTEGER :: ierr, nrcv
4044        INTEGER :: msglen
4045        !
4046#if ! defined(__GPU_MPI)
4047        INTEGER :: msg_dest_h, msg_sour_h
4048        !
4049        msg_dest_h=msg_dest_d ; msg_sour_h=msg_sour_d           ! This syncs __MPI case
4050        CALL mp_put_i1(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
4051        msg_dest_d = msg_dest_h
4052#else
4053
4054#if defined(__MPI)
4055        group = gid
4056#endif
4057
4058        ! processors not taking part in the communication have 0 length message
4059
4060        msglen = 0
4061        !
4062        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
4063        !
4064        IF(dest .NE. sour) THEN
4065#if defined(__MPI)
4066           IF(mpime .EQ. sour) THEN
4067             CALL MPI_SEND( msg_sour_d, 1, MPI_INTEGER, dest, ip, group, ierr)
4068             IF (ierr/=0) CALL mp_stop( 9025 )
4069             msglen = 1
4070           ELSE IF(mpime .EQ. dest) THEN
4071             CALL MPI_RECV( msg_dest_d, 1, MPI_INTEGER, sour, ip, group, istatus, IERR )
4072             IF (ierr/=0) CALL mp_stop( 9026 )
4073             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
4074             IF (ierr/=0) CALL mp_stop( 9027 )
4075             msglen = 1
4076           END IF
4077#endif
4078        ELSEIF(mpime .EQ. sour)THEN
4079          msg_dest_d = msg_sour_d
4080          msglen = 1
4081        END IF
4082#if defined(__MPI)
4083        CALL MPI_BARRIER(group, IERR)
4084        IF (ierr/=0) CALL mp_stop( 9028 )
4085#endif
4086#endif
4087        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
4088        RETURN
4089      END SUBROUTINE mp_put_i1_gpu
4090!
4091!------------------------------------------------------------------------------!
4092!
4093      SUBROUTINE mp_put_iv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
4094        INTEGER, DEVICE             :: msg_dest_d(:)
4095        INTEGER, INTENT(IN), DEVICE :: msg_sour_d(:)
4096        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
4097        INTEGER, INTENT(IN) :: gid
4098        INTEGER :: group
4099#if defined(__MPI)
4100        INTEGER :: istatus(MPI_STATUS_SIZE)
4101#endif
4102        INTEGER :: ierr, nrcv
4103        INTEGER :: msglen
4104        !
4105#if ! defined(__GPU_MPI)
4106        INTEGER, ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
4107        !
4108        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );           ! This syncs __MPI case
4109        CALL mp_put_iv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
4110        msg_dest_d = msg_dest_h
4111        DEALLOCATE(msg_dest_h, msg_sour_h)
4112#else
4113        !
4114#if defined(__MPI)
4115        group = gid
4116#endif
4117        ! processors not taking part in the communication have 0 length message
4118
4119        msglen = 0
4120        !
4121        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
4122        !
4123        IF(sour .NE. dest) THEN
4124#if defined(__MPI)
4125           IF(mpime .EQ. sour) THEN
4126             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_INTEGER, dest, ip, group, ierr)
4127             IF (ierr/=0) CALL mp_stop( 9029 )
4128             msglen = SIZE(msg_sour_d)
4129           ELSE IF(mpime .EQ. dest) THEN
4130             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_INTEGER, sour, ip, group, istatus, IERR )
4131             IF (ierr/=0) CALL mp_stop( 9030 )
4132             CALL MPI_GET_COUNT(istatus, MPI_INTEGER, nrcv, ierr)
4133             IF (ierr/=0) CALL mp_stop( 9031 )
4134             msglen = nrcv
4135           END IF
4136#endif
4137        ELSEIF(mpime .EQ. sour)THEN
4138          !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
4139          ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
4140          msglen = SIZE(msg_sour_d)
4141        END IF
4142#if defined(__MPI)
4143        CALL MPI_BARRIER(group, IERR)
4144        IF (ierr/=0) CALL mp_stop( 9032 )
4145#endif
4146#endif
4147        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
4148        RETURN
4149      END SUBROUTINE mp_put_iv_gpu
4150!
4151!------------------------------------------------------------------------------!
4152!
4153      SUBROUTINE mp_put_rv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
4154        REAL (DP), DEVICE             :: msg_dest_d(:)
4155        REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
4156        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
4157        INTEGER, INTENT(IN) :: gid
4158        INTEGER :: group
4159#if defined(__MPI)
4160        INTEGER :: istatus(MPI_STATUS_SIZE)
4161#endif
4162        INTEGER :: ierr, nrcv
4163        INTEGER :: msglen
4164        !
4165#if ! defined(__GPU_MPI)
4166        REAL (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
4167        !
4168        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );           ! This syncs __MPI case
4169        CALL mp_put_rv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
4170        msg_dest_d = msg_dest_h
4171        DEALLOCATE(msg_dest_h, msg_sour_h)
4172#else
4173        !
4174#if defined(__MPI)
4175        group = gid
4176#endif
4177        ! processors not taking part in the communication have 0 length message
4178
4179        msglen = 0
4180        !
4181        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
4182        !
4183        IF(sour .NE. dest) THEN
4184#if defined(__MPI)
4185           IF(mpime .EQ. sour) THEN
4186             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
4187             IF (ierr/=0) CALL mp_stop( 9033 )
4188             msglen = SIZE(msg_sour_d)
4189           ELSE IF(mpime .EQ. dest) THEN
4190             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
4191             IF (ierr/=0) CALL mp_stop( 9034 )
4192             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
4193             IF (ierr/=0) CALL mp_stop( 9035 )
4194             msglen = nrcv
4195           END IF
4196#endif
4197        ELSEIF(mpime .EQ. sour)THEN
4198          !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
4199          ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
4200          msglen = SIZE(msg_sour_d)
4201        END IF
4202#if defined(__MPI)
4203        CALL MPI_BARRIER(group, IERR)
4204        IF (ierr/=0) CALL mp_stop( 9036 )
4205#endif
4206#endif
4207        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
4208        RETURN
4209      END SUBROUTINE mp_put_rv_gpu
4210!
4211!------------------------------------------------------------------------------!
4212!
4213      SUBROUTINE mp_put_rm_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
4214        REAL (DP), DEVICE             :: msg_dest_d(:,:)
4215        REAL (DP), INTENT(IN), DEVICE :: msg_sour_d(:,:)
4216        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
4217        INTEGER, INTENT(IN) :: gid
4218        INTEGER :: group
4219#if defined(__MPI)
4220        INTEGER :: istatus(MPI_STATUS_SIZE)
4221#endif
4222        INTEGER :: ierr, nrcv
4223        INTEGER :: msglen
4224        !
4225#if ! defined(__GPU_MPI)
4226        REAL (DP), ALLOCATABLE :: msg_dest_h(:,:), msg_sour_h(:,:)
4227        !
4228        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );           ! This syncs __MPI case
4229        CALL mp_put_rm(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
4230        msg_dest_d = msg_dest_h
4231        DEALLOCATE(msg_dest_h, msg_sour_h)
4232#else
4233        !
4234#if defined(__MPI)
4235        group = gid
4236#endif
4237        ! processors not taking part in the communication have 0 length message
4238
4239        msglen = 0
4240        !
4241        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
4242        !
4243        IF(sour .NE. dest) THEN
4244#if defined(__MPI)
4245           IF(mpime .EQ. sour) THEN
4246             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_PRECISION, dest, ip, group, ierr)
4247             IF (ierr/=0) CALL mp_stop( 9037 )
4248             msglen = SIZE(msg_sour_d)
4249           ELSE IF(mpime .EQ. dest) THEN
4250             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_PRECISION, sour, ip, group, istatus, IERR )
4251             IF (ierr/=0) CALL mp_stop( 9038 )
4252             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_PRECISION, nrcv, ierr)
4253             IF (ierr/=0) CALL mp_stop( 9039 )
4254             msglen = nrcv
4255           END IF
4256#endif
4257        ELSEIF(mpime .EQ. sour)THEN
4258          !msg_dest_d(1:SIZE(msg_sour_d,1),1:SIZE(msg_sour_d,2)) = msg_sour_d(:,:)
4259          ierr = cudaMemcpy2D(msg_dest_d, SIZE(msg_dest_d,1),&
4260                              msg_sour_d, SIZE(msg_sour_d,1),&
4261                              SIZE(msg_sour_d,1), SIZE(msg_sour_d,2), &
4262                              cudaMemcpyDeviceToDevice )
4263          msglen = SIZE(msg_sour_d)
4264        END IF
4265#if defined(__MPI)
4266        CALL MPI_BARRIER(group, IERR)
4267        IF (ierr/=0) CALL mp_stop( 9040 )
4268#endif
4269#endif
4270        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
4271        RETURN
4272      END SUBROUTINE mp_put_rm_gpu
4273!
4274!------------------------------------------------------------------------------!
4275!
4276      SUBROUTINE mp_put_cv_gpu(msg_dest_d, msg_sour_d, mpime, sour, dest, ip, gid)
4277        COMPLEX (DP),             DEVICE :: msg_dest_d(:)
4278        COMPLEX (DP), INTENT(IN), DEVICE :: msg_sour_d(:)
4279        INTEGER, INTENT(IN) :: dest, sour, ip, mpime
4280        INTEGER, INTENT(IN) :: gid
4281        INTEGER :: group
4282#if defined(__MPI)
4283        INTEGER :: istatus(MPI_STATUS_SIZE)
4284#endif
4285        INTEGER :: ierr, nrcv
4286        INTEGER :: msglen
4287        !
4288#if ! defined(__GPU_MPI)
4289        COMPLEX (DP), ALLOCATABLE :: msg_dest_h(:), msg_sour_h(:)
4290        !
4291        ALLOCATE( msg_dest_h, source=msg_dest_d ); ALLOCATE( msg_sour_h, source=msg_sour_d );           ! This syncs __MPI case
4292        CALL mp_put_cv(msg_dest_h, msg_sour_h, mpime, sour, dest, ip, gid)
4293        msg_dest_d = msg_dest_h
4294        DEALLOCATE(msg_dest_h, msg_sour_h)
4295#else
4296        !
4297#if defined(__MPI)
4298        group = gid
4299#endif
4300        ! processors not taking part in the communication have 0 length message
4301
4302        msglen = 0
4303        !
4304        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL and __GPU_MPI
4305        !
4306        IF( dest .NE. sour ) THEN
4307#if defined(__MPI)
4308           IF(mpime .EQ. sour) THEN
4309             CALL MPI_SEND( msg_sour_d, SIZE(msg_sour_d), MPI_DOUBLE_COMPLEX, dest, ip, group, ierr)
4310             IF (ierr/=0) CALL mp_stop( 9041 )
4311             msglen = SIZE(msg_sour_d)
4312           ELSE IF(mpime .EQ. dest) THEN
4313             CALL MPI_RECV( msg_dest_d, SIZE(msg_dest_d), MPI_DOUBLE_COMPLEX, sour, ip, group, istatus, IERR )
4314             IF (ierr/=0) CALL mp_stop( 9042 )
4315             CALL MPI_GET_COUNT(istatus, MPI_DOUBLE_COMPLEX, nrcv, ierr)
4316             IF (ierr/=0) CALL mp_stop( 9043 )
4317             msglen = nrcv
4318           END IF
4319#endif
4320        ELSEIF(mpime .EQ. sour)THEN
4321          !msg_dest_d(1:SIZE(msg_sour_d)) = msg_sour_d(:)
4322          ierr = cudaMemcpy(msg_dest_d(1) , msg_sour_d(1), SIZE(msg_sour_d), cudaMemcpyDeviceToDevice )
4323          msglen = SIZE(msg_sour_d)
4324        END IF
4325#if defined(__MPI)
4326        CALL MPI_BARRIER(group, IERR)
4327        IF (ierr/=0) CALL mp_stop( 9044 )
4328#endif
4329#endif
4330        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI and __GPU_MPI
4331        RETURN
4332      END SUBROUTINE mp_put_cv_gpu
4333!
4334!------------------------------------------------------------------------------!
4335!
4336!..mp_sum
4337      SUBROUTINE mp_sum_i1_gpu(msg_d,gid)
4338        IMPLICIT NONE
4339        INTEGER, INTENT (INOUT), DEVICE :: msg_d
4340        INTEGER, msg_h
4341        INTEGER, INTENT(IN) :: gid
4342        INTEGER :: msglen, ierr
4343        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4344        IF ( mp_size(gid) == 1 ) THEN
4345          ierr = cudaDeviceSynchronize()
4346          RETURN
4347        END IF
4348        !
4349#if defined(__MPI)
4350        msglen = 1
4351#if defined(__GPU_MPI)
4352        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4353        CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
4354        ! No need for final syncronization
4355#else
4356        !
4357        msg_h = msg_d                   ! This syncs __MPI case
4358        CALL reduce_base_integer( msglen, msg_h, gid, -1 )
4359        msg_d = msg_h
4360        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4361#endif
4362#endif
4363      END SUBROUTINE mp_sum_i1_gpu
4364!
4365!------------------------------------------------------------------------------!
4366!
4367      SUBROUTINE mp_sum_iv_gpu(msg_d,gid)
4368        IMPLICIT NONE
4369        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
4370        INTEGER, ALLOCATABLE :: msg_h(:)
4371        INTEGER, INTENT(IN) :: gid
4372        !
4373        INTEGER :: msglen, ierr
4374        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4375        IF ( mp_size(gid) == 1 ) THEN
4376          ierr = cudaDeviceSynchronize()
4377          RETURN
4378        END IF
4379        !
4380#if defined(__MPI)
4381#if defined(__GPU_MPI)
4382        msglen = size(msg_d)
4383        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4384        CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
4385        ! No need for final syncronization
4386#else
4387        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
4388        msglen = size(msg_h)
4389        CALL reduce_base_integer( msglen, msg_h, gid, -1 )
4390        msg_d = msg_h; DEALLOCATE(msg_h)
4391        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4392#endif
4393#endif
4394      END SUBROUTINE mp_sum_iv_gpu
4395!
4396!------------------------------------------------------------------------------!
4397!
4398      SUBROUTINE mp_sum_im_gpu(msg_d,gid)
4399        IMPLICIT NONE
4400        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:)
4401        INTEGER, ALLOCATABLE :: msg_h(:,:)
4402        INTEGER, INTENT(IN) :: gid
4403        !
4404        INTEGER :: msglen, ierr
4405        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4406        IF ( mp_size(gid) == 1 ) THEN
4407          ierr = cudaDeviceSynchronize()
4408          RETURN
4409        END IF
4410        !
4411#if defined(__MPI)
4412#if defined(__GPU_MPI)
4413        msglen = size(msg_d)
4414        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4415        CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
4416        ! No need for final syncronization
4417#else
4418        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
4419        msglen = size(msg_h)
4420        CALL reduce_base_integer( msglen, msg_h, gid, -1 )
4421        msg_d = msg_h; DEALLOCATE(msg_h)
4422        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4423#endif
4424#endif
4425      END SUBROUTINE mp_sum_im_gpu
4426!
4427!------------------------------------------------------------------------------!
4428!
4429      SUBROUTINE mp_sum_it_gpu(msg_d,gid)
4430        IMPLICIT NONE
4431        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:,:,:)
4432        INTEGER, ALLOCATABLE :: msg_h(:,:,:)
4433        INTEGER, INTENT (IN) :: gid
4434        !
4435        INTEGER :: msglen, ierr
4436        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4437        IF ( mp_size(gid) == 1 ) THEN
4438          ierr = cudaDeviceSynchronize()
4439          RETURN
4440        END IF
4441        !
4442#if defined(__MPI)
4443#if defined(__GPU_MPI)
4444        msglen = size(msg_d)
4445        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4446        CALL reduce_base_integer_gpu( msglen, msg_d, gid, -1 )
4447        ! No need for final syncronization
4448#else
4449        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
4450        msglen = size(msg_h)
4451        CALL reduce_base_integer( msglen, msg_h, gid, -1 )
4452        msg_d = msg_h; DEALLOCATE(msg_h)
4453        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4454#endif
4455#endif
4456      END SUBROUTINE mp_sum_it_gpu
4457!
4458!------------------------------------------------------------------------------!
4459!
4460      SUBROUTINE mp_sum_r1_gpu(msg_d,gid)
4461        IMPLICIT NONE
4462        REAL (DP), INTENT (INOUT), DEVICE :: msg_d
4463        REAL(DP) :: msg_h
4464        INTEGER, INTENT (IN) :: gid
4465        !
4466        INTEGER :: msglen, ierr
4467        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4468        IF ( mp_size(gid) == 1 ) THEN
4469          ierr = cudaDeviceSynchronize()
4470          RETURN
4471        END IF
4472        !
4473#if defined(__MPI)
4474        msglen = 1
4475#if defined(__GPU_MPI)
4476        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4477        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
4478        ! No need for final syncronization
4479#else
4480        msg_h=msg_d                     ! This syncs __MPI case
4481        CALL reduce_base_real( msglen, msg_h, gid, -1 )
4482        msg_d = msg_h
4483        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4484#endif
4485#endif
4486      END SUBROUTINE mp_sum_r1_gpu
4487!
4488!------------------------------------------------------------------------------!
4489!
4490      SUBROUTINE mp_sum_rv_gpu(msg_d,gid)
4491        IMPLICIT NONE
4492        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
4493        REAL(DP), ALLOCATABLE :: msg_h(:)
4494        INTEGER, INTENT (IN) :: gid
4495        !
4496        INTEGER :: msglen, ierr
4497        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4498        IF ( mp_size(gid) == 1 ) THEN
4499          ierr = cudaDeviceSynchronize()
4500          RETURN
4501        END IF
4502        !
4503#if defined(__MPI)
4504#if defined(__GPU_MPI)
4505        msglen = size(msg_d)
4506        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4507        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
4508        ! No need for final syncronization
4509#else
4510        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
4511        msglen = size(msg_h)
4512        CALL reduce_base_real( msglen, msg_h, gid, -1 )
4513        msg_d = msg_h; DEALLOCATE(msg_h)
4514        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4515#endif
4516#endif
4517      END SUBROUTINE mp_sum_rv_gpu
4518!
4519!------------------------------------------------------------------------------!
4520!
4521      SUBROUTINE mp_sum_rm_gpu(msg_d, gid)
4522        IMPLICIT NONE
4523        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
4524        REAL (DP), ALLOCATABLE :: msg_h(:,:)
4525        INTEGER, INTENT (IN) :: gid
4526        !
4527        INTEGER :: msglen, ierr
4528        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4529        IF ( mp_size(gid) == 1 ) THEN
4530          ierr = cudaDeviceSynchronize()
4531          RETURN
4532        END IF
4533        !
4534#if defined(__MPI)
4535#if defined(__GPU_MPI)
4536        msglen = size(msg_d)
4537        ierr = cudaDeviceSynchronize()  ! This syncs __GPU_MPI
4538        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
4539        ! No need for final syncronization
4540#else
4541        ALLOCATE( msg_h, source=msg_d ) ! This syncs __MPI case
4542        msglen = size(msg_h)
4543        CALL reduce_base_real( msglen, msg_h, gid, -1 )
4544        msg_d = msg_h; DEALLOCATE(msg_h)
4545        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4546#endif
4547#endif
4548      END SUBROUTINE mp_sum_rm_gpu
4549!
4550!------------------------------------------------------------------------------!
4551!
4552      SUBROUTINE mp_root_sum_rm_gpu( msg_d, res_d, root, gid )
4553        IMPLICIT NONE
4554        REAL (DP), INTENT (IN) , DEVICE :: msg_d(:,:)
4555        REAL (DP), INTENT (OUT), DEVICE :: res_d(:,:)
4556        REAL (DP), ALLOCATABLE :: res_h(:,:), msg_h(:,:)
4557        INTEGER,   INTENT (IN) :: root
4558        INTEGER,   INTENT (IN) :: gid
4559        !
4560        INTEGER :: msglen, ierr, taskid
4561#if defined(__MPI)
4562        !
4563        CALL mpi_comm_rank( gid, taskid, ierr)
4564        IF( ierr /= 0 ) CALL mp_stop( 9045 )
4565        !
4566        msglen = size(msg_d)
4567        IF( taskid == root ) THEN
4568           IF( msglen > size(res_d) ) CALL mp_stop( 9046 )
4569        END IF
4570#if  defined(__GPU_MPI)
4571        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4572        CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, gid, root )
4573        RETURN ! Sync not needed in this case
4574#else
4575        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4576        IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
4577        CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root )
4578        IF( taskid == root ) res_d = res_h;
4579        IF( taskid == root ) DEALLOCATE(res_h)
4580        DEALLOCATE(msg_h)
4581#endif
4582
4583#else
4584        res_d = msg_d
4585#endif
4586        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
4587      END SUBROUTINE mp_root_sum_rm_gpu
4588!
4589!------------------------------------------------------------------------------!
4590!
4591      SUBROUTINE mp_root_sum_cm_gpu( msg_d, res_d, root, gid )
4592        IMPLICIT NONE
4593        COMPLEX (DP), INTENT (IN) , DEVICE :: msg_d(:,:)
4594        COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:)
4595        COMPLEX (DP), ALLOCATABLE          :: res_h(:,:), msg_h(:,:)
4596        INTEGER,   INTENT (IN)  :: root
4597        INTEGER,  INTENT (IN) :: gid
4598        !
4599        INTEGER :: msglen, ierr, taskid
4600#if defined(__MPI)
4601        msglen = size(msg_d)
4602
4603        CALL mpi_comm_rank( gid, taskid, ierr)
4604        IF( ierr /= 0 ) CALL mp_stop( 9047 )
4605
4606        IF( taskid == root ) THEN
4607           IF( msglen > size(res_d) ) CALL mp_stop( 9048 )
4608        END IF
4609#if  defined(__GPU_MPI)
4610        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4611        CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_d, gid, root )
4612        RETURN ! Sync not needed in this case
4613#else
4614        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4615        IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
4616        CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, root )
4617        IF( taskid == root ) res_d = res_h;
4618        IF( taskid == root ) DEALLOCATE(res_h)
4619        DEALLOCATE(msg_h)
4620#endif
4621#else
4622        res_d = msg_d
4623#endif
4624        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
4625      END SUBROUTINE mp_root_sum_cm_gpu
4626!
4627!------------------------------------------------------------------------------!
4628!
4629      SUBROUTINE mp_sum_rmm_gpu( msg_d, res_d, root, gid )
4630        IMPLICIT NONE
4631        REAL (DP), INTENT (IN), DEVICE :: msg_d(:,:)
4632        REAL (DP), INTENT (OUT),DEVICE :: res_d(:,:)
4633        REAL (DP), ALLOCATABLE         :: res_h(:,:), msg_h(:,:)
4634        INTEGER, INTENT (IN) :: root
4635        INTEGER, INTENT (IN) :: gid
4636        INTEGER :: group
4637        INTEGER :: msglen
4638        INTEGER :: taskid, ierr
4639
4640
4641
4642#if defined(__MPI)
4643
4644        msglen = size(msg_d)
4645        !
4646        group = gid
4647        !
4648        CALL mpi_comm_rank( group, taskid, ierr)
4649        IF( ierr /= 0 ) CALL mp_stop( 9049 )
4650
4651        IF( taskid == root ) THEN
4652           IF( msglen > size(res_d) ) CALL mp_stop( 9050 )
4653        END IF
4654        !
4655#if  defined(__GPU_MPI)
4656        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4657        CALL reduce_base_real_to_gpu( msglen, msg_d, res_d, group, root )
4658        RETURN ! Sync not needed in this case
4659#else
4660        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4661        IF( taskid == root ) ALLOCATE( res_h(lbound(res_d,1):ubound(res_d,1), lbound(res_d,2):ubound(res_d,2)));
4662        CALL reduce_base_real_to( msglen, msg_h, res_h, gid, root )
4663        IF( taskid == root ) res_d = res_h;
4664        IF( taskid == root ) DEALLOCATE(res_h)
4665        DEALLOCATE(msg_h)
4666#endif
4667        !
4668#else
4669        res_d = msg_d
4670#endif
4671        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
4672      END SUBROUTINE mp_sum_rmm_gpu
4673!
4674!------------------------------------------------------------------------------!
4675!
4676      SUBROUTINE mp_sum_rt_gpu( msg_d, gid )
4677        IMPLICIT NONE
4678        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:)
4679        REAL (DP), ALLOCATABLE :: msg_h(:,:,:)
4680        INTEGER, INTENT(IN) :: gid
4681        INTEGER :: msglen, ierr
4682        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4683        IF ( mp_size(gid) == 1 ) THEN
4684          ierr = cudaDeviceSynchronize()
4685          RETURN
4686        END IF
4687        !
4688#if defined(__MPI)
4689#if  defined(__GPU_MPI)
4690        msglen = size(msg_d)
4691        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4692        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
4693        ! Sync not needed after MPI call
4694#else
4695        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4696        msglen = size(msg_h)
4697        CALL reduce_base_real( msglen, msg_h, gid, -1 )
4698        msg_d = msg_h; DEALLOCATE(msg_h)
4699        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4700#endif
4701#endif
4702      END SUBROUTINE mp_sum_rt_gpu
4703!
4704!------------------------------------------------------------------------------!
4705!
4706      SUBROUTINE mp_sum_r4d_gpu(msg_d,gid)
4707        IMPLICIT NONE
4708        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:)
4709        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:)
4710        INTEGER, INTENT(IN) :: gid
4711        !
4712        INTEGER :: msglen, ierr
4713        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4714        IF ( mp_size(gid) == 1 ) THEN
4715          ierr = cudaDeviceSynchronize()
4716          RETURN
4717        END IF
4718        !
4719#if defined(__MPI)
4720#if  defined(__GPU_MPI)
4721        msglen = size(msg_d)
4722        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4723        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
4724        ! Sync not needed after MPI call
4725#else
4726        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4727        msglen = size(msg_h)
4728        CALL reduce_base_real( msglen, msg_h, gid, -1 )
4729        msg_d = msg_h; DEALLOCATE(msg_h)
4730        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4731#endif
4732#endif
4733      END SUBROUTINE mp_sum_r4d_gpu
4734!
4735!------------------------------------------------------------------------------!
4736!
4737      SUBROUTINE mp_sum_c1_gpu(msg_d,gid)
4738        IMPLICIT NONE
4739        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d
4740        COMPLEX (DP) :: msg_h
4741        INTEGER, INTENT(IN) :: gid
4742        !
4743        INTEGER :: msglen, ierr
4744        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4745        IF ( mp_size(gid) == 1 ) THEN
4746          ierr = cudaDeviceSynchronize()
4747          RETURN
4748        END IF
4749        !
4750#if defined(__MPI)
4751        msglen = 1
4752#if  defined(__GPU_MPI)
4753        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4754        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
4755        ! Sync not needed after MPI call
4756#else
4757        msg_h=msg_d                               ! This syncs __MPI case
4758        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
4759        msg_d = msg_h
4760        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4761#endif
4762#endif
4763      END SUBROUTINE mp_sum_c1_gpu
4764!
4765!------------------------------------------------------------------------------!
4766!
4767      SUBROUTINE mp_sum_cv_gpu(msg_d,gid)
4768        IMPLICIT NONE
4769        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:)
4770        COMPLEX (DP), ALLOCATABLE :: msg_h(:)
4771        INTEGER, INTENT(IN) :: gid
4772        !
4773        INTEGER :: msglen, ierr
4774        !
4775        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4776        IF ( mp_size(gid) == 1 ) THEN
4777          ierr = cudaDeviceSynchronize()
4778          RETURN
4779        END IF
4780        !
4781#if defined(__MPI)
4782#if  defined(__GPU_MPI)
4783        msglen = size(msg_d)
4784        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4785        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
4786        ! Sync not needed after MPI call
4787#else
4788        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4789        msglen = size(msg_h)
4790        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
4791        msg_d = msg_h; DEALLOCATE(msg_h)
4792        ierr = cudaDeviceSynchronize()  ! This syncs the device after small message copies
4793#endif
4794#endif
4795      END SUBROUTINE mp_sum_cv_gpu
4796!
4797!------------------------------------------------------------------------------!
4798!
4799      SUBROUTINE mp_sum_cm_gpu(msg_d, gid)
4800        IMPLICIT NONE
4801        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:)
4802        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:)
4803        INTEGER, INTENT (IN) :: gid
4804        INTEGER :: msglen, ierr
4805        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4806        IF ( mp_size(gid) == 1 ) THEN
4807          ierr = cudaDeviceSynchronize()
4808          RETURN
4809        END IF
4810        !
4811#if defined(__MPI)
4812#if  defined(__GPU_MPI)
4813        msglen = size(msg_d)
4814        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4815        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
4816        ! Sync not needed after MPI call
4817#else
4818        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4819        msglen = size(msg_h)
4820        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
4821        msg_d = msg_h; DEALLOCATE(msg_h)
4822        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4823#endif
4824#endif
4825      END SUBROUTINE mp_sum_cm_gpu
4826!
4827!------------------------------------------------------------------------------!
4828!
4829      SUBROUTINE mp_sum_cmm_gpu(msg_d, res_d, gid)
4830        IMPLICIT NONE
4831        COMPLEX (DP), INTENT (IN), DEVICE :: msg_d(:,:)
4832        COMPLEX (DP), INTENT (OUT), DEVICE :: res_d(:,:)
4833        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:), res_h(:,:)
4834        INTEGER, INTENT (IN) :: gid
4835        !
4836        INTEGER :: msglen, ierr
4837#if defined(__MPI)
4838#if  defined(__GPU_MPI)
4839        msglen = size(msg_d)
4840        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4841        CALL reduce_base_real_to_gpu( 2 * msglen, msg_d, res_h, gid, -1 )
4842        RETURN ! Sync not needed after MPI call
4843#else
4844        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4845        msglen = size(msg_h)
4846        ALLOCATE( res_h(lbound(msg_h,1):ubound(msg_h,1), lbound(msg_h,2):ubound(msg_h,2)));
4847        CALL reduce_base_real_to( 2 * msglen, msg_h, res_h, gid, -1 )
4848        res_d = res_h; DEALLOCATE(msg_h, res_h)
4849#endif
4850#else
4851        res_d = msg_d
4852#endif
4853        ierr = cudaDeviceSynchronize()  ! This syncs SERIAL, __MPI
4854      END SUBROUTINE mp_sum_cmm_gpu
4855!
4856!------------------------------------------------------------------------------!
4857!
4858      SUBROUTINE mp_sum_ct_gpu(msg_d,gid)
4859        IMPLICIT NONE
4860        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:)
4861        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:)
4862        INTEGER, INTENT(IN) :: gid
4863        !
4864        INTEGER :: msglen, ierr
4865        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4866        IF ( mp_size(gid) == 1 ) THEN
4867          ierr = cudaDeviceSynchronize()
4868          RETURN
4869        END IF
4870        !
4871#if defined(__MPI)
4872#if  defined(__GPU_MPI)
4873        msglen = SIZE(msg_d)
4874        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4875        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
4876        ! Sync not needed after MPI call
4877#else
4878        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4879        msglen = size(msg_h)
4880        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
4881        msg_d = msg_h; DEALLOCATE(msg_h)
4882        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4883#endif
4884#endif
4885      END SUBROUTINE mp_sum_ct_gpu
4886!
4887!------------------------------------------------------------------------------!
4888!
4889      SUBROUTINE mp_sum_c4d_gpu(msg_d,gid)
4890        IMPLICIT NONE
4891        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:)
4892        COMPLEX (DP),ALLOCATABLE :: msg_h(:,:,:,:)
4893        INTEGER, INTENT(IN) :: gid
4894        !
4895        INTEGER :: msglen, ierr
4896        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4897        IF ( mp_size(gid) == 1 ) THEN
4898          ierr = cudaDeviceSynchronize()
4899          RETURN
4900        END IF
4901        !
4902#if defined(__MPI)
4903#if  defined(__GPU_MPI)
4904        msglen = size(msg_d)
4905        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4906        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
4907        ! Sync not needed after MPI call
4908#else
4909        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4910        msglen = size(msg_h)
4911        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
4912        msg_d = msg_h; DEALLOCATE(msg_h)
4913        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4914#endif
4915#endif
4916      END SUBROUTINE mp_sum_c4d_gpu
4917!
4918!------------------------------------------------------------------------------!
4919!
4920      SUBROUTINE mp_sum_c5d_gpu(msg_d,gid)
4921        IMPLICIT NONE
4922        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:)
4923        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
4924        INTEGER, INTENT(IN) :: gid
4925        !
4926        INTEGER :: msglen, ierr
4927        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4928        IF ( mp_size(gid) == 1 ) THEN
4929          ierr = cudaDeviceSynchronize()
4930          RETURN
4931        END IF
4932        !
4933#if defined(__MPI)
4934#if  defined(__GPU_MPI)
4935        msglen = size(msg_d)
4936        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4937        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
4938        ! Sync not needed after MPI call
4939#else
4940        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4941        msglen = size(msg_h)
4942        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
4943        msg_d = msg_h; DEALLOCATE(msg_h)
4944        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4945#endif
4946#endif
4947      END SUBROUTINE mp_sum_c5d_gpu
4948!
4949!------------------------------------------------------------------------------!
4950!
4951      SUBROUTINE mp_sum_r5d_gpu(msg_d,gid)
4952        IMPLICIT NONE
4953        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:)
4954        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:)
4955        INTEGER, INTENT(IN) :: gid
4956        !
4957        INTEGER :: msglen, ierr
4958        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4959        IF ( mp_size(gid) == 1 ) THEN
4960          ierr = cudaDeviceSynchronize()
4961          RETURN
4962        END IF
4963        !
4964#if defined(__MPI)
4965#if  defined(__GPU_MPI)
4966        msglen = size(msg_d)
4967        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4968        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
4969        ! Sync not needed after MPI call
4970#else
4971        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
4972        msglen = size(msg_h)
4973        CALL reduce_base_real( msglen, msg_h, gid, -1 )
4974        msg_d = msg_h; DEALLOCATE(msg_h)
4975        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
4976#endif
4977#endif
4978      END SUBROUTINE mp_sum_r5d_gpu
4979!
4980!------------------------------------------------------------------------------!
4981!
4982      SUBROUTINE mp_sum_r6d_gpu(msg_d,gid)
4983        IMPLICIT NONE
4984        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:)
4985        REAL (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
4986        INTEGER, INTENT(IN) :: gid
4987        !
4988        INTEGER :: msglen, ierr
4989        ! Avoid unnecessary communications on __MPI and syncs SERIAL
4990        IF ( mp_size(gid) == 1 ) THEN
4991          ierr = cudaDeviceSynchronize()
4992          RETURN
4993        END IF
4994        !
4995#if defined(__MPI)
4996#if  defined(__GPU_MPI)
4997        msglen = size(msg_d)
4998        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
4999        CALL reduce_base_real_gpu( msglen, msg_d, gid, -1 )
5000        ! Sync not needed after MPI call
5001#else
5002        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
5003        msglen = size(msg_h)
5004        CALL reduce_base_real( msglen, msg_h, gid, -1 )
5005        msg_d = msg_h; DEALLOCATE(msg_h)
5006        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5007#endif
5008#endif
5009      END SUBROUTINE mp_sum_r6d_gpu
5010!
5011!------------------------------------------------------------------------------!
5012!
5013      SUBROUTINE mp_sum_c6d_gpu(msg_d,gid)
5014        IMPLICIT NONE
5015        COMPLEX (DP), INTENT (INOUT), DEVICE :: msg_d(:,:,:,:,:,:)
5016        COMPLEX (DP), ALLOCATABLE :: msg_h(:,:,:,:,:,:)
5017        INTEGER, INTENT(IN) :: gid
5018        !
5019        INTEGER :: msglen, ierr
5020        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5021        IF ( mp_size(gid) == 1 ) THEN
5022          ierr = cudaDeviceSynchronize()
5023          RETURN
5024        END IF
5025        !
5026#if defined(__MPI)
5027#if  defined(__GPU_MPI)
5028        msglen = size(msg_d)
5029        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
5030        CALL reduce_base_real_gpu( 2 * msglen, msg_d, gid, -1 )
5031        ! Sync not needed after MPI call
5032#else
5033        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
5034        msglen = size(msg_h)
5035        CALL reduce_base_real( 2 * msglen, msg_h, gid, -1 )
5036        msg_d = msg_h; DEALLOCATE(msg_h)
5037        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5038#endif
5039#endif
5040      END SUBROUTINE mp_sum_c6d_gpu
5041!
5042!------------------------------------------------------------------------------!
5043!
5044      SUBROUTINE mp_max_i_gpu(msg_d,gid)
5045        IMPLICIT NONE
5046        INTEGER, INTENT (INOUT), DEVICE :: msg_d
5047        INTEGER :: msg_h
5048        INTEGER, INTENT(IN) :: gid
5049        !
5050        INTEGER :: msglen, ierr
5051        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5052        IF ( mp_size(gid) == 1 ) THEN
5053          ierr = cudaDeviceSynchronize()
5054          RETURN
5055        END IF
5056        !
5057#if defined(__MPI)
5058        msglen = 1
5059#if  defined(__GPU_MPI)
5060        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
5061        CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
5062        ! Sync not needed after MPI call
5063#else
5064        msg_h = msg_d                             ! This syncs __MPI case
5065        CALL parallel_max_integer( msglen, msg_h, gid, -1 )
5066        msg_d = msg_h
5067        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5068#endif
5069#endif
5070      END SUBROUTINE mp_max_i_gpu
5071!
5072!------------------------------------------------------------------------------!
5073!
5074      SUBROUTINE mp_max_iv_gpu(msg_d,gid)
5075        IMPLICIT NONE
5076        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
5077        INTEGER, ALLOCATABLE :: msg_h(:)
5078        INTEGER, INTENT(IN) :: gid
5079        !
5080        INTEGER :: msglen, ierr
5081        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5082        IF ( mp_size(gid) == 1 ) THEN
5083          ierr = cudaDeviceSynchronize()
5084          RETURN
5085        END IF
5086        !
5087#if defined(__MPI)
5088#if  defined(__GPU_MPI)
5089        msglen = size(msg_d)
5090        ierr = cudaDeviceSynchronize()            ! This syncs __GPU_MPI
5091        CALL parallel_max_integer_gpu( msglen, msg_d, gid, -1 )
5092        ! Sync not needed after MPI call
5093#else
5094        ALLOCATE( msg_h, source=msg_d )           ! This syncs __MPI case
5095        msglen = size(msg_h)
5096        CALL parallel_max_integer( msglen, msg_h, gid, -1 )
5097        msg_d = msg_h; DEALLOCATE(msg_h)
5098        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5099#endif
5100#endif
5101      END SUBROUTINE mp_max_iv_gpu
5102!
5103!----------------------------------------------------------------------
5104!
5105      SUBROUTINE mp_max_r_gpu(msg_d,gid)
5106        IMPLICIT NONE
5107        REAL (DP), INTENT (INOUT), DEVICE :: msg_d
5108        REAL (DP) :: msg_h
5109        INTEGER, INTENT(IN) :: gid
5110        !
5111        INTEGER :: msglen, ierr
5112        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5113        IF ( mp_size(gid) == 1 ) THEN
5114          ierr = cudaDeviceSynchronize()
5115          RETURN
5116        END IF
5117        !
5118#if defined(__MPI)
5119        msglen = 1
5120#if defined(__GPU_MPI)
5121        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5122        CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
5123        ! Sync not needed after MPI call
5124#else
5125        msg_h = msg_d                    ! This syncs __MPI case
5126        CALL parallel_max_real( msglen, msg_h, gid, -1 )
5127        msg_d = msg_h
5128        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5129#endif
5130#endif
5131      END SUBROUTINE mp_max_r_gpu
5132!
5133!------------------------------------------------------------------------------!
5134!
5135      SUBROUTINE mp_max_rv_gpu(msg_d,gid)
5136        IMPLICIT NONE
5137        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
5138        REAL (DP), ALLOCATABLE :: msg_h(:)
5139        INTEGER, INTENT(IN) :: gid
5140        !
5141        INTEGER :: msglen, ierr
5142        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5143        IF ( mp_size(gid) == 1 ) THEN
5144          ierr = cudaDeviceSynchronize()
5145          RETURN
5146        END IF
5147        !
5148#if defined(__MPI)
5149#if  defined(__GPU_MPI)
5150        msglen = size(msg_d)
5151        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5152        CALL parallel_max_real_gpu( msglen, msg_d, gid, -1 )
5153        ! Sync not needed after MPI call
5154#else
5155        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
5156        msglen = size(msg_h)
5157        CALL parallel_max_real( msglen, msg_h, gid, -1 )
5158        msg_d = msg_h; DEALLOCATE(msg_h)
5159        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5160#endif
5161#endif
5162      END SUBROUTINE mp_max_rv_gpu
5163!
5164!------------------------------------------------------------------------------!
5165!
5166      SUBROUTINE mp_min_i_gpu(msg_d,gid)
5167        IMPLICIT NONE
5168        INTEGER, INTENT (INOUT), DEVICE :: msg_d
5169        INTEGER  :: msg_h
5170        INTEGER, INTENT(IN) :: gid
5171        !
5172        INTEGER :: msglen, ierr
5173        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5174        IF ( mp_size(gid) == 1 ) THEN
5175          ierr = cudaDeviceSynchronize()
5176          RETURN
5177        END IF
5178        !
5179#if defined(__MPI)
5180        msglen = 1
5181#if  defined(__GPU_MPI)
5182        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5183        CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
5184        ! Sync not needed after MPI call
5185#else
5186        msg_h = msg_d                    ! This syncs __MPI case
5187        CALL parallel_min_integer( msglen, msg_h, gid, -1 )
5188        msg_d = msg_h
5189        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5190#endif
5191#endif
5192      END SUBROUTINE mp_min_i_gpu
5193!
5194!------------------------------------------------------------------------------!
5195!
5196      SUBROUTINE mp_min_iv_gpu(msg_d,gid)
5197        IMPLICIT NONE
5198        INTEGER, INTENT (INOUT), DEVICE :: msg_d(:)
5199        INTEGER, ALLOCATABLE :: msg_h(:)
5200        INTEGER, INTENT(IN) :: gid
5201        !
5202        INTEGER :: msglen, ierr
5203        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5204        IF ( mp_size(gid) == 1 ) THEN
5205          ierr = cudaDeviceSynchronize()
5206          RETURN
5207        END IF
5208        !
5209#if defined(__MPI)
5210#if  defined(__GPU_MPI)
5211        msglen = SIZE(msg_d)
5212        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5213        CALL parallel_min_integer_gpu( msglen, msg_d, gid, -1 )
5214        ! Sync not needed after MPI call
5215#else
5216        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
5217        msglen = size(msg_h)
5218        CALL parallel_min_integer( msglen, msg_h, gid, -1 )
5219        msg_d = msg_h; DEALLOCATE(msg_h)
5220        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5221#endif
5222#endif
5223      END SUBROUTINE mp_min_iv_gpu
5224!
5225!------------------------------------------------------------------------------!
5226!
5227      SUBROUTINE mp_min_r_gpu(msg_d,gid)
5228        IMPLICIT NONE
5229        REAL (DP), INTENT (INOUT), DEVICE :: msg_d
5230        REAL (DP) :: msg_h
5231        INTEGER, INTENT(IN) :: gid
5232        !
5233        INTEGER :: msglen, ierr
5234        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5235        IF ( mp_size(gid) == 1 ) THEN
5236          ierr = cudaDeviceSynchronize()
5237          RETURN
5238        END IF
5239        !
5240#if defined(__MPI)
5241        msglen = 1
5242#if  defined(__GPU_MPI)
5243        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5244        CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
5245        ! Sync not needed after MPI call
5246#else
5247        msg_h = msg_d                    ! This syncs __MPI case
5248        CALL parallel_min_real( msglen, msg_h, gid, -1 )
5249        msg_d = msg_h
5250        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5251#endif
5252#endif
5253      END SUBROUTINE mp_min_r_gpu
5254!
5255!------------------------------------------------------------------------------!
5256!
5257      SUBROUTINE mp_min_rv_gpu(msg_d,gid)
5258        IMPLICIT NONE
5259        REAL (DP), INTENT (INOUT), DEVICE :: msg_d(:)
5260        REAL (DP), ALLOCATABLE :: msg_h(:)
5261        INTEGER, INTENT(IN) :: gid
5262        !
5263        INTEGER :: msglen, ierr
5264        ! Avoid unnecessary communications on __MPI and syncs SERIAL
5265        IF ( mp_size(gid) == 1 ) THEN
5266          ierr = cudaDeviceSynchronize()
5267          RETURN
5268        END IF
5269        !
5270#if defined(__MPI)
5271#if  defined(__GPU_MPI)
5272        msglen = size(msg_d)
5273        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5274        CALL parallel_min_real_gpu( msglen, msg_d, gid, -1 )
5275        ! Sync not needed after MPI call
5276#else
5277        ALLOCATE( msg_h, source=msg_d )  ! This syncs __MPI case
5278        msglen = size(msg_h)
5279        CALL parallel_min_real( msglen, msg_h, gid, -1 )
5280        msg_d = msg_h; DEALLOCATE(msg_h)
5281        ierr = cudaDeviceSynchronize()  ! This syncs __MPI for small copies
5282#endif
5283#endif
5284      END SUBROUTINE mp_min_rv_gpu
5285!
5286!------------------------------------------------------------------------------!
5287!..mp_gather
5288
5289      SUBROUTINE mp_gather_i1_gpu(mydata_d, alldata_d, root, gid)
5290        IMPLICIT NONE
5291        INTEGER, DEVICE :: mydata_d
5292        INTEGER, INTENT(IN) :: gid, root
5293        INTEGER :: group
5294        INTEGER, INTENT(OUT), DEVICE :: alldata_d(:)
5295        INTEGER :: ierr
5296
5297
5298#if defined (__MPI)
5299#if ! defined(__GPU_MPI)
5300        INTEGER :: mydata_h
5301        INTEGER, ALLOCATABLE :: alldata_h(:)
5302        ALLOCATE( alldata_h, source=alldata_d ) ! This syncs __MPI
5303        mydata_h = mydata_d
5304        CALL mp_gather_i1(mydata_h, alldata_h, root, gid)
5305        mydata_d = mydata_h; alldata_d = alldata_h
5306        DEALLOCATE(alldata_h)
5307#else
5308        group = gid
5309        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5310        CALL MPI_GATHER(mydata_d, 1, MPI_INTEGER, alldata_d, 1, MPI_INTEGER, root, group, IERR)
5311        IF (ierr/=0) CALL mp_stop( 9051 )
5312        RETURN ! Sync not needed after MPI call
5313#endif
5314#else
5315        !alldata_d(1) = mydata_d
5316        ierr = cudaMemcpy( alldata_d(1), mydata_d, 1, &
5317                                            & cudaMemcpyDeviceToDevice )
5318        IF (ierr/=0) CALL mp_stop( 9052 )
5319#endif
5320        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5321      END SUBROUTINE mp_gather_i1_gpu
5322!
5323!------------------------------------------------------------------------------!
5324!
5325      SUBROUTINE mp_gather_iv_gpu(mydata_d, alldata_d, root, gid)
5326        IMPLICIT NONE
5327        INTEGER, DEVICE :: mydata_d(:)
5328        INTEGER, INTENT(IN) :: gid, root
5329        INTEGER :: group
5330        INTEGER, INTENT(OUT), DEVICE :: alldata_d(:,:)
5331        INTEGER :: msglen, ierr, i
5332
5333
5334#if defined (__MPI)
5335#if ! defined(__GPU_MPI)
5336        INTEGER, ALLOCATABLE :: mydata_h(:)
5337        INTEGER, ALLOCATABLE :: alldata_h(:,:)
5338        ALLOCATE( mydata_h, source=mydata_d )     ! This syncs __MPI
5339        ALLOCATE( alldata_h, source=alldata_d )
5340
5341        CALL mp_gather_iv(mydata_h, alldata_h, root, gid)
5342        mydata_d = mydata_h; alldata_d = alldata_h
5343        DEALLOCATE(alldata_h, mydata_h)
5344#else
5345        msglen = SIZE(mydata_d)
5346        IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9053 )
5347        group = gid
5348        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5349        CALL MPI_GATHER(mydata_d, msglen, MPI_INTEGER, alldata_d, msglen, MPI_INTEGER, root, group, IERR)
5350        IF (ierr/=0) CALL mp_stop( 9054 )
5351        RETURN ! Sync not needed after MPI call
5352#endif
5353#else
5354        msglen = SIZE(mydata_d)
5355        IF( msglen .NE. SIZE(alldata_d, 1) ) CALL mp_stop( 9055 )
5356        !alldata_d(:,1) = mydata_d(:)
5357        ierr = cudaMemcpy(alldata_d(:,1) , mydata_d(1), msglen, cudaMemcpyDeviceToDevice )
5358        IF (ierr/=0) CALL mp_stop( 9056 )
5359#endif
5360        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5361      END SUBROUTINE mp_gather_iv_gpu
5362!
5363!------------------------------------------------------------------------------!
5364!..mp_gatherv_rv
5365!
5366      SUBROUTINE mp_gatherv_rv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
5367        IMPLICIT NONE
5368        REAL(DP), DEVICE :: mydata_d(:)
5369        REAL(DP), DEVICE :: alldata_d(:)
5370        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
5371        INTEGER, INTENT(IN) :: gid
5372        INTEGER :: group
5373        INTEGER :: ierr, npe, myid
5374#if defined (__MPI)
5375#if ! defined(__GPU_MPI)
5376        REAL(DP), ALLOCATABLE :: mydata_h(:)
5377        REAL(DP), ALLOCATABLE :: alldata_h(:)
5378
5379        ALLOCATE(mydata_h, source=mydata_d)     ! This syncs __MPI
5380        ALLOCATE(alldata_h, source=alldata_d)
5381        CALL mp_gatherv_rv( mydata_h, alldata_h, recvcount, displs, root, gid)
5382        alldata_d = alldata_h ; mydata_d = mydata_h
5383        DEALLOCATE(alldata_h , mydata_h)
5384#else
5385
5386        group = gid
5387        CALL mpi_comm_size( group, npe, ierr )
5388        IF (ierr/=0) CALL mp_stop( 9057 )
5389        CALL mpi_comm_rank( group, myid, ierr )
5390        IF (ierr/=0) CALL mp_stop( 9058 )
5391        !
5392        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9059 )
5393        IF ( myid == root ) THEN
5394           IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9060 )
5395        END IF
5396        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9061 )
5397        !
5398        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5399        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_PRECISION, &
5400                         alldata_d, recvcount, displs, MPI_DOUBLE_PRECISION, root, group, ierr )
5401        IF (ierr/=0) CALL mp_stop( 9062 )
5402        RETURN ! Sync not needed after MPI call
5403#endif
5404#else
5405        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9063 )
5406        IF ( SIZE( mydata_d  ) < recvcount( 1 ) ) CALL mp_stop( 9064 )
5407        !
5408        !alldata_d( 1:recvcount( 1 ) ) = mydata_d( 1:recvcount( 1 ) )
5409        ierr = cudaMemcpy(alldata_d(1) , mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
5410        IF (ierr/=0) CALL mp_stop( 9065 )
5411#endif
5412        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5413      END SUBROUTINE mp_gatherv_rv_gpu
5414!
5415!------------------------------------------------------------------------------!
5416!..mp_gatherv_cv
5417!
5418      SUBROUTINE mp_gatherv_cv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
5419        IMPLICIT NONE
5420        COMPLEX(DP), DEVICE :: mydata_d(:)
5421        COMPLEX(DP), DEVICE :: alldata_d(:)
5422        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
5423        INTEGER, INTENT(IN) :: gid
5424        INTEGER :: group
5425        INTEGER :: ierr, npe, myid
5426
5427#if defined (__MPI)
5428#if ! defined(__GPU_MPI)
5429        COMPLEX(DP), ALLOCATABLE :: mydata_h(:)
5430        COMPLEX(DP), ALLOCATABLE :: alldata_h(:)
5431
5432        ALLOCATE(mydata_h, source=mydata_d)     ! This syncs __MPI
5433        ALLOCATE(alldata_h, source=alldata_d)
5434        CALL mp_gatherv_cv( mydata_h, alldata_h, recvcount, displs, root, gid)
5435        alldata_d = alldata_h ; mydata_d = mydata_h
5436        DEALLOCATE(alldata_h , mydata_h)
5437#else
5438        group = gid
5439        CALL mpi_comm_size( group, npe, ierr )
5440        IF (ierr/=0) CALL mp_stop( 9066 )
5441        CALL mpi_comm_rank( group, myid, ierr )
5442        IF (ierr/=0) CALL mp_stop( 9067 )
5443        !
5444        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9068 )
5445        IF ( myid == root ) THEN
5446           IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9069 )
5447        END IF
5448        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9070 )
5449        !
5450        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5451        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_DOUBLE_COMPLEX, &
5452                         alldata_d, recvcount, displs, MPI_DOUBLE_COMPLEX, root, group, ierr )
5453        IF (ierr/=0) CALL mp_stop( 9071 )
5454        RETURN ! Sync not needed after MPI call
5455#endif
5456#else
5457        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9072 )
5458        IF ( SIZE( mydata_d  ) < recvcount( 1 ) ) CALL mp_stop( 9073 )
5459        !
5460        !alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
5461        ierr = cudaMemcpy(alldata_d(1) , mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
5462#endif
5463        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5464      END SUBROUTINE mp_gatherv_cv_gpu
5465!
5466!------------------------------------------------------------------------------!
5467!..mp_gatherv_rv_gpu
5468!
5469
5470      SUBROUTINE mp_gatherv_iv_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
5471        IMPLICIT NONE
5472        INTEGER, DEVICE :: mydata_d(:)
5473        INTEGER, DEVICE :: alldata_d(:)
5474        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
5475        INTEGER, INTENT(IN) :: gid
5476        INTEGER :: group
5477        INTEGER :: ierr, npe, myid
5478
5479#if defined (__MPI)
5480#if ! defined(__GPU_MPI)
5481        INTEGER, ALLOCATABLE :: mydata_h(:)
5482        INTEGER, ALLOCATABLE :: alldata_h(:)
5483
5484        ALLOCATE(mydata_h, source=mydata_d)     ! This syncs __MPI
5485        ALLOCATE(alldata_h, source=alldata_d)
5486        CALL mp_gatherv_iv( mydata_h, alldata_h, recvcount, displs, root, gid)
5487        alldata_d = alldata_h ; mydata_d = mydata_h
5488        DEALLOCATE(alldata_h , mydata_h)
5489#else
5490        group = gid
5491        CALL mpi_comm_size( group, npe, ierr )
5492        IF (ierr/=0) CALL mp_stop( 9074 )
5493        CALL mpi_comm_rank( group, myid, ierr )
5494        IF (ierr/=0) CALL mp_stop( 9075 )
5495        !
5496        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9076 )
5497        IF ( myid == root ) THEN
5498           IF ( SIZE( alldata_d ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9077 )
5499        END IF
5500        IF ( SIZE( mydata_d ) < recvcount( myid + 1 ) ) CALL mp_stop( 9078 )
5501        !
5502        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5503        CALL MPI_GATHERV( mydata_d, recvcount( myid + 1 ), MPI_INTEGER, &
5504                         alldata_d, recvcount, displs, MPI_INTEGER, root, group, ierr )
5505        IF (ierr/=0) CALL mp_stop( 9079 )
5506        RETURN ! Sync not needed after MPI call
5507#endif
5508#else
5509        IF ( SIZE( alldata_d ) < recvcount( 1 ) ) CALL mp_stop( 9080 )
5510        IF ( SIZE( mydata_d  ) < recvcount( 1 ) ) CALL mp_stop( 9081 )
5511        !
5512        !alldata( 1:recvcount( 1 ) ) = mydata( 1:recvcount( 1 ) )
5513        ierr = cudaMemcpy(alldata_d(1) , mydata_d(1), recvcount( 1 ), cudaMemcpyDeviceToDevice )
5514#endif
5515        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5516      END SUBROUTINE mp_gatherv_iv_gpu
5517!
5518!------------------------------------------------------------------------------!
5519!..mp_gatherv_rm
5520!
5521
5522      SUBROUTINE mp_gatherv_rm_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
5523        IMPLICIT NONE
5524        REAL(DP), DEVICE :: mydata_d(:,:)  ! Warning first dimension is supposed constant!
5525        REAL(DP), DEVICE :: alldata_d(:,:)
5526        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
5527        INTEGER, INTENT(IN) :: gid
5528        INTEGER :: group
5529        INTEGER :: ierr, npe, myid, nsiz
5530        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
5531
5532
5533#if defined (__MPI)
5534#if ! defined(__GPU_MPI)
5535        REAL(DP), ALLOCATABLE :: mydata_h(:,:)
5536        REAL(DP), ALLOCATABLE :: alldata_h(:,:)
5537
5538        ALLOCATE(mydata_h, source=mydata_d)     ! This syncs __MPI
5539        ALLOCATE(alldata_h, source=alldata_d)
5540        CALL mp_gatherv_rm( mydata_h, alldata_h, recvcount, displs, root, gid)
5541        alldata_d = alldata_h ; mydata_d = mydata_h
5542        DEALLOCATE(alldata_h , mydata_h)
5543#else
5544        group = gid
5545        CALL mpi_comm_size( group, npe, ierr )
5546        IF (ierr/=0) CALL mp_stop( 9082 )
5547        CALL mpi_comm_rank( group, myid, ierr )
5548        IF (ierr/=0) CALL mp_stop( 9083 )
5549        !
5550        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9084 )
5551        IF ( myid == root ) THEN
5552           IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9085 )
5553           IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9086 )
5554        END IF
5555        IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9087 )
5556        !
5557        ALLOCATE( nrecv( npe ), ndisp( npe ) )
5558        !
5559        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 )
5560        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 )
5561        !
5562        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5563        CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_DOUBLE_PRECISION, &
5564                         alldata_d, nrecv, ndisp, MPI_DOUBLE_PRECISION, root, group, ierr )
5565        IF (ierr/=0) CALL mp_stop( 9088 )
5566        !
5567        DEALLOCATE( nrecv, ndisp )
5568        !
5569        RETURN ! Sync not needed after MPI call
5570#endif
5571#else
5572        IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9089 )
5573        IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9090 )
5574        IF ( SIZE( mydata_d, 2  ) < recvcount( 1 ) ) CALL mp_stop( 9091 )
5575        !
5576        !alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
5577
5578        ierr = cudaMemcpy2D(alldata_d, SIZE(alldata_d,1),&
5579                              mydata_d, SIZE(mydata_d,1),&
5580                              SIZE(mydata_d,1), recvcount( 1 ), &
5581                              cudaMemcpyDeviceToDevice )
5582
5583        IF (ierr/=0) CALL mp_stop( 9092 )
5584#endif
5585        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5586      END SUBROUTINE mp_gatherv_rm_gpu
5587!
5588!------------------------------------------------------------------------------!
5589!..mp_gatherv_im
5590!
5591      SUBROUTINE mp_gatherv_im_gpu( mydata_d, alldata_d, recvcount, displs, root, gid)
5592        IMPLICIT NONE
5593        INTEGER, DEVICE :: mydata_d(:,:)  ! Warning first dimension is supposed constant!
5594        INTEGER, DEVICE :: alldata_d(:,:)
5595        INTEGER, INTENT(IN) :: recvcount(:), displs(:), root
5596        INTEGER, INTENT(IN) :: gid
5597        INTEGER :: group
5598        INTEGER :: ierr, npe, myid, nsiz
5599        INTEGER, ALLOCATABLE :: nrecv(:), ndisp(:)
5600
5601
5602#if defined (__MPI)
5603#if ! defined(__GPU_MPI)
5604        INTEGER, ALLOCATABLE :: mydata_h(:,:)
5605        INTEGER, ALLOCATABLE :: alldata_h(:,:)
5606
5607        ALLOCATE(mydata_h, source=mydata_d)     ! This syncs __MPI
5608        ALLOCATE(alldata_h, source=alldata_d)
5609        CALL mp_gatherv_im( mydata_h, alldata_h, recvcount, displs, root, gid)
5610        alldata_d = alldata_h ; mydata_d = mydata_h
5611        DEALLOCATE(alldata_h , mydata_h)
5612#else
5613        group = gid
5614        CALL mpi_comm_size( group, npe, ierr )
5615        IF (ierr/=0) CALL mp_stop( 9093 )
5616        CALL mpi_comm_rank( group, myid, ierr )
5617        IF (ierr/=0) CALL mp_stop( 9094 )
5618        !
5619        IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9095 )
5620        IF ( myid == root ) THEN
5621           IF ( SIZE( alldata_d, 2 ) < displs( npe ) + recvcount( npe ) ) CALL mp_stop( 9096 )
5622           IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9097 )
5623        END IF
5624        IF ( SIZE( mydata_d, 2 ) < recvcount( myid + 1 ) ) CALL mp_stop( 9098 )
5625        !
5626        ALLOCATE( nrecv( npe ), ndisp( npe ) )
5627        !
5628        nrecv( 1:npe ) = recvcount( 1:npe ) * SIZE( mydata_d, 1 )
5629        ndisp( 1:npe ) = displs( 1:npe ) * SIZE( mydata_d, 1 )
5630        !
5631        ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5632        CALL MPI_GATHERV( mydata_d, nrecv( myid + 1 ), MPI_INTEGER, &
5633                         alldata_d, nrecv, ndisp, MPI_INTEGER, root, group, ierr )
5634        IF (ierr/=0) CALL mp_stop( 9099 )
5635        !
5636        DEALLOCATE( nrecv, ndisp )
5637        !
5638        RETURN ! Sync not needed after MPI call
5639#endif
5640#else
5641        IF ( SIZE( alldata_d, 1 ) /= SIZE( mydata_d, 1 ) ) CALL mp_stop( 9100 )
5642        IF ( SIZE( alldata_d, 2 ) < recvcount( 1 ) ) CALL mp_stop( 9101 )
5643        IF ( SIZE( mydata_d, 2  ) < recvcount( 1 ) ) CALL mp_stop( 9102 )
5644        !
5645        !alldata( :, 1:recvcount( 1 ) ) = mydata( :, 1:recvcount( 1 ) )
5646
5647        ierr = cudaMemcpy2D(alldata_d, SIZE(alldata_d,1),&
5648                              mydata_d, SIZE(mydata_d,1),&
5649                              SIZE(mydata_d,1), recvcount( 1 ), &
5650                              cudaMemcpyDeviceToDevice )
5651
5652        IF (ierr/=0) CALL mp_stop( 9103 )
5653#endif
5654        ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5655      END SUBROUTINE mp_gatherv_im_gpu
5656!
5657!------------------------------------------------------------------------------!
5658!
5659      SUBROUTINE mp_alltoall_c3d_gpu( sndbuf_d, rcvbuf_d, gid )
5660         IMPLICIT NONE
5661         COMPLEX(DP), DEVICE :: sndbuf_d( :, :, : )
5662         COMPLEX(DP), DEVICE :: rcvbuf_d( :, :, : )
5663         INTEGER, INTENT(IN) :: gid
5664         INTEGER :: nsiz, group, ierr, npe
5665
5666#if defined (__MPI)
5667#if ! defined(__GPU_MPI)
5668         COMPLEX(DP), ALLOCATABLE :: sndbuf_h(:,:,:)
5669         COMPLEX(DP), ALLOCATABLE :: rcvbuf_h(:,:,:)
5670
5671         ALLOCATE(sndbuf_h, source=sndbuf_d)     ! This syncs __MPI
5672         ALLOCATE(rcvbuf_h, source=rcvbuf_d)
5673         CALL mp_alltoall_c3d( sndbuf_h, rcvbuf_h, gid )
5674         sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h
5675         DEALLOCATE(sndbuf_h , rcvbuf_h)
5676#else
5677         group = gid
5678
5679         CALL mpi_comm_size( group, npe, ierr )
5680         IF (ierr/=0) CALL mp_stop( 9104 )
5681
5682         IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9105 )
5683         IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9106 )
5684
5685         nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 )
5686         !
5687         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5688         !
5689         CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_DOUBLE_COMPLEX, &
5690                            rcvbuf_d, nsiz, MPI_DOUBLE_COMPLEX, group, ierr )
5691
5692         IF (ierr/=0) CALL mp_stop( 9107 )
5693         RETURN ! Sync not needed after MPI call
5694#endif
5695#else
5696         rcvbuf_d = sndbuf_d
5697#endif
5698         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5699      END SUBROUTINE mp_alltoall_c3d_gpu
5700!
5701!------------------------------------------------------------------------------!
5702!
5703      SUBROUTINE mp_alltoall_i3d_gpu( sndbuf_d, rcvbuf_d, gid )
5704         IMPLICIT NONE
5705         INTEGER, DEVICE :: sndbuf_d( :, :, : )
5706         INTEGER, DEVICE :: rcvbuf_d( :, :, : )
5707         INTEGER, INTENT(IN) :: gid
5708         INTEGER :: nsiz, group, ierr, npe
5709
5710#if defined (__MPI)
5711#if ! defined(__GPU_MPI)
5712         INTEGER, ALLOCATABLE :: sndbuf_h(:,:,:)
5713         INTEGER, ALLOCATABLE :: rcvbuf_h(:,:,:)
5714
5715         ALLOCATE(sndbuf_h, source=sndbuf_d)     ! This syncs __MPI
5716         ALLOCATE(rcvbuf_h, source=rcvbuf_d)
5717         CALL mp_alltoall_i3d( sndbuf_h, rcvbuf_h, gid )
5718         sndbuf_d = sndbuf_h ; rcvbuf_d = rcvbuf_h
5719         DEALLOCATE(sndbuf_h , rcvbuf_h)
5720#else
5721         group = gid
5722
5723         CALL mpi_comm_size( group, npe, ierr )
5724         IF (ierr/=0) CALL mp_stop( 9108 )
5725
5726         IF ( SIZE( sndbuf_d, 3 ) < npe ) CALL mp_stop( 9109 )
5727         IF ( SIZE( rcvbuf_d, 3 ) < npe ) CALL mp_stop( 9110 )
5728
5729         nsiz = SIZE( sndbuf_d, 1 ) * SIZE( sndbuf_d, 2 )
5730         !
5731         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5732         !
5733         CALL MPI_ALLTOALL( sndbuf_d, nsiz, MPI_INTEGER, &
5734                            rcvbuf_d, nsiz, MPI_INTEGER, group, ierr )
5735
5736         IF (ierr/=0) CALL mp_stop( 9111 )
5737         RETURN ! Sync not needed after MPI call
5738#endif
5739#else
5740
5741         rcvbuf_d = sndbuf_d
5742
5743#endif
5744         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5745      END SUBROUTINE mp_alltoall_i3d_gpu
5746!
5747!------------------------------------------------------------------------------!
5748!
5749      SUBROUTINE mp_circular_shift_left_i0_gpu( buf_d, itag, gid )
5750         IMPLICIT NONE
5751         INTEGER, DEVICE :: buf_d
5752         INTEGER, INTENT(IN) :: itag
5753         INTEGER, INTENT(IN) :: gid
5754         INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
5755
5756#if defined (__MPI)
5757#if ! defined(__GPU_MPI)
5758         INTEGER :: buf_h
5759         buf_h = buf_d     ! This syncs __MPI
5760         CALL mp_circular_shift_left_i0( buf_h, itag, gid )
5761         buf_d = buf_h
5762#else
5763         INTEGER :: istatus( mpi_status_size )
5764         !
5765         group = gid
5766         !
5767         CALL mpi_comm_size( group, npe, ierr )
5768         IF (ierr/=0) CALL mp_stop( 9112 )
5769         CALL mpi_comm_rank( group, mype, ierr )
5770         IF (ierr/=0) CALL mp_stop( 9113 )
5771         !
5772         sour = mype + 1
5773         IF( sour == npe ) sour = 0
5774         dest = mype - 1
5775         IF( dest == -1 ) dest = npe - 1
5776         !
5777         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5778         CALL MPI_Sendrecv_replace( buf_d, 1, MPI_INTEGER, &
5779              dest, itag, sour, itag, group, istatus, ierr)
5780         !
5781         IF (ierr/=0) CALL mp_stop( 9114 )
5782         !
5783         RETURN ! Sync not needed after MPI call
5784#endif
5785#else
5786         ! do nothing
5787#endif
5788         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5789      END SUBROUTINE mp_circular_shift_left_i0_gpu
5790!
5791!------------------------------------------------------------------------------!
5792!
5793      SUBROUTINE mp_circular_shift_left_i1_gpu( buf_d, itag, gid )
5794         IMPLICIT NONE
5795         INTEGER, DEVICE :: buf_d(:)
5796         INTEGER, INTENT(IN) :: itag
5797         INTEGER, INTENT(IN) :: gid
5798         INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
5799
5800#if defined (__MPI)
5801#if ! defined(__GPU_MPI)
5802         INTEGER, ALLOCATABLE :: buf_h(:)
5803         ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
5804         CALL mp_circular_shift_left_i1( buf_h, itag, gid )
5805         buf_d = buf_h; DEALLOCATE(buf_h)
5806#else
5807         INTEGER :: istatus( mpi_status_size )
5808         !
5809         group = gid
5810         !
5811         CALL mpi_comm_size( group, npe, ierr )
5812         IF (ierr/=0) CALL mp_stop( 9115 )
5813         CALL mpi_comm_rank( group, mype, ierr )
5814         IF (ierr/=0) CALL mp_stop( 9116 )
5815         !
5816         sour = mype + 1
5817         IF( sour == npe ) sour = 0
5818         dest = mype - 1
5819         IF( dest == -1 ) dest = npe - 1
5820         !
5821         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5822         CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, &
5823              dest, itag, sour, itag, group, istatus, ierr)
5824         !
5825         IF (ierr/=0) CALL mp_stop( 9117 )
5826         !
5827         RETURN ! Sync not needed after MPI call
5828#endif
5829#else
5830         ! do nothing
5831#endif
5832         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5833      END SUBROUTINE mp_circular_shift_left_i1_gpu
5834!
5835!------------------------------------------------------------------------------!
5836!
5837      SUBROUTINE mp_circular_shift_left_i2_gpu( buf_d, itag, gid )
5838         IMPLICIT NONE
5839         INTEGER, DEVICE :: buf_d(:,:)
5840         INTEGER, INTENT(IN) :: itag
5841         INTEGER, INTENT(IN) :: gid
5842         INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
5843
5844#if defined (__MPI)
5845#if ! defined(__GPU_MPI)
5846         INTEGER, ALLOCATABLE :: buf_h(:,:)
5847         ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
5848         CALL mp_circular_shift_left_i2( buf_h, itag, gid )
5849         buf_d = buf_h; DEALLOCATE(buf_h)
5850#else
5851         INTEGER :: istatus( mpi_status_size )
5852         !
5853         group = gid
5854         !
5855         CALL mpi_comm_size( group, npe, ierr )
5856         IF (ierr/=0) CALL mp_stop( 9118 )
5857         CALL mpi_comm_rank( group, mype, ierr )
5858         IF (ierr/=0) CALL mp_stop( 9119 )
5859         !
5860         sour = mype + 1
5861         IF( sour == npe ) sour = 0
5862         dest = mype - 1
5863         IF( dest == -1 ) dest = npe - 1
5864         !
5865         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5866         CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_INTEGER, &
5867              dest, itag, sour, itag, group, istatus, ierr)
5868         !
5869         IF (ierr/=0) CALL mp_stop( 9120 )
5870         !
5871         RETURN ! Sync not needed after MPI call
5872#endif
5873#else
5874         ! do nothing
5875#endif
5876         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5877      END SUBROUTINE mp_circular_shift_left_i2_gpu
5878!
5879!------------------------------------------------------------------------------!
5880!
5881      SUBROUTINE mp_circular_shift_left_r2d_gpu( buf_d, itag, gid )
5882         IMPLICIT NONE
5883         REAL(DP), DEVICE :: buf_d( :, : )
5884         INTEGER, INTENT(IN) :: itag
5885         INTEGER, INTENT(IN) :: gid
5886         INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
5887
5888#if defined (__MPI)
5889#if ! defined(__GPU_MPI)
5890         REAL(DP), ALLOCATABLE :: buf_h(:, :)
5891         ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
5892         CALL mp_circular_shift_left_r2d( buf_h, itag, gid )
5893         buf_d = buf_h; DEALLOCATE(buf_h)
5894#else
5895         INTEGER :: istatus( mpi_status_size )
5896         !
5897         group = gid
5898         !
5899         CALL mpi_comm_size( group, npe, ierr )
5900         IF (ierr/=0) CALL mp_stop( 9121 )
5901         CALL mpi_comm_rank( group, mype, ierr )
5902         IF (ierr/=0) CALL mp_stop( 9122 )
5903         !
5904         sour = mype + 1
5905         IF( sour == npe ) sour = 0
5906         dest = mype - 1
5907         IF( dest == -1 ) dest = npe - 1
5908         !
5909         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5910         CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_PRECISION, &
5911              dest, itag, sour, itag, group, istatus, ierr)
5912         !
5913         IF (ierr/=0) CALL mp_stop( 9123 )
5914         !
5915         RETURN ! Sync not needed after MPI call
5916#endif
5917#else
5918         ! do nothing
5919#endif
5920         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5921      END SUBROUTINE mp_circular_shift_left_r2d_gpu
5922!
5923!------------------------------------------------------------------------------!
5924!
5925      SUBROUTINE mp_circular_shift_left_c2d_gpu( buf_d, itag, gid )
5926         IMPLICIT NONE
5927         COMPLEX(DP), DEVICE :: buf_d( :, : )
5928         INTEGER, INTENT(IN) :: itag
5929         INTEGER, INTENT(IN) :: gid
5930         INTEGER :: nsiz, group, ierr, npe, sour, dest, mype
5931
5932#if defined (__MPI)
5933#if ! defined(__GPU_MPI)
5934         COMPLEX(DP), ALLOCATABLE :: buf_h(:, :)
5935         ALLOCATE(buf_h, source=buf_d)    ! This syncs __MPI
5936         CALL mp_circular_shift_left_c2d( buf_h, itag, gid )
5937         buf_d = buf_h; DEALLOCATE(buf_h)
5938#else
5939         INTEGER :: istatus( mpi_status_size )
5940         !
5941         group = gid
5942         !
5943         CALL mpi_comm_size( group, npe, ierr )
5944         IF (ierr/=0) CALL mp_stop( 9124 )
5945         CALL mpi_comm_rank( group, mype, ierr )
5946         IF (ierr/=0) CALL mp_stop( 9125 )
5947         !
5948         sour = mype + 1
5949         IF( sour == npe ) sour = 0
5950         dest = mype - 1
5951         IF( dest == -1 ) dest = npe - 1
5952         !
5953         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
5954         CALL MPI_Sendrecv_replace( buf_d, SIZE(buf_d), MPI_DOUBLE_COMPLEX, &
5955              dest, itag, sour, itag, group, istatus, ierr)
5956         !
5957         IF (ierr/=0) CALL mp_stop( 9126 )
5958         !
5959         RETURN ! Sync not needed after MPI call
5960#endif
5961#else
5962         ! do nothing
5963#endif
5964         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
5965      END SUBROUTINE mp_circular_shift_left_c2d_gpu
5966
5967!------------------------------------------------------------------------------!
5968!..mp_gatherv_inplace_cplx_array
5969!
5970
5971      SUBROUTINE mp_gatherv_inplace_cplx_array_gpu(alldata_d, my_column_type, recvcount, displs, root, gid)
5972         IMPLICIT NONE
5973         COMPLEX(DP), DEVICE :: alldata_d(:,:)
5974         INTEGER, INTENT(IN) :: my_column_type
5975         INTEGER, INTENT(IN) :: recvcount(:), displs(:)
5976         INTEGER, INTENT(IN) :: root, gid
5977         INTEGER :: ierr, npe, myid
5978
5979#if defined (__MPI)
5980#if ! defined(__GPU_MPI)
5981         COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :)
5982         !
5983         ! Avoid unnecessary communications on __MPI
5984         IF ( mp_size(gid) == 1 ) THEN
5985           ierr = cudaDeviceSynchronize()
5986           RETURN
5987         END IF
5988         !
5989         ALLOCATE(alldata_h, source=alldata_d)    ! This syncs __MPI
5990         CALL mp_gatherv_inplace_cplx_array(alldata_h, my_column_type, recvcount, displs, root, gid)
5991         alldata_d = alldata_h; DEALLOCATE(alldata_h)
5992         ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
5993         RETURN
5994#else
5995         CALL mpi_comm_size( gid, npe, ierr )
5996         IF (ierr/=0) CALL mp_stop( 9127 )
5997         CALL mpi_comm_rank( gid, myid, ierr )
5998         IF (ierr/=0) CALL mp_stop( 9128 )
5999         !
6000         IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9129 )
6001         !
6002         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
6003         IF (myid==root) THEN
6004            CALL MPI_GATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
6005                              alldata_d, recvcount, displs, my_column_type, root, gid, ierr )
6006         ELSE
6007            CALL MPI_GATHERV( alldata_d(1,displs(myid+1)+1), recvcount(myid+1), my_column_type, &
6008                              MPI_IN_PLACE, recvcount, displs, MPI_DATATYPE_NULL, root, gid, ierr )
6009         ENDIF
6010         !
6011         IF (ierr/=0) CALL mp_stop( 9130 )
6012         !
6013         RETURN ! Sync not needed after MPI call
6014#endif
6015#endif
6016         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL
6017      END SUBROUTINE mp_gatherv_inplace_cplx_array_gpu
6018
6019!------------------------------------------------------------------------------!
6020!..mp_allgatherv_inplace_cplx_array
6021!
6022
6023      SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu(alldata_d, my_element_type, recvcount, displs, gid)
6024         IMPLICIT NONE
6025         COMPLEX(DP), DEVICE :: alldata_d(:,:)
6026         INTEGER, INTENT(IN) :: my_element_type
6027         INTEGER, INTENT(IN) :: recvcount(:), displs(:)
6028         INTEGER, INTENT(IN) :: gid
6029         INTEGER :: ierr, npe, myid
6030
6031#if defined (__MPI)
6032#if ! defined(__GPU_MPI)
6033         COMPLEX(DP), ALLOCATABLE :: alldata_h(:, :)
6034         !
6035         ! Avoid unnecessary communications on __MPI
6036         IF ( mp_size(gid) == 1 ) THEN
6037           ierr = cudaDeviceSynchronize()
6038           RETURN
6039         END IF
6040         !
6041         ALLOCATE(alldata_h, source=alldata_d)! This syncs __MPI
6042         CALL mp_allgatherv_inplace_cplx_array(alldata_h, my_element_type, recvcount, displs, gid)
6043         alldata_d = alldata_h; DEALLOCATE(alldata_h)
6044         ierr = cudaDeviceSynchronize()   ! This syncs in case of small data chunks
6045         RETURN
6046#else
6047         CALL mpi_comm_size( gid, npe, ierr )
6048         IF (ierr/=0) CALL mp_stop( 9131 )
6049         CALL mpi_comm_rank( gid, myid, ierr )
6050         IF (ierr/=0) CALL mp_stop( 9132 )
6051         !
6052         IF ( SIZE( recvcount ) < npe .OR. SIZE( displs ) < npe ) CALL mp_stop( 9133 )
6053         !
6054         ierr = cudaDeviceSynchronize()   ! This syncs __GPU_MPI
6055         CALL MPI_ALLGATHERV( MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &
6056                              alldata_d, recvcount, displs, my_element_type, gid, ierr )
6057         IF (ierr/=0) CALL mp_stop( 9134 )
6058         RETURN ! Sync not needed after MPI call
6059#endif
6060#endif
6061         ierr = cudaDeviceSynchronize()   ! This syncs SERIAL, __MPI
6062      END SUBROUTINE mp_allgatherv_inplace_cplx_array_gpu
6063
6064      SUBROUTINE mp_type_create_cplx_column_section_gpu(dummy, start, length, stride, mytype)
6065         IMPLICIT NONE
6066         !
6067         COMPLEX (DP), DEVICE, INTENT(IN) :: dummy
6068         INTEGER, INTENT(IN) :: start, length, stride
6069         INTEGER, INTENT(OUT) :: mytype
6070         !
6071#if defined(__MPI)
6072         INTEGER :: ierr
6073         !
6074         CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,&
6075                                       MPI_DOUBLE_COMPLEX, mytype, ierr)
6076         IF (ierr/=0) CALL mp_stop( 8081 )
6077         CALL MPI_Type_commit(mytype, ierr)
6078         IF (ierr/=0) CALL mp_stop( 8082 )
6079#else
6080         mytype = 0;
6081#endif
6082         !
6083         RETURN
6084      END SUBROUTINE mp_type_create_cplx_column_section_gpu
6085
6086      SUBROUTINE mp_type_create_real_column_section_gpu(dummy, start, length, stride, mytype)
6087         IMPLICIT NONE
6088         !
6089         REAL (DP), DEVICE, INTENT(IN) :: dummy
6090         INTEGER, INTENT(IN) :: start, length, stride
6091         INTEGER, INTENT(OUT) :: mytype
6092         !
6093#if defined(__MPI)
6094         INTEGER :: ierr
6095         !
6096         CALL MPI_TYPE_CREATE_SUBARRAY(1, stride, length, start, MPI_ORDER_FORTRAN,&
6097                                       MPI_DOUBLE_PRECISION, mytype, ierr)
6098         IF (ierr/=0) CALL mp_stop( 8083 )
6099         CALL MPI_Type_commit(mytype, ierr)
6100         IF (ierr/=0) CALL mp_stop( 8084 )
6101#else
6102         mytype = 0;
6103#endif
6104         !
6105         RETURN
6106      END SUBROUTINE mp_type_create_real_column_section_gpu
6107
6108      SUBROUTINE mp_type_create_cplx_row_section_gpu(dummy, column_start, column_stride, row_length, mytype)
6109         IMPLICIT NONE
6110         !
6111         COMPLEX (DP), DEVICE, INTENT(IN) :: dummy
6112         INTEGER, INTENT(IN) :: column_start, column_stride, row_length
6113         INTEGER, INTENT(OUT) :: mytype
6114         !
6115#if defined(__MPI)
6116         INTEGER :: ierr, temporary
6117         INTEGER :: strides(2), lengths(2), starts(2)
6118         INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
6119         !
6120         strides(1) = column_stride ; strides(2) = row_length
6121         lengths(1) = 1             ; lengths(2) = row_length
6122         starts(1)  = column_start  ; starts(2)  = 0
6123         CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
6124                                       MPI_DOUBLE_COMPLEX, temporary, ierr)
6125         IF (ierr/=0) CALL mp_stop( 8085 )
6126         CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_COMPLEX, lb, extent, ierr)
6127         IF (ierr/=0) CALL mp_stop( 8085 )
6128         CALL MPI_TYPE_COMMIT(temporary, ierr)
6129         IF (ierr/=0) CALL mp_stop( 8085 )
6130         CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
6131         IF (ierr/=0) CALL mp_stop( 8086 )
6132         CALL MPI_Type_commit(mytype, ierr)
6133         IF (ierr/=0) CALL mp_stop( 8086 )
6134#else
6135         mytype = 0;
6136#endif
6137         !
6138         RETURN
6139      END SUBROUTINE mp_type_create_cplx_row_section_gpu
6140
6141      SUBROUTINE mp_type_create_real_row_section_gpu(dummy, column_start, column_stride, row_length, mytype)
6142         IMPLICIT NONE
6143         !
6144         REAL (DP), DEVICE, INTENT(IN) :: dummy
6145         INTEGER, INTENT(IN) :: column_start, column_stride, row_length
6146         INTEGER, INTENT(OUT) :: mytype
6147         !
6148#if defined(__MPI)
6149         INTEGER :: ierr, temporary
6150         INTEGER :: strides(2), lengths(2), starts(2)
6151         INTEGER(KIND=MPI_ADDRESS_KIND) :: lb, extent
6152         !
6153         strides(1) = column_stride ; strides(2) = row_length
6154         lengths(1) = 1             ; lengths(2) = row_length
6155         starts(1)  = column_start  ; starts(2)  = 0
6156         CALL MPI_TYPE_CREATE_SUBARRAY(2, strides, lengths, starts, MPI_ORDER_FORTRAN,&
6157                                       MPI_DOUBLE_PRECISION, temporary, ierr)
6158         IF (ierr/=0) CALL mp_stop( 8087 )
6159         CALL MPI_TYPE_GET_EXTENT(MPI_DOUBLE_PRECISION, lb, extent, ierr)
6160         IF (ierr/=0) CALL mp_stop( 8087 )
6161         CALL MPI_TYPE_COMMIT(temporary, ierr)
6162         IF (ierr/=0) CALL mp_stop( 8087 )
6163         CALL MPI_TYPE_CREATE_RESIZED(temporary, lb, extent, mytype, ierr)
6164         IF (ierr/=0) CALL mp_stop( 8088 )
6165         CALL MPI_Type_commit(mytype, ierr)
6166         IF (ierr/=0) CALL mp_stop( 8088 )
6167#else
6168         mytype = 0;
6169#endif
6170         !
6171         RETURN
6172      END SUBROUTINE mp_type_create_real_row_section_gpu
6173
6174!------------------------------------------------------------------------------!
6175
6176#endif
6177!------------------------------------------------------------------------------!
6178END MODULE mp
6179!------------------------------------------------------------------------------!
6180!
6181! Script to generate stop messages:
6182!   # coding: utf-8
6183!   import re
6184!   import sys
6185!   i = 8000
6186!   def replace(match):
6187!       global i
6188!       i += 1
6189!       return 'mp_stop( {0} )'.format(i)
6190!
6191!   with open(sys.argv[1],'r') as f:
6192!       data = re.sub(r"mp_stop\(\s?\d+\s?\)", replace, f.read())
6193!       with open(sys.argv[1]+'.new','w') as fo:
6194!           fo.write(data)
6195