1! { dg-do compile }
2! { dg-options "-cpp -fcoarray=lib" }
3! PR 87397 - this used to generate an ICE.
4
5! Coarray Distributed Transpose Test
6!
7! Copyright (c) 2012-2014, Sourcery, Inc.
8! All rights reserved.
9!
10! Redistribution and use in source and binary forms, with or without
11! modification, are permitted provided that the following conditions are met:
12!     * Redistributions of source code must retain the above copyright
13!       notice, this list of conditions and the following disclaimer.
14!     * Redistributions in binary form must reproduce the above copyright
15!       notice, this list of conditions and the following disclaimer in the
16!       documentation and/or other materials provided with the distribution.
17!     * Neither the name of the Sourcery, Inc., nor the
18!       names of its contributors may be used to endorse or promote products
19!       derived from this software without specific prior written permission.
20!
21! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22! ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23! WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24! DISCLAIMED. IN NO EVENT SHALL SOURCERY, INC., BE LIABLE FOR ANY
25! DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
28! ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30!
31! Robodoc header:
32!****m* dist_transpose/run_size
33! NAME
34!   run_size
35!  SYNOPSIS
36!   Encapsulate problem state, wall-clock timer interface, integer broadcasts, and a data copy.
37!******
38!==================  test transposes with integer x,y,z values  ===============================
39module run_size
40    use iso_fortran_env
41    implicit none
42
43    integer(int64), codimension[*] :: nx, ny, nz
44    integer(int64), codimension[*] :: my, mx, first_y, last_y, first_x, last_x
45    integer(int64) :: my_node, num_nodes
46    real(real64), codimension[*] :: tran_time
47
48
49contains
50
51!****s* run_size/broadcast_int
52! NAME
53!   broadcast_int
54!  SYNOPSIS
55!   Broadcast a scalar coarray integer from image 1 to all other images.
56!******
57    subroutine broadcast_int( variable )
58        integer(int64), codimension[*] :: variable
59        integer(int64) :: i
60        if( my_node == 1 ) then
61            do i = 2, num_nodes;    variable[i] = variable;      end do
62        end if
63    end subroutine broadcast_int
64
65subroutine copy3( A,B, n1, sA1, sB1, n2, sA2, sB2, n3, sA3, sB3 )
66  implicit none
67  complex, intent(in)  :: A(0:*)
68  complex, intent(out) :: B(0:*)
69  integer(int64), intent(in) :: n1, sA1, sB1
70  integer(int64), intent(in) :: n2, sA2, sB2
71  integer(int64), intent(in) :: n3, sA3, sB3
72  integer(int64) i,j,k
73
74  do k=0,n3-1
75     do j=0,n2-1
76        do i=0,n1-1
77           B(i*sB1+j*sB2+k*sB3) = A(i*sA1+j*sA2+k*sA3)
78        end do
79     end do
80  end do
81end subroutine copy3
82
83end module run_size
84
85!****e* dist_transpose/coarray_distributed_transpose
86! NAME
87!   coarray_distributed_transpose
88! SYNOPSIS
89!   This program tests the transpose routines used in Fourier-spectral simulations of homogeneous turbulence.
90!   The data is presented to the physics routines as groups of y-z or x-z planes distributed among the images.
91!   The (out-of-place) transpose routines do the x <--> y transposes required and consist of transposes within
92!   data blocks (intra-image) and a transpose of the distribution of these blocks among the images (inter-image).
93!
94!   Two methods are tested here:
95!   RECEIVE: receive block from other image and transpose it
96!   SEND:    transpose block and send it to other image
97!
98!   This code is the coarray analog of mpi_distributed_transpose.
99!******
100
101program coarray_distributed_transpose
102  !(***********************************************************************************************************
103  !                   m a i n   p r o g r a m
104  !***********************************************************************************************************)
105      use run_size
106      implicit none
107
108      complex, allocatable ::  u(:,:,:,:)[:]    ! u(nz,4,first_x:last_x,ny)[*]    !(*-- ny = my * num_nodes --*)
109      complex, allocatable ::  ur(:,:,:,:)[:]   !ur(nz,4,first_y:last_y,nx/2)[*]  !(*-- nx/2 = mx * num_nodes --*)
110      complex, allocatable :: bufr_X_Y(:,:,:,:)
111      complex, allocatable :: bufr_Y_X(:,:,:,:)
112      integer(int64) :: x, y, z, msg_size, iter
113
114      num_nodes = num_images()
115      my_node = this_image()
116
117      if( my_node == 1 ) then
118           !write(6,*) "nx,ny,nz : ";      read(5,*) nx, ny, nz
119            nx=32; ny=32; nz=32
120            call broadcast_int( nx );        call broadcast_int( ny );        call broadcast_int( nz );
121       end if
122      sync all  !-- other nodes wait for broadcast!
123
124
125      if ( mod(ny,num_nodes) == 0)  then;   my = ny / num_nodes
126                                    else;   write(6,*) "node ", my_node, " ny not multiple of num_nodes";     error stop
127      end if
128
129      if ( mod(nx/2,num_nodes) == 0)  then;   mx = nx/2 / num_nodes
130                                    else;   write(6,*) "node ", my_node, "nx/2 not multiple of num_nodes";     error stop
131      end if
132
133      first_y = (my_node-1)*my + 1;   last_y  = (my_node-1)*my + my
134      first_x = (my_node-1)*mx + 1;   last_x  = (my_node-1)*mx + mx
135
136      allocate (  u(nz , 4 , first_x:last_x , ny)  [*] )   !(*-- y-z planes --*)
137      allocate ( ur(nz , 4 , first_y:last_y , nx/2)[*] )   !(*-- x-z planes --*)
138      allocate ( bufr_X_Y(nz,4,mx,my) )
139      allocate ( bufr_Y_X(nz,4,my,mx) )
140
141      msg_size = nz*4*mx*my     !-- message size (complex data items)
142
143!---------  initialize data u (mx y-z planes per image) ----------
144
145        do x = first_x, last_x
146            do y = 1, ny
147                do z = 1, nz
148                    u(z,1,x,y) = x
149                    u(z,2,x,y) = y
150                    u(z,3,x,y) = z
151                end do
152            end do
153        end do
154
155    tran_time = 0
156    do iter = 1, 2  !--- 2 transform pairs per second-order time step
157
158!---------  transpose data u -> ur (mx y-z planes to my x-z planes per image)  --------
159
160      ur = 0
161
162      call transpose_X_Y
163
164!--------- test data ur (my x-z planes per image) ----------
165
166        do x = 1, nx/2
167            do y = first_y, last_y
168                do z = 1, nz
169                    if ( real(ur(z,1,y,x)) /= x .or. real(ur(z,2,y,x)) /= y .or. real(ur(z,3,y,x)) /= z )then
170                        write(6,fmt="(A,i3,3(6X,A,f7.3,i4))") "transpose_X_Y failed:  image ", my_node &
171                            , " X ",real(ur(z,1,y,x)),x, "  Y ",real(ur(z,2,y,x)),y, "  Z ", real(ur(z,3,y,x)),z
172                        stop
173                    end if
174                end do
175            end do
176        end do
177
178!---------  transpose data ur -> u (my x-z planes to mx y-z planes per image)  --------
179
180      u = 0
181      call transpose_Y_X
182
183!--------- test data u (mx y-z planes per image) ----------
184
185        do x = first_x, last_x
186            do y = 1, ny
187                do z = 1, nz
188                    if ( real(u(z,1,x,y)) /= x .or. real(u(z,2,x,y)) /= y .or. real(u(z,3,x,y)) /= z )then
189                        write(6,fmt="(A,i3,3(6X,A,f7.3,i4))") "transpose_Y_X failed:  image ", my_node &
190                            , " X ",real(u(z,1,x,y)),x, "  Y ",real(u(z,2,x,y)),y, "  Z ", real(u(z,3,x,y)),z
191                        stop
192                    end if
193                end do
194            end do
195        end do
196    end do
197
198        sync all
199        if( my_node == 1 )  write(6,fmt="(A,f8.3)")  "test passed:  tran_time ", tran_time
200
201    deallocate ( bufr_X_Y );    deallocate ( bufr_Y_X )
202
203!=========================   end of main executable  =============================
204
205contains
206
207!-------------   out-of-place transpose data_s --> data_r  ----------------------------
208
209 subroutine transpose_X_Y
210
211    use run_size
212    implicit none
213
214    integer(int64) :: i,stage
215    real(real64) :: tmp
216
217    sync all   !--  wait for other nodes to finish compute
218    call cpu_time(tmp)
219    tran_time = tran_time - tmp
220
221    call copy3 (    u(1,1,first_x,1+(my_node-1)*my) &                   !-- intra-node transpose
222                ,  ur(1,1,first_y,1+(my_node-1)*mx) &                   !-- no inter-node transpose needed
223                ,   nz*3, 1_8, 1_8        &                                 !-- note: only 3 of 4 words needed
224                ,   mx, nz*4, nz*4*my &
225                ,   my, nz*4*mx, nz*4 )
226
227#define RECEIVE
228#ifdef RECEIVE
229
230    do stage = 1, num_nodes-1
231        i = 1 + mod( my_node-1+stage, num_nodes )
232        bufr_X_Y(:,:,:,:) = u(:,:,:,1+(my_node-1)*my:my_node*my)[i]         !-- inter-node transpose to buffer
233        call copy3 ( bufr_X_Y, ur(1,1,first_y,1+(i-1)*mx)  &                !-- intra-node transpose from buffer
234                        ,   nz*3, 1_8, 1_8        &                             !-- note: only 3 of 4 words needed
235                        ,   mx, nz*4, nz*4*my &
236                        ,   my, nz*4*mx, nz*4 )
237    end do
238
239#else
240
241    do stage = 1, num_nodes-1
242        i = 1 + mod( my_node-1+stage, num_nodes )
243        call  copy3 ( u(1,1,first_x,1+(i-1)*my), bufr_Y_X   &        !-- intra-node transpose to buffer
244                    ,   nz*3, 1_8, 1_8        &
245                    ,   mx, nz*4, nz*4*my &
246                    ,   my, nz*4*mx, nz*4 )
247        ur(:,:,:,1+(my_node-1)*mx:my_node*mx)[i] = bufr_Y_X(:,:,:,:)        !-- inter-node transpose from buffer
248    end do
249
250#endif
251
252    sync all     !--  wait for other nodes to finish transpose
253    call cpu_time(tmp)
254    tran_time = tran_time + tmp
255
256 end  subroutine transpose_X_Y
257
258!-------------   out-of-place transpose data_r --> data_s  ----------------------------
259
260subroutine transpose_Y_X
261    use run_size
262    implicit none
263
264    integer(int64) :: i, stage
265    real(real64) :: tmp
266
267    sync all   !--  wait for other nodes to finish compute
268    call cpu_time(tmp)
269    tran_time = tran_time - tmp
270
271    call copy3 (   ur(1,1,first_y,1+(my_node-1)*mx) &                   !-- intra-node transpose
272                ,   u(1,1,first_x,1+(my_node-1)*my) &                   !-- no inter-node transpose needed
273                ,   nz*4, 1_8, 1_8        &                                 !-- note: all 4 words needed
274                ,   my, nz*4, nz*4*mx &
275                ,   mx, nz*4*my, nz*4 )
276
277#define RECEIVE
278#ifdef RECEIVE
279
280    do stage = 1, num_nodes-1
281        i = 1 + mod( my_node-1+stage, num_nodes )
282        bufr_Y_X(:,:,:,:) = ur(:,:,:,1+(my_node-1)*mx:my_node*mx)[i]        !-- inter-node transpose to buffer
283        call copy3 ( bufr_Y_X, u(1,1,first_x,1+(i-1)*my)  &                 !-- intra-node transpose from buffer
284                    ,   nz*4, 1_8, 1_8        &
285                    ,   my, nz*4, nz*4*mx &
286                    ,   mx, nz*4*my, nz*4 )
287    end do
288
289#else
290
291    do stage = 1, num_nodes-1
292        i = 1 + mod( my_node-1+stage, num_nodes )
293        call copy3 ( ur(1,1,first_y,1+(i-1)*mx), bufr_X_Y  &                 !-- intra-node transpose from buffer
294                    ,   nz*4, 1_8, 1_8        &
295                    ,   my, nz*4, nz*4*mx &
296                    ,   mx, nz*4*my, nz*4 )
297        u(:,:,:,1+(my_node-1)*my:my_node*my)[i] = bufr_X_Y(:,:,:,:)        !-- inter-node transpose from buffer
298    end do
299
300#endif
301
302    sync all     !--  wait for other nodes to finish transpose
303    call cpu_time(tmp)
304    tran_time = tran_time + tmp
305
306 end  subroutine transpose_Y_X
307
308
309end program coarray_distributed_transpose
310