1! { dg-do compile } 2! { dg-options "-cpp -fcoarray=lib" } 3! PR 87397 - this used to generate an ICE. 4 5! Coarray Distributed Transpose Test 6! 7! Copyright (c) 2012-2014, Sourcery, Inc. 8! All rights reserved. 9! 10! Redistribution and use in source and binary forms, with or without 11! modification, are permitted provided that the following conditions are met: 12! * Redistributions of source code must retain the above copyright 13! notice, this list of conditions and the following disclaimer. 14! * Redistributions in binary form must reproduce the above copyright 15! notice, this list of conditions and the following disclaimer in the 16! documentation and/or other materials provided with the distribution. 17! * Neither the name of the Sourcery, Inc., nor the 18! names of its contributors may be used to endorse or promote products 19! derived from this software without specific prior written permission. 20! 21! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22! ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23! WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24! DISCLAIMED. IN NO EVENT SHALL SOURCERY, INC., BE LIABLE FOR ANY 25! DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 28! ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30! 31! Robodoc header: 32!****m* dist_transpose/run_size 33! NAME 34! run_size 35! SYNOPSIS 36! Encapsulate problem state, wall-clock timer interface, integer broadcasts, and a data copy. 37!****** 38!================== test transposes with integer x,y,z values =============================== 39module run_size 40 use iso_fortran_env 41 implicit none 42 43 integer(int64), codimension[*] :: nx, ny, nz 44 integer(int64), codimension[*] :: my, mx, first_y, last_y, first_x, last_x 45 integer(int64) :: my_node, num_nodes 46 real(real64), codimension[*] :: tran_time 47 48 49contains 50 51!****s* run_size/broadcast_int 52! NAME 53! broadcast_int 54! SYNOPSIS 55! Broadcast a scalar coarray integer from image 1 to all other images. 56!****** 57 subroutine broadcast_int( variable ) 58 integer(int64), codimension[*] :: variable 59 integer(int64) :: i 60 if( my_node == 1 ) then 61 do i = 2, num_nodes; variable[i] = variable; end do 62 end if 63 end subroutine broadcast_int 64 65subroutine copy3( A,B, n1, sA1, sB1, n2, sA2, sB2, n3, sA3, sB3 ) 66 implicit none 67 complex, intent(in) :: A(0:*) 68 complex, intent(out) :: B(0:*) 69 integer(int64), intent(in) :: n1, sA1, sB1 70 integer(int64), intent(in) :: n2, sA2, sB2 71 integer(int64), intent(in) :: n3, sA3, sB3 72 integer(int64) i,j,k 73 74 do k=0,n3-1 75 do j=0,n2-1 76 do i=0,n1-1 77 B(i*sB1+j*sB2+k*sB3) = A(i*sA1+j*sA2+k*sA3) 78 end do 79 end do 80 end do 81end subroutine copy3 82 83end module run_size 84 85!****e* dist_transpose/coarray_distributed_transpose 86! NAME 87! coarray_distributed_transpose 88! SYNOPSIS 89! This program tests the transpose routines used in Fourier-spectral simulations of homogeneous turbulence. 90! The data is presented to the physics routines as groups of y-z or x-z planes distributed among the images. 91! The (out-of-place) transpose routines do the x <--> y transposes required and consist of transposes within 92! data blocks (intra-image) and a transpose of the distribution of these blocks among the images (inter-image). 93! 94! Two methods are tested here: 95! RECEIVE: receive block from other image and transpose it 96! SEND: transpose block and send it to other image 97! 98! This code is the coarray analog of mpi_distributed_transpose. 99!****** 100 101program coarray_distributed_transpose 102 !(*********************************************************************************************************** 103 ! m a i n p r o g r a m 104 !***********************************************************************************************************) 105 use run_size 106 implicit none 107 108 complex, allocatable :: u(:,:,:,:)[:] ! u(nz,4,first_x:last_x,ny)[*] !(*-- ny = my * num_nodes --*) 109 complex, allocatable :: ur(:,:,:,:)[:] !ur(nz,4,first_y:last_y,nx/2)[*] !(*-- nx/2 = mx * num_nodes --*) 110 complex, allocatable :: bufr_X_Y(:,:,:,:) 111 complex, allocatable :: bufr_Y_X(:,:,:,:) 112 integer(int64) :: x, y, z, msg_size, iter 113 114 num_nodes = num_images() 115 my_node = this_image() 116 117 if( my_node == 1 ) then 118 !write(6,*) "nx,ny,nz : "; read(5,*) nx, ny, nz 119 nx=32; ny=32; nz=32 120 call broadcast_int( nx ); call broadcast_int( ny ); call broadcast_int( nz ); 121 end if 122 sync all !-- other nodes wait for broadcast! 123 124 125 if ( mod(ny,num_nodes) == 0) then; my = ny / num_nodes 126 else; write(6,*) "node ", my_node, " ny not multiple of num_nodes"; error stop 127 end if 128 129 if ( mod(nx/2,num_nodes) == 0) then; mx = nx/2 / num_nodes 130 else; write(6,*) "node ", my_node, "nx/2 not multiple of num_nodes"; error stop 131 end if 132 133 first_y = (my_node-1)*my + 1; last_y = (my_node-1)*my + my 134 first_x = (my_node-1)*mx + 1; last_x = (my_node-1)*mx + mx 135 136 allocate ( u(nz , 4 , first_x:last_x , ny) [*] ) !(*-- y-z planes --*) 137 allocate ( ur(nz , 4 , first_y:last_y , nx/2)[*] ) !(*-- x-z planes --*) 138 allocate ( bufr_X_Y(nz,4,mx,my) ) 139 allocate ( bufr_Y_X(nz,4,my,mx) ) 140 141 msg_size = nz*4*mx*my !-- message size (complex data items) 142 143!--------- initialize data u (mx y-z planes per image) ---------- 144 145 do x = first_x, last_x 146 do y = 1, ny 147 do z = 1, nz 148 u(z,1,x,y) = x 149 u(z,2,x,y) = y 150 u(z,3,x,y) = z 151 end do 152 end do 153 end do 154 155 tran_time = 0 156 do iter = 1, 2 !--- 2 transform pairs per second-order time step 157 158!--------- transpose data u -> ur (mx y-z planes to my x-z planes per image) -------- 159 160 ur = 0 161 162 call transpose_X_Y 163 164!--------- test data ur (my x-z planes per image) ---------- 165 166 do x = 1, nx/2 167 do y = first_y, last_y 168 do z = 1, nz 169 if ( real(ur(z,1,y,x)) /= x .or. real(ur(z,2,y,x)) /= y .or. real(ur(z,3,y,x)) /= z )then 170 write(6,fmt="(A,i3,3(6X,A,f7.3,i4))") "transpose_X_Y failed: image ", my_node & 171 , " X ",real(ur(z,1,y,x)),x, " Y ",real(ur(z,2,y,x)),y, " Z ", real(ur(z,3,y,x)),z 172 stop 173 end if 174 end do 175 end do 176 end do 177 178!--------- transpose data ur -> u (my x-z planes to mx y-z planes per image) -------- 179 180 u = 0 181 call transpose_Y_X 182 183!--------- test data u (mx y-z planes per image) ---------- 184 185 do x = first_x, last_x 186 do y = 1, ny 187 do z = 1, nz 188 if ( real(u(z,1,x,y)) /= x .or. real(u(z,2,x,y)) /= y .or. real(u(z,3,x,y)) /= z )then 189 write(6,fmt="(A,i3,3(6X,A,f7.3,i4))") "transpose_Y_X failed: image ", my_node & 190 , " X ",real(u(z,1,x,y)),x, " Y ",real(u(z,2,x,y)),y, " Z ", real(u(z,3,x,y)),z 191 stop 192 end if 193 end do 194 end do 195 end do 196 end do 197 198 sync all 199 if( my_node == 1 ) write(6,fmt="(A,f8.3)") "test passed: tran_time ", tran_time 200 201 deallocate ( bufr_X_Y ); deallocate ( bufr_Y_X ) 202 203!========================= end of main executable ============================= 204 205contains 206 207!------------- out-of-place transpose data_s --> data_r ---------------------------- 208 209 subroutine transpose_X_Y 210 211 use run_size 212 implicit none 213 214 integer(int64) :: i,stage 215 real(real64) :: tmp 216 217 sync all !-- wait for other nodes to finish compute 218 call cpu_time(tmp) 219 tran_time = tran_time - tmp 220 221 call copy3 ( u(1,1,first_x,1+(my_node-1)*my) & !-- intra-node transpose 222 , ur(1,1,first_y,1+(my_node-1)*mx) & !-- no inter-node transpose needed 223 , nz*3, 1_8, 1_8 & !-- note: only 3 of 4 words needed 224 , mx, nz*4, nz*4*my & 225 , my, nz*4*mx, nz*4 ) 226 227#define RECEIVE 228#ifdef RECEIVE 229 230 do stage = 1, num_nodes-1 231 i = 1 + mod( my_node-1+stage, num_nodes ) 232 bufr_X_Y(:,:,:,:) = u(:,:,:,1+(my_node-1)*my:my_node*my)[i] !-- inter-node transpose to buffer 233 call copy3 ( bufr_X_Y, ur(1,1,first_y,1+(i-1)*mx) & !-- intra-node transpose from buffer 234 , nz*3, 1_8, 1_8 & !-- note: only 3 of 4 words needed 235 , mx, nz*4, nz*4*my & 236 , my, nz*4*mx, nz*4 ) 237 end do 238 239#else 240 241 do stage = 1, num_nodes-1 242 i = 1 + mod( my_node-1+stage, num_nodes ) 243 call copy3 ( u(1,1,first_x,1+(i-1)*my), bufr_Y_X & !-- intra-node transpose to buffer 244 , nz*3, 1_8, 1_8 & 245 , mx, nz*4, nz*4*my & 246 , my, nz*4*mx, nz*4 ) 247 ur(:,:,:,1+(my_node-1)*mx:my_node*mx)[i] = bufr_Y_X(:,:,:,:) !-- inter-node transpose from buffer 248 end do 249 250#endif 251 252 sync all !-- wait for other nodes to finish transpose 253 call cpu_time(tmp) 254 tran_time = tran_time + tmp 255 256 end subroutine transpose_X_Y 257 258!------------- out-of-place transpose data_r --> data_s ---------------------------- 259 260subroutine transpose_Y_X 261 use run_size 262 implicit none 263 264 integer(int64) :: i, stage 265 real(real64) :: tmp 266 267 sync all !-- wait for other nodes to finish compute 268 call cpu_time(tmp) 269 tran_time = tran_time - tmp 270 271 call copy3 ( ur(1,1,first_y,1+(my_node-1)*mx) & !-- intra-node transpose 272 , u(1,1,first_x,1+(my_node-1)*my) & !-- no inter-node transpose needed 273 , nz*4, 1_8, 1_8 & !-- note: all 4 words needed 274 , my, nz*4, nz*4*mx & 275 , mx, nz*4*my, nz*4 ) 276 277#define RECEIVE 278#ifdef RECEIVE 279 280 do stage = 1, num_nodes-1 281 i = 1 + mod( my_node-1+stage, num_nodes ) 282 bufr_Y_X(:,:,:,:) = ur(:,:,:,1+(my_node-1)*mx:my_node*mx)[i] !-- inter-node transpose to buffer 283 call copy3 ( bufr_Y_X, u(1,1,first_x,1+(i-1)*my) & !-- intra-node transpose from buffer 284 , nz*4, 1_8, 1_8 & 285 , my, nz*4, nz*4*mx & 286 , mx, nz*4*my, nz*4 ) 287 end do 288 289#else 290 291 do stage = 1, num_nodes-1 292 i = 1 + mod( my_node-1+stage, num_nodes ) 293 call copy3 ( ur(1,1,first_y,1+(i-1)*mx), bufr_X_Y & !-- intra-node transpose from buffer 294 , nz*4, 1_8, 1_8 & 295 , my, nz*4, nz*4*mx & 296 , mx, nz*4*my, nz*4 ) 297 u(:,:,:,1+(my_node-1)*my:my_node*my)[i] = bufr_X_Y(:,:,:,:) !-- inter-node transpose from buffer 298 end do 299 300#endif 301 302 sync all !-- wait for other nodes to finish transpose 303 call cpu_time(tmp) 304 tran_time = tran_time + tmp 305 306 end subroutine transpose_Y_X 307 308 309end program coarray_distributed_transpose 310