!
! Copyright (C) 1996-2016 The SIESTA group
!  This file is distributed under the terms of the
!  GNU General Public License: see COPYING in the top directory
!  or http://www.gnu.org/copyleft/gpl.txt.
! See Docs/Contributors.txt for a list of contributors.
!
      subroutine initparallel( no_u, na_u, lasto, xa,
     &                         ucell, rmaxh )
C
C  Initialises parallel parameters
C
C  Julian Gale, NRI, Curtin Uni, March 2004
C
      use precision
      use fdf
      use siesta_options, only: want_domain_decomposition
      use siesta_options, only: want_spatial_decomposition
      use siesta_options, only: isolve, SOLVE_ORDERN, SOLVE_PEXSI
      use siesta_options, only: rcoor
      use parallel,     only : Node, Nodes, BlockSize, ProcessorY
      use parallelsubs, only : WhichNodeOrb, GetNodeOrbs
      use parallelsubs, only : LocalToGlobalOrb, GlobalToLocalOrb
      use parallelsubs, only : pexsi_dist, pexsi_bs
      use spatial
      use sys
      use alloc,        only : re_alloc
      use domain_decom, only : use_dd, use_dd_perm, preSetOrbitLimits
      use parallelsubs, only : set_processorY, set_processorYdefault
#ifdef MPI
      use parallelsubs, only : set_BlockSizeDefault
      use mpi_siesta,   only : MPI_Comm_World, MPI_Comm_Self
#endif
      use class_OrbitalDistribution
      use sparse_matrices, only : block_dist, single_dist

      implicit none

#ifdef MPI
      integer, external :: numroc
#endif
C
C  Passed variables
C
      integer  :: no_u
      integer  :: na_u
      integer  :: lasto(0:na_u)
      real(dp) :: xa(3,na_u)
      real(dp) :: ucell(3,3)
      real(dp) :: rmaxh
C
C  Local variables
C
#ifdef MPI
      integer  :: blocksizedefault
      integer  :: procYdefault
      integer  :: procYval
      integer  :: n1, nn
#endif
      integer  :: ii, j, LOrb, GOrb, iu
      integer  :: maxorb
      integer  :: ncell(3)
      integer  :: ncelltotal
      logical  :: lspatialok
      logical, save :: first = .true.
      real(dp) :: acell
      real(dp) :: bcell
      real(dp) :: ccell
      real(dp) :: cellmin
      real(dp) :: cscale
      real(dp) :: alpha
      real(dp) :: bbeta
      real(dp) :: gamma
      real(dp) :: degtorad
      real(dp) :: rcmax
      real(dp) :: rcmaxopt
      real(dp), save :: tiny = 1.0d-6

#ifdef DEBUG
      call write_debug( ' PRE initparallel' )
#endif

C  Initialise on first call
      if (first) then
        nullify(natomsNode)
        nullify(natomsL2G)
        nullify(natomsG2L)
        nullify(ncellnodeptr)
        nullify(lbuffercell)
        nullify(nL2G)
        nullify(nG2L)
        nullify(nNode)
        nullify(nOrbPerNode)
        first = .false.
      endif

C  Set spatial decomposition flag according to SCF method

      lspatial = .false.
#ifdef MPI
      if (isolve.eq.SOLVE_ORDERN) then
        if (want_domain_decomposition) then
          use_dd = .true.
          use_dd_perm = .false.
          lspatial = .false.
        else
          lspatial = want_spatial_decomposition
        endif
      else
        lspatial = .false.
      endif
#endif

C  Spatial decomposition cut-off radius
      rspatial = fdf_physical('RcSpatial',0.0d0,'Bohr')

C  Processor grid
      npgrid(1) = fdf_integer('ProcessorGridX',1)
      npgrid(2) = fdf_integer('ProcessorGridY',1)
      npgrid(3) = fdf_integer('ProcessorGridZ',1)

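C  RcSpatial sets the target domain size used by the spatial
C  decomposition (rcmax below); when it is zero or absent, rcmax
C  falls back to max(rmaxh,rcoor).  Of the ProcessorGrid values,
C  only npgrid(2) is used within this routine, to set ProcessorY
C  when the spatial decomposition is active.
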
C  Set ProcessorY
#ifdef MPI
      call set_processorYdefault(Nodes,procYdefault)
      procYval = fdf_integer('processorY',procYdefault)
      ! Sanity check
      if (procYval > Nodes) then
        if (Node.eq.0) then
          write(6,'(a)')
     .      '* Warning: ProcessorY > Nodes in fdf file. '//
     $      'Reset to estimated optimal value.'
        endif
        call set_processorY(procYdefault)
      else
        call set_processorY(procYval)
      endif

#else
      call set_processorY(1)
#endif

C  Override spatial cut-off
      if (lspatial.and.rspatial.gt.0.0d0) then
        rcmax = rspatial
      else
        rcmax = max(rmaxh,rcoor)

C  Check that this will lead to a reasonable number of domains
C  relative to the number of processors, and modify it if not
        ncell(1) = abs(ucell(1,1)/rcmax) + 1
        ncell(2) = abs(ucell(2,2)/rcmax) + 1
        ncell(3) = abs(ucell(3,3)/rcmax) + 1
        ncelltotal = ncell(1)*ncell(2)*ncell(3)
        if (ncelltotal.lt.Nodes) then
          cscale = dble(2*Nodes)/dble(ncelltotal)
          cscale = cscale**(1.0d0/3.0d0)
          ncell(1) = ncell(1)*cscale + 1
          ncell(2) = ncell(2)*cscale + 1
          ncell(3) = ncell(3)*cscale + 1
          rcmax = (ucell(1,1)-tiny)/dble(ncell(1))
          rcmax = min(rcmax,(ucell(2,2)-tiny)/dble(ncell(2)))
          rcmax = min(rcmax,(ucell(3,3)-tiny)/dble(ncell(3)))
        endif

        rcmaxopt = rcmax
      endif

C  Find cell parameters
      degtorad = 4.0d0*atan(1.0d0)/180.0d0
      call uncell(ucell,acell,bcell,ccell,alpha,bbeta,gamma,degtorad)

C  Check that cut-off is less than cell parameter
      cellmin = min(acell, bcell, ccell)
      if (cellmin.lt.2.0_dp*rcmax) then
        ! How big do we want the cells to be?
        ! Ideally, each node should have either one or zero atoms,
        ! which allows for the best division of cells to processors
        ! (which rather suggests an alternative strategy, of course).
        ! The cells should be small enough that there is a reasonable
        ! division of cells per node, but not so small that the
        ! accounting arrays become excessively long.  Therefore,
        ! arbitrarily (for the moment):
        if (Nodes<16) then
          rcmax = 0.5_dp*cellmin/dble(Nodes)
        else
          rcmax = cellmin/32._dp
        endif
      endif

      if (.not. lspatial) then
C  Set BlockSize
#ifdef MPI
        pexsi_dist = .false.
        call set_blocksizedefault(Nodes,no_u,blocksizedefault)
        BlockSize = fdf_integer('blocksize',blocksizedefault)
        call newDistribution(blocksize,MPI_Comm_World,block_dist)
#else
        BlockSize = 8
        call newDistribution(blocksize,-1,block_dist)
#endif
      endif

      ! Create single-node distribution
#ifdef MPI
      call newDistribution(no_u,MPI_Comm_Self,single_dist,
     &                     name='Single-dist')
#else
      call newDistribution(no_u,-1,single_dist,
     &                     name='Singledist')
#endif

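C  numroc is the ScaLAPACK helper that returns how many
C  block-cyclically distributed orbitals a given process owns;
C  process 0 holds the largest share and process Nodes-1 the
C  smallest, hence the (max,min) balance printed below.
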
#ifdef MPI
C  Output indication of parallel parameters
      if (Node.eq.0) then
        if (lspatial) then
          write(6,'(/,a,f8.4,/)')
     .      '* Spatial decomposition: Cutoff = ',rcmax
        else
          write(6,'(/,a,2i4,/)')
     .      '* ProcessorY, Blocksize: ', processorY, Blocksize
        endif
        n1 = numroc(no_u,Blocksize,0,0,Nodes)
        nn = numroc(no_u,Blocksize,Nodes-1,0,Nodes)
        write(6,"(/,a,2i6,/)")
     $    "* Orbital distribution balance (max,min):", n1, nn
      endif
#endif

C  If spatial perform allocation of atoms to processors
      if (lspatial) then

        call setspatial(na_u,xa,ucell,rcmax,lspatialok)
        if (.not.lspatialok) then
          call die('Spatial decomposition failed')
        endif
        call setatomnodes(na_u,lasto,Node,Nodes)
        if (Nodes.gt.1) then
          ProcessorY = npgrid(2)
        else
          ProcessorY = 1
        endif
        if (Node.eq.0) then
          write(6,'(/,a,i4,/)')
     .      '* ProcessorY ', processorY
        endif

      elseif ((isolve.eq.SOLVE_ORDERN) .AND. (.not. use_dd)) then
C
C  Build dummy lists linking orbitals with parallel structure
C
        maxorb = lasto(na_u)
        call re_alloc( nL2G, 1, maxorb, 1, Nodes, 'nL2G',
     &                 'initparallel' )
        call re_alloc( nG2L, 1, maxorb, 'nG2L', 'initparallel' )
        call re_alloc( nNode, 1, maxorb, 'nNode', 'initparallel' )
        call re_alloc( nOrbPerNode, 1, Nodes, 'nOrbPerNode',
     &                 'initparallel' )
C

        do j = 1, Nodes
          call GetNodeOrbs(maxorb,j-1,Nodes,nOrbPerNode(j))
        enddo
        nG2L(:) = 0
        do ii = 1,maxorb
          call GlobalToLocalOrb(ii,Node,Nodes,nG2L(ii))
          call WhichNodeOrb(ii, Nodes, nNode(ii))
          do j = 1, Nodes
            call LocalToGlobalOrb(ii,j-1,Nodes,nL2G(ii,j))
          enddo
        enddo

        if (node==0) then
          call io_assign(iu)
          open(unit=iu,file="BLOCK_INDEXES",form="formatted")
          write(iu,*) "nl2g"
          do Lorb = 1, no_u
            write(iu, "(i6,8i9)") Lorb, (nL2G(Lorb,j),j=1,Nodes)
          enddo
          write(iu,*) "nNode"
          do Gorb = 1, no_u
            write(iu, "(i6,i5)") Gorb, nNode(Gorb)
          enddo
          write(iu,*) "nG2L (node 0)"
          do Gorb = 1, no_u
            write(iu, "(i6,i5)") Gorb, nG2L(Gorb)
          enddo
          write(iu,*) "nOrbPerNode"
          do j = 1, Nodes
            write(iu, "(i6,i8)") j-1, nOrbPerNode(j)
          enddo
          call io_close(iu)
        endif

      endif

      if (use_dd) then
        call preSetOrbitLimits( no_u )
      endif

#ifdef DEBUG
      call write_debug( ' POS initparallel' )
#endif
      end subroutine initparallel