!
! Copyright (C) 1996-2016	The SIESTA group
!  This file is distributed under the terms of the
!  GNU General Public License: see COPYING in the top directory
!  or http://www.gnu.org/copyleft/gpl.txt.
! See Docs/Contributors.txt for a list of contributors.
!
      subroutine initparallel( no_u, na_u, lasto, xa,
     &                         ucell, rmaxh )
C
C  Initialises the parallel distribution parameters: BlockSize,
C  ProcessorY and, where requested, the spatial or domain
C  decomposition of orbitals/atoms over the nodes.
C
C  Julian Gale, NRI, Curtin Uni, March 2004
C
      use precision
      use fdf
      use siesta_options, only: want_domain_decomposition
      use siesta_options, only: want_spatial_decomposition
      use siesta_options, only: isolve, SOLVE_ORDERN, SOLVE_PEXSI
      use siesta_options, only: rcoor
      use parallel,     only : Node, Nodes, BlockSize, ProcessorY
      use parallelsubs, only : WhichNodeOrb, GetNodeOrbs
      use parallelsubs, only : LocalToGlobalOrb, GlobalToLocalOrb
      use parallelsubs, only : pexsi_dist, pexsi_bs
      use spatial
      use sys
      use alloc,        only : re_alloc
      use domain_decom, only: use_dd, use_dd_perm, preSetOrbitLimits
      use parallelsubs, only : set_processorY, set_processorYdefault
#ifdef MPI
      use parallelsubs, only : set_BlockSizeDefault
      use mpi_siesta,   only: MPI_Comm_World, MPI_Comm_Self
#endif
      use class_OrbitalDistribution
      use sparse_matrices,     only : block_dist, single_dist

      implicit none

#ifdef MPI
      integer, external :: numroc
#endif
C
C  Passed variables
C
      integer       :: no_u           ! No. of orbitals in unit cell
      integer       :: na_u           ! No. of atoms in unit cell
      integer       :: lasto(0:na_u)  ! Last orbital of each atom
      real(dp)      :: xa(3,na_u)     ! Atomic coordinates
      real(dp)      :: ucell(3,3)     ! Unit cell vectors
      real(dp)      :: rmaxh          ! Max. interaction range of H
C
C  Local variables
C
#ifdef MPI
      integer       :: blocksizedefault
      integer       :: procYdefault
      integer       :: procYval
      integer       :: n1, nn
#endif
      integer       :: ii, j, LOrb, GOrb, iu
      integer       :: maxorb
      integer       :: ncell(3)
      integer       :: ncelltotal
      logical       :: lspatialok
      logical, save :: first = .true.
      real(dp)      :: acell
      real(dp)      :: bcell
      real(dp)      :: ccell
      real(dp)      :: cellmin
      real(dp)      :: cscale
      real(dp)      :: alpha
      real(dp)      :: bbeta
      real(dp)      :: gamma
      real(dp)      :: degtorad
      real(dp)      :: rcmax
      real(dp)      :: rcmaxopt
      real(dp), save:: tiny = 1.0d-6

#ifdef DEBUG
      call write_debug( '  PRE initparallel' )
#endif

C  Initialise on first call
      if (first) then
        nullify(natomsNode)
        nullify(natomsL2G)
        nullify(natomsG2L)
        nullify(ncellnodeptr)
        nullify(lbuffercell)
        nullify(nL2G)
        nullify(nG2L)
        nullify(nNode)
        nullify(nOrbPerNode)
        first = .false.
      endif

C Set the spatial/domain decomposition flags according to the chosen
C solution method (only the order-N solver uses them)


      lspatial = .false.
#ifdef MPI
      if (isolve.eq.SOLVE_ORDERN) then
         if (want_domain_decomposition) then
            use_dd = .true.
            use_dd_perm = .false.
            lspatial = .false.
         else
            lspatial = want_spatial_decomposition
         endif
      else
         lspatial = .false.
      endif
#endif
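C When neither flag is set the orbitals are simply distributed
C block-cyclically over the nodes (BlockSize is set further below)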

C Cut-off radius requested for the spatial decomposition
C (zero means none was given and a suitable value is derived below)

      rspatial = fdf_physical('RcSpatial',0.0d0,'Bohr')

C Processor grid
      npgrid(1) = fdf_integer('ProcessorGridX',1)
      npgrid(2) = fdf_integer('ProcessorGridY',1)
      npgrid(3) = fdf_integer('ProcessorGridZ',1)

C Set ProcessorY
#ifdef MPI
      call set_processorYdefault(Nodes,procYdefault)
      procYval = fdf_integer('processorY',procYdefault)
      ! Sanity check
      if (procYval > Nodes) then
         if (Node.eq.0) then
            write(6,'(a)')
     .       '* Warning: ProcessorY > Nodes in fdf file. '//
     $       'Reset to estimated optimal value.'
         endif
         call set_processorY(procYdefault)
      else
         call set_processorY(procYval)
      endif

#else
      call set_processorY(1)
#endif
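C ProcessorY sets the Y dimension of the 2D (Y x Z) processor grid
C used elsewhere in SIESTA, e.g. for distributing the real-space mesh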

C Use the user-supplied spatial cut-off if given, otherwise derive one
      if (lspatial.and.rspatial.gt.0.0d0) then
        rcmax = rspatial
      else
        rcmax = max(rmaxh,rcoor)
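C (rmaxh is the maximum interaction range of the Hamiltonian and
C rcoor the order-N localisation radius)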

C Check that this will lead to a reasonable number of domains relative
C to the number of processors; if not, rescale the domain grid
        ncell(1) = abs(ucell(1,1)/rcmax) + 1
        ncell(2) = abs(ucell(2,2)/rcmax) + 1
        ncell(3) = abs(ucell(3,3)/rcmax) + 1
        ncelltotal = ncell(1)*ncell(2)*ncell(3)
        if (ncelltotal.lt.Nodes) then
          cscale = dble(2*Nodes)/dble(ncelltotal)
          cscale = cscale**(1.0d0/3.0d0)
          ncell(1) = ncell(1)*cscale + 1
          ncell(2) = ncell(2)*cscale + 1
          ncell(3) = ncell(3)*cscale + 1
          rcmax = (ucell(1,1)-tiny)/dble(ncell(1))
          rcmax = min(rcmax,(ucell(2,2)-tiny)/dble(ncell(2)))
          rcmax = min(rcmax,(ucell(3,3)-tiny)/dble(ncell(3)))
        endif
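C Illustrative example (not computed here): with 64 nodes and an
C initial 2 x 2 x 2 = 8 domains, cscale = (128/8)**(1/3) ~ 2.52,
C giving 6 x 6 x 6 = 216 domains and a correspondingly smaller rcmax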

        rcmaxopt = rcmax
      endif

C Find cell parameters
      degtorad = 4.0d0*atan(1.0d0)/180.0d0
      call uncell(ucell,acell,bcell,ccell,alpha,bbeta,gamma,degtorad)
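C (only the cell lengths acell, bcell and ccell are used below; the
C angles are a by-product of uncell)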

C Check that the cut-off is at most half the smallest cell parameter;
C if not, shrink it
      cellmin = min(acell, bcell, ccell)
      if (cellmin.lt.2.0_dp*rcmax) then
        ! How big do we want the cells to be?
        !  Ideally, each node should have either
        ! one or zero atoms, which allows for the
        ! best division of cells to processors.
        !  (Which rather suggests an alternative
        ! strategy, of course. But anyway.)
        !  They should be small enough that there is
        ! a reasonable division of cells per node.
        !  However, they should not be so small that
        ! we have so many cells that our accounting
        ! arrays become stupidly long. Therefore
        ! I will arbitrarily decide (for the moment)
        ! that we should have:
        if (Nodes<16) then
          rcmax = 0.5_dp*cellmin/dble(Nodes)
        else
          rcmax = cellmin/32._dp
        endif
      endif
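C Example: cellmin = 20 Bohr on 8 nodes gives rcmax = 0.5*20/8 = 1.25;
C with 16 or more nodes rcmax is simply cellmin/32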

      if (.not. lspatial) then
C Set BlockSize
#ifdef MPI
        pexsi_dist = .false.
        call set_blocksizedefault(Nodes,no_u,blocksizedefault)
        BlockSize = fdf_integer('blocksize',blocksizedefault)
        call newDistribution(blocksize,MPI_Comm_World,block_dist)
#else
        BlockSize = 8
        call newDistribution(blocksize,-1,block_dist)
#endif
      endif
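C In the block-cyclic scheme consecutive blocks of BlockSize orbitals
C are dealt out to the nodes in round-robin order (ScaLAPACK style)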

      ! Create a serial distribution holding all no_u orbitals locally
#ifdef MPI
      call newDistribution(no_u,MPI_Comm_Self,single_dist,
     &     name='Single-dist')
#else
      call newDistribution(no_u,-1           ,single_dist,
     &     name='Singledist')
#endif


#ifdef MPI
C Report the parallel distribution parameters
      if (Node.eq.0) then
        if (lspatial) then
           write(6,'(/,a,f8.4,/)')
     .        '* Spatial decomposition: Cutoff = ',rcmax
        else
           write(6,'(/,a,2i4,/)')
     .        '* ProcessorY, Blocksize: ', processorY, Blocksize
        endif
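C numroc (ScaLAPACK) gives the number of orbitals held by a node in
C the block-cyclic distribution: node 0 holds the most and the last
C node the fewest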
        n1 = numroc(no_u,Blocksize,0,0,Nodes)
        nn = numroc(no_u,Blocksize,Nodes-1,0,Nodes)
        write(6,"(/,a,2i6,/)")
     $       "* Orbital distribution balance (max,min):", n1, nn
      endif
#endif

C If spatial, perform the allocation of atoms to processors
      if (lspatial) then

        call setspatial(na_u,xa,ucell,rcmax,lspatialok)
        if (.not.lspatialok) then
          call die('Spatial decomposition failed')
        endif
        call setatomnodes(na_u,lasto,Node,Nodes)
        if (Nodes.gt.1) then
          ProcessorY = npgrid(2)
        else
          ProcessorY = 1
        endif
        if (Node.eq.0) then
           write(6,'(/,a,i4,/)')
     .        '* ProcessorY ', processorY
        endif

      elseif ((isolve.eq.SOLVE_ORDERN) .AND. (.not. use_dd)) then
C
C  Build dummy lists linking orbitals with the parallel structure
C
        maxorb = lasto(na_u)
        call re_alloc( nL2G, 1, maxorb, 1, Nodes, 'nL2G',
     &                 'initparallel' )
        call re_alloc( nG2L, 1, maxorb, 'nG2L', 'initparallel' )
        call re_alloc( nNode, 1, maxorb, 'nNode', 'initparallel' )
        call re_alloc( nOrbPerNode, 1, Nodes, 'nOrbPerNode',
     &                 'initparallel' )
C

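C  nL2G(i,p)     : global index of local orbital i on node p-1
C  nNode(i)      : node that handles global orbital i
C  nG2L(i)       : local index on this node of global orbital i
C                  (0 if it is not handled here)
C  nOrbPerNode(p): number of orbitals handled by node p-1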
        do j = 1, Nodes
           call GetNodeOrbs(maxorb,j-1,Nodes,nOrbPerNode(j))
        enddo
        nG2L(:) = 0
        do ii = 1,maxorb
          call GlobalToLocalOrb(ii,Node,Nodes,nG2L(ii))
          call WhichNodeOrb(ii, Nodes, nNode(ii))
          do j = 1, Nodes
             call LocalToGlobalOrb(ii,j-1,Nodes,nL2G(ii,j))
          enddo
        enddo

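C  Node 0 dumps these tables to the file BLOCK_INDEXES for inspection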
         if (node==0) then
            call io_assign(iu)
            open(unit=iu,file="BLOCK_INDEXES",form="formatted")
            write(iu,*) "nl2g"
            do Lorb = 1, no_u
               write(iu, "(i6,8i9)") Lorb, (nL2G(Lorb,j),j=1,Nodes)
            enddo
            write(iu,*) "nNode"
            do Gorb = 1, no_u
               write(iu, "(i6,i5)") Gorb, nNode(Gorb)
            enddo
            write(iu,*) "nG2L (node 0)"
            do Gorb = 1, no_u
               write(iu, "(i6,i5)") Gorb, nG2L(Gorb)
            enddo
            write(iu,*) "nOrbPerNode"
            do j = 1, Nodes
               write(iu, "(i6,i8)") j-1, nOrbPerNode(j)
            enddo
            call io_close(iu)
         endif

      endif

      if (use_dd) then
        call preSetOrbitLimits( no_u )
      endif

#ifdef DEBUG
      call write_debug( '  POS initparallel' )
#endif
      end subroutine initparallel
