!
! Copyright (C) 1996-2016 The SIESTA group
!  This file is distributed under the terms of the
!  GNU General Public License: see COPYING in the top directory
!  or http://www.gnu.org/copyleft/gpl.txt.
! See Docs/Contributors.txt for a list of contributors.
!
      module parallelsubs
C
C  Parallelisation related subroutines
C
      use parallel,     only : BlockSize, ProcessorY
      use parallel,     only : thisNode => Node
      use spatial,      only : lspatial, nNode, nL2G, nG2L,
     &                         nOrbPerNode
      use domain_decom, only : use_dd, use_dd_perm, dd_nuo, dd_perm,
     &                         ulimit, llimit, dd_invp, dd_nnode
      use sys,          only : die
      implicit none

      logical, public, save :: pexsi_dist
      integer, public, save :: pexsi_bs

      contains

      subroutine set_processorY(n)
      integer, intent(in) :: n
      ProcessorY = n
      end subroutine set_processorY

      subroutine GetNodeOrbs( NOrb, Node, Nodes, NOrbNode)
      use parallel, only: thisNode => Node
C
C  Calculates the number of orbitals stored on the given Node.
C
C  Julian Gale, October 1998
C
C  Input :
C
C  integer NOrb     = The total number of orbitals in the calculation
C  integer Node     = The processor for which the count is required
C  integer Nodes    = The total number of processors
C
C  Output :
C
C  integer NOrbNode = The number of orbitals stored on the given Node
C
C  Modified so that value from last call is saved in case it can be
C  re-used on the next call to save re-calculation.
C
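C  Example (illustrative values only): with NOrb = 13, Nodes = 2 and
C  BlockSize = 4, the block-cyclic branch deals the orbitals out in
C  blocks of 4, so Node 0 holds orbitals 1-4 and 9-12 (NOrbNode = 8)
C  and Node 1 holds orbitals 5-8 and 13 (NOrbNode = 5).  In the
C  pexsi_dist case every node holds pexsi_bs orbitals except the
C  last, which also takes the remainder.
C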

C Passed arguments
      integer NOrb, NOrbNode, Node, Nodes

C Local variables
      integer Remainder, MinPerNode, RemainderBlocks, NOrbLast,
     .  NOrbNodeLast

C Saved variables
      save NOrbLast, NOrbNodeLast

      data
     .  NOrbLast / 0 /
     .  NOrbNodeLast / 0 /

!      if (thisNode /= Node)
!     $       call message("Non-local use of GetNodeOrbs")

      if (pexsi_dist) then
         Remainder = NOrb - pexsi_bs*Nodes
         if (Node == Nodes-1) then
            NOrbNode = pexsi_bs + Remainder
         else
            NOrbNode = pexsi_bs
         endif

         RETURN
      endif

      if (NOrb .eq. NOrbLast) then
C Values are the same as last call - no need to recalculate
        NOrbNode = NOrbNodeLast

      else

        if (lspatial) then
C-------------------------
C  Spatial distribution  -
C-------------------------
          NOrbNode = nOrbPerNode(Node+1)

        else if (use_dd) then
           if (use_dd_perm) then
              !  NOrbNode = dd_nuo
              ! We have this structure also
              NOrbNode = nOrbPerNode(Node+1)
           else
              if (thisNode /= Node)
     $             call die("Wrong use of dd GetNodeOrbs")
              NOrbNode = dd_nuo
           endif
        else
C-----------------------------
C  Blockcyclic distribution  -
C-----------------------------
C Calculate the minimum number of orbitals per node
          MinPerNode = NOrb / (Nodes*BlockSize)

C Find the remainder of unassigned orbitals
          Remainder = NOrb - MinPerNode * Nodes * BlockSize

C Find number of complete blocks in the remainder
          RemainderBlocks = Remainder / BlockSize
          Remainder = Remainder - RemainderBlocks * BlockSize

C Work out the local number of orbitals
          NOrbNode = MinPerNode*BlockSize
          if (Node.lt.RemainderBlocks) NOrbNode = NOrbNode + BlockSize
          if (Node.eq.RemainderBlocks) NOrbNode = NOrbNode + Remainder
        endif

C Save value for next call
        NOrbNodeLast = NOrbNode

      endif

      end subroutine GetNodeOrbs

      subroutine GlobalToLocalOrb( GOrb, Node, Nodes, LOrb)
      use parallel, only: thisNode => Node
C
C  Converts an orbital index in the global frame to the local frame
C  if the orbital is local to this node. Otherwise the pointer is
C  returned as zero and can therefore be used to test whether the
C  orbital is local or not.
C
C  Julian Gale, Imperial College, November 1998
C
C  Input :
C
C  integer GOrb   = global orbital index
C  integer Node   = local processor number
C  integer Nodes  = global number of processors
C
C  From parallel.h :
C
C  integer BlockSize = blocking size for orbital distribution across
C                      the nodes. Choice of value affects the
C                      performance of the Scalapack routines
C
C  Output :
C
C  integer LOrb   = local orbital index
C
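C  Example (illustrative values only): with BlockSize = 4 and
C  Nodes = 2, global orbital GOrb = 10 falls in global block 2
C  (element 2 of the block), which Node 0 stores as its local
C  block 1, giving LOrb = 1*4 + 2 = 6 on Node 0.  The same call
C  on Node 1 fails the consistency check and returns LOrb = 0.
C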

      integer GOrb, Node, Nodes, LOrb

C  Local variables
      integer OrbCheck, GBlock, LEle, LBlock

!      if (thisNode /= Node)
!     $       call message("Non-local use of GlobalToLocalOrb")

      if (pexsi_dist) then
         call WhichNodeOrb(GOrb,Nodes,OrbCheck)
         if (OrbCheck == Node) then
            LOrb = GOrb - pexsi_bs*Node
         else
            LOrb = 0
         endif

      else if (lspatial) then
C-------------------------
C  Spatial distribution  -
C-------------------------
        if (Node.eq.nNode(GOrb)) then
          LOrb = nG2L(GOrb)
        else
          LOrb = 0
        endif
      else if (use_dd) then
        if (use_dd_perm) then
          ! LOrb = dd_perm(GOrb)
           LOrb = nG2L(GOrb)
        else
          if (GOrb.ge.llimit .and. GOrb.lt.ulimit) then
            LOrb = GOrb - llimit + 1
          else
            LOrb = 0
          endif
        endif
      else
C-----------------------------
C  Blockcyclic distribution  -
C-----------------------------
C  Find global block number
        GBlock = ((GOrb -1)/BlockSize)

C  Subtract global baseline to find element number within the block
        LEle = GOrb - GBlock*BlockSize

C  Find the block number on the local node
        LBlock = ((GBlock - Node)/Nodes)

C  Generate the local orbital pointer based on the local block number
        LOrb = LEle + LBlock*BlockSize

C  Check that this is consistent - if it is not then this
C  local orbital is not on this node and so we return 0
C  to indicate this.
        OrbCheck = (LBlock*Nodes + Node)*BlockSize + LEle
        if (OrbCheck.ne.GOrb) LOrb = 0
      endif

      end  subroutine GlobalToLocalOrb


      subroutine LocalToGlobalOrb( LOrb, Node, Nodes, GOrb)
      use parallel, only: thisNode => Node
C
C  Converts an orbital index in the local frame to the global frame
C
C  Julian Gale, Imperial College, December 1998
C
C  Input :
C
C  integer LOrb   = local orbital index
C  integer Node   = local processor number
C  integer Nodes  = global number of processors
C
C  From parallel.h :
C
C  integer BlockSize = blocking size for orbital distribution across
C                      the nodes. Choice of value affects the
C                      performance of the Scalapack routines
C
C  Output :
C
C  integer GOrb   = global orbital index
C
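C  Example (illustrative values only): with BlockSize = 4 and
C  Nodes = 2, local orbital LOrb = 6 on Node 0 lies in local
C  block 1 (element 2), so GOrb = (1*2 + 0)*4 + 2 = 10, i.e. the
C  inverse of the GlobalToLocalOrb example.
C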

      integer LOrb, Node, Nodes, GOrb

C  Local variables
      integer LEle, LBlock

!      if (thisNode /= Node)
!     $       call message("Non-local use of LocalToGlobalOrb")

      if (pexsi_dist) then
         GOrb = pexsi_bs*Node + LOrb

      else if (lspatial) then
C-------------------------
C  Spatial distribution  -
C-------------------------
        GOrb = nL2G(LOrb,Node+1)

      else if (use_dd) then
        if (use_dd_perm) then
          !  GOrb = dd_invp(LOrb)
          ! We also have this structure...
           GOrb = nL2G(LOrb,Node+1)
        else
           if (thisNode /= Node)
     $          call die("Wrong use of dd LocalToGlobalOrb")

          GOrb = LOrb + llimit - 1
        endif
      else
C-----------------------------
C  Blockcyclic distribution  -
C-----------------------------
C  Find local block number
        LBlock = ((LOrb -1)/BlockSize)

C  Subtract local baseline to find element number within the block
        LEle = LOrb - LBlock*BlockSize

C  Calculate global index
        GOrb = (LBlock*Nodes + Node)*BlockSize + LEle
      endif
      return
      end  subroutine LocalToGlobalOrb

      subroutine WhichNodeOrb( GOrb, Nodes, Node)
C
C  Given the global orbital pointer, this routine
C  returns the Node number where this is stored.
C
C  Julian Gale, Imperial College, January 1999
C
C  Input :
C
C  integer GOrb   = global orbital index
C  integer Nodes  = total number of Nodes
C
C  From parallel.h :
C
C  integer BlockSize = blocking size for orbital distribution across
C                      the nodes. Choice of value affects the
C                      performance of the Scalapack routines
C
C  Output :
C
C  integer Node   = Node where this orbital is stored locally
C
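C  Example (illustrative values only): with BlockSize = 4 and
C  Nodes = 2, GOrb = 10 lies in global block (10-1)/4 = 2, and
C  mod(2,2) = 0, so the orbital is stored on Node 0.
C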
      integer GOrb, Node, Nodes

C  Local variables
      integer GBlock

      if (pexsi_dist) then
         Node = (GOrb-1)/pexsi_bs
         ! Check for overflow: if GOrb is in the "remainder" set
         ! (the extra orbitals held by the last node), the formula
         ! above gives a node index beyond Nodes-1, so clamp it.
         ! (Consider norbs=13, pexsi_bs=2, nodes=5: GOrb=11..13
         !  would give 5 or 6, but must map to node 4.)
         if (Node > Nodes-1) Node = Nodes-1

      else if (lspatial) then
C-------------------------
C  Spatial distribution  -
C-------------------------
        Node = nNode(GOrb)
      else if (use_dd) then
         if (.not. use_dd_perm) then
            call die("WhichNodeOrb not ready in DD")
         endif
        Node = dd_nnode(GOrb)
        !! Node = nNode(GOrb)
      else
C-----------------------------
C  Blockcyclic distribution  -
C-----------------------------
C  Find global block number
        GBlock = ((GOrb -1)/BlockSize)

C  Find the Node number that has this block
        Node = mod(GBlock,Nodes)
      endif

      return
      end  subroutine WhichNodeOrb


      subroutine WhichMeshNode( GMesh, Nxyz, Nodes, Node)
C
C  Given the global mesh pointer, this routine
C  returns the Node number where this is stored.
C
C  Julian Gale, Imperial College, March 1999
C
C  Input :
C
C  integer GMesh  = global mesh index
C  integer Nxyz(3)= number of grid points along each axis
C  integer Nodes  = total number of Nodes
C
C  Output :
C
C  integer Node   = Node where this mesh point is stored locally
C
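C  Example (illustrative values only): for Nxyz = (10,10,10) the
C  global index GMesh = 123 decodes to z-plane Gz = 122/100 + 1 = 2
C  and y-row Gy = 3; the x index is set to 1 because the mesh is
C  only distributed along y and z.
C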

C  Passed variables
      integer GMesh, Nxyz(3), Node, Nodes

C  Local variables
      integer
     .  Gy, Gz, Mesh(3)

C  Trap zero mesh case
      if (Nxyz(1)*Nxyz(2)*Nxyz(3).eq.0) then
        Node = 0
        return
      endif

C  Find grid indices along each axis
      Gz = ((GMesh-1)/(Nxyz(1)*Nxyz(2))) + 1
      Gy = GMesh - (Gz-1)*Nxyz(1)*Nxyz(2)
      Gy = ((Gy-1)/Nxyz(1))+1
      Mesh(1) = 1
      Mesh(2) = Gy
      Mesh(3) = Gz

C  Call subroutine to find node number from global XYZ index
      call WhichMeshXYZNode( Mesh, Nxyz, Nodes, Node)

      end subroutine WhichMeshNode

      subroutine HowManyMeshPerNode(Nxyz, Node, Nodes, NMeshPN, NxyzL)
C
C  Given the total dimensions of the grid of points, this routine
C  works out how many of the points are on the current node.
C
C  Julian Gale, Imperial College, March 1999
C
C  Input :
C
C  integer Nxyz(3) = dimensions of grid of points
C  integer Node    = local processor number
C  integer Nodes   = global number of processors
C
C  From parallel.h :
C
C  integer ProcessorY = second dimension of processor grid -
C                      must be a factor of the number of processors
C                      being used.
C
C  Output :
C
C  integer NMeshPN  = total number of local mesh points
C  integer NxyzL(3) = local dimensions of mesh points on this node
C
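C  Example (illustrative values only): for Nxyz = (10,7,5),
C  Nodes = 4 and ProcessorY = 2 (so ProcessorZ = 2), the basic
C  block sizes are 3 along y and 2 along z, and the remainders
C  (1 and 1) go to the first processor row and column.  Node 0
C  therefore gets NxyzL = (10,4,3), i.e. NMeshPN = 120 points,
C  while Node 3 gets NxyzL = (10,3,2), i.e. 60 points.
C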

      implicit none
C  Passed variables
      integer Nxyz(3), Node, Nodes, NMeshPN, NxyzL(3)

C  Local variables
      integer
     .  BlockSizeY, BlockSizeZ, NRemY, NRemZ, Py, Pz,
     .  ProcessorZ

C  Check that ProcessorY is a factor of the number of processors
      if (mod(Nodes,ProcessorY).gt.0)
     $     call die('ERROR: ProcessorY must be a factor of the' //
     $     ' number of processors!')
      ProcessorZ = Nodes/ProcessorY

C  Find processor grid location
      Py = (Node/ProcessorZ) + 1
      Pz = Node - (Py - 1)*ProcessorZ + 1

C  Set blocking sizes
      BlockSizeY = (Nxyz(2)/ProcessorY)
      BlockSizeZ = (Nxyz(3)/ProcessorZ)

      NRemY = Nxyz(2) - BlockSizeY*ProcessorY
      NRemZ = Nxyz(3) - BlockSizeZ*ProcessorZ
      if (Py-1.lt.NRemY) BlockSizeY = BlockSizeY + 1
      if (Pz-1.lt.NRemZ) BlockSizeZ = BlockSizeZ + 1

C  Trap zero grid case
      if (BlockSizeY.eq.0.or.BlockSizeZ.eq.0) then
        NMeshPN = 0
        NxyzL(1) = Nxyz(1)
        NxyzL(2) = Nxyz(2)
        NxyzL(3) = Nxyz(3)
        return
      endif

C  Assign blocksizes as local grid dimensions
      NxyzL(1) = Nxyz(1)
      NxyzL(2) = BlockSizeY
      NxyzL(3) = BlockSizeZ

C  Calculate total number of grid points by multiplying dimensions
      NMeshPN = NxyzL(1)*NxyzL(2)*NxyzL(3)

      return
      end  subroutine HowManyMeshPerNode


      subroutine GlobalToLocalMesh( GMesh, Nxyz, Node, Nodes, LMesh)
C
C  Converts a mesh point index in the global frame to the local frame
C  if the point is local to this node. Otherwise the pointer is
C  returned as zero and can therefore be used to test whether the
C  point is local or not.
C
C  Julian Gale, Imperial College, January 1999
C
C  Input :
C
C  integer GMesh   = global mesh point index
C  integer Nxyz(3) = dimensions of grid of points
C  integer Node    = local processor number
C  integer Nodes   = global number of processors
C
C  Output :
C
C  integer LMesh   = local mesh point index
C
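C  Example (illustrative values only): for Nxyz = (10,7,5),
C  Nodes = 4 and ProcessorY = 2, the global point GMesh = 251
C  decodes to the grid reference (1,5,4), which is owned by
C  Node 3 (see WhichMeshXYZNode); there it maps to the local
C  reference (1,1,1), so LMesh = 1, while on any other node the
C  routine returns LMesh = 0.
C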

C  Passed variables
      integer GMesh, Nxyz(3), Node, Nodes, LMesh

C  Local variables
      integer
     .  Mesh(3), MeshL(3), NxyzL(3), Gx, Gy, Gz, NMeshPN

C  Trap zero mesh size case
      if (Nxyz(1)*Nxyz(2)*Nxyz(3).eq.0) then
        LMesh = GMesh
        return
      endif

C  Find grid indices along each axis
      Gz = ((GMesh-1)/(Nxyz(1)*Nxyz(2))) + 1
      Gx = GMesh - (Gz-1)*Nxyz(1)*Nxyz(2)
      Gy = ((Gx-1)/Nxyz(1))+1
      Gx = Gx - (Gy-1)*Nxyz(1)
      Mesh(1) = Gx
      Mesh(2) = Gy
      Mesh(3) = Gz

C  Call subroutine to find local mesh pointer based on global XYZ grid
      call GlobalToLocalXYZMesh( Mesh, Nxyz, Node, Nodes, MeshL)

C  If MeshL = 0 then this point is not on this node -> return
      if (MeshL(1)*MeshL(2)*MeshL(3).eq.0) then
        LMesh = 0
        return
      endif

C  Find dimensions of local mesh so that we can work out pointer
      call HowManyMeshPerNode(Nxyz, Node, Nodes, NMeshPN, NxyzL)

C  Generate the local mesh pointer based on the local element and blocks
      LMesh = (MeshL(3)-1)*(Nxyz(1)*NxyzL(2)) +
     .        (MeshL(2)-1)*Nxyz(1) + Gx

      end  subroutine GlobalToLocalMesh


      subroutine WhichMeshXYZNode( Mesh, Nxyz, Nodes, Node)
C
C  Given the global mesh pointer, this routine
C  returns the Node number where this is stored.
C
C  Julian Gale, Imperial College, March 1999
C
C  Input :
C
C  integer Mesh(3) = global mesh point by grid reference
C  integer Nxyz(3) = number of grid points along each axis
C  integer Nodes   = total number of Nodes
C
C  From parallel.h :
C
C  integer ProcessorY = second dimension of processor grid -
C                      must be a factor of the number of processors
C                      being used.
C
C  Output :
C
C  integer Node   = Node where this mesh point is stored locally
C
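C  Example (illustrative values only): for Nxyz = (10,7,5),
C  Nodes = 4 and ProcessorY = 2 (ProcessorZ = 2), the y axis is
C  split into rows 1-4 / 5-7 and the z axis into planes 1-3 / 4-5,
C  so the point Mesh = (1,5,4) lies in processor row PGy = 2 and
C  column PGz = 2, i.e. Node = (2-1)*2 + 2 - 1 = 3.
C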

C  Passed variables
      integer Mesh(3), Nxyz(3), Node, Nodes

C  Local variables
      integer
     .  ProcessorZ, PGy, PGz, BlockSizeY, BlockSizeZ,
     .  GSplitY, GSplitZ, NRemY, NRemZ

C  Trap zero mesh size case
      if (Nxyz(1)*Nxyz(2)*Nxyz(3).eq.0) then
        Node = 0
        return
      endif

C  Check that ProcessorY is a factor of the number of processors
      if (mod(Nodes,ProcessorY).gt.0)
     $     call die('ERROR: ProcessorY must be a factor of the' //
     $     ' number of processors!')

      ProcessorZ = Nodes/ProcessorY

C  Find blocksizes along axes
      BlockSizeY = (Nxyz(2)/ProcessorY)
      BlockSizeZ = (Nxyz(3)/ProcessorZ)
      NRemY = Nxyz(2) - BlockSizeY*ProcessorY
      NRemZ = Nxyz(3) - BlockSizeZ*ProcessorZ

C  Find grid point where the change from BlockSize+1 to BlockSize happens
      GSplitY = NRemY*(BlockSizeY + 1)
      GSplitZ = NRemZ*(BlockSizeZ + 1)

C  Find processor grid coordinates for this point
      if (Mesh(2).le.GSplitY) then
        PGy = (Mesh(2) - 1)/(BlockSizeY + 1) + 1
      else
        PGy = (Mesh(2) - 1 - GSplitY)/(BlockSizeY) + NRemY + 1
      endif
      if (Mesh(3).le.GSplitZ) then
        PGz = (Mesh(3) - 1)/(BlockSizeZ + 1) + 1
      else
        PGz = (Mesh(3) - 1 - GSplitZ)/(BlockSizeZ) + NRemZ + 1
      endif

C  Calculate processor number for this grid point
      Node = (PGy - 1)*ProcessorZ + PGz - 1

      return
      end  subroutine WhichMeshXYZNode


      subroutine GlobalToLocalXYZMesh( Mesh, Nxyz, Node, Nodes, LMesh)
C
C  Converts a mesh point grid reference from the global frame to the
C  local frame if the point is local to this node. Otherwise the
C  pointer is returned as zero and can therefore be used to test
C  whether the point is local or not.
C
C  Julian Gale, Imperial College, January 1999
C
C  Input :
C
C  integer Mesh(3) = global mesh point grid reference
C  integer Nxyz(3) = dimensions of grid of points
C  integer Node    = local processor number
C  integer Nodes   = global number of processors
C
C  From parallel.h :
C
C  integer ProcessorY = second dimension of processor grid -
C                      must be a factor of the number of processors
C                      being used.
C
C  Output :
C
C  integer LMesh(3) = local mesh point grid reference
C
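C  Example (illustrative values only): for Nxyz = (10,7,5),
C  Nodes = 4 and ProcessorY = 2, the global point Mesh = (1,5,4)
C  is owned by Node 3, whose subgrid starts at (1,5,4), so on
C  Node 3 the result is LMesh = (1,1,1), while every other node
C  gets LMesh = (0,0,0).
C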

C  Passed variables
      integer Mesh(3), Nxyz(3), Node, Nodes, LMesh(3)

C  Local variables
      integer
     .  ProcessorZ, Py, Pz, PGy, PGz, BlockSizeY, BlockSizeZ,
     .  GSplitY, GSplitZ, NRemY, NRemZ

C  Trap zero mesh size case
      if (Nxyz(1)*Nxyz(2)*Nxyz(3).eq.0) then
        LMesh(1) = Mesh(1)
        LMesh(2) = Mesh(2)
        LMesh(3) = Mesh(3)
        return
      endif

C  Check that ProcessorY is a factor of the number of processors
      if (mod(Nodes,ProcessorY).gt.0)
     $     call die('ERROR: ProcessorY must be a factor of the' //
     $     ' number of processors!')
      ProcessorZ = Nodes/ProcessorY

C  Find processor grid location
      Py = (Node/ProcessorZ) + 1
      Pz = Node - (Py - 1)*ProcessorZ + 1

C  Find blocksizes along axes
      BlockSizeY = (Nxyz(2)/ProcessorY)
      BlockSizeZ = (Nxyz(3)/ProcessorZ)
      NRemY = Nxyz(2) - BlockSizeY*ProcessorY
      NRemZ = Nxyz(3) - BlockSizeZ*ProcessorZ

C  Find grid point where the change from BlockSize+1 to BlockSize happens
      GSplitY = NRemY*(BlockSizeY + 1)
      GSplitZ = NRemZ*(BlockSizeZ + 1)

C  Find processor grid coordinates for this point
      if (Mesh(2).le.GSplitY) then
        PGy = (Mesh(2) - 1)/(BlockSizeY + 1) + 1
      else
        PGy = (Mesh(2) - 1 - GSplitY)/(BlockSizeY) + NRemY + 1
      endif
      if (Mesh(3).le.GSplitZ) then
        PGz = (Mesh(3) - 1)/(BlockSizeZ + 1) + 1
      else
        PGz = (Mesh(3) - 1 - GSplitZ)/(BlockSizeZ) + NRemZ + 1
      endif

C  If this node isn't the local one then set pointer to zero and return
      if (Py.ne.PGy.or.Pz.ne.PGz) then
        LMesh(1) = 0
        LMesh(2) = 0
        LMesh(3) = 0
        return
      endif

C  Get local mesh pointer by subtracting baseline
      LMesh(1) = Mesh(1)
      if (Mesh(2).le.GSplitY) then
        LMesh(2) = Mesh(2) - (PGy - 1)*(BlockSizeY + 1)
      else
        LMesh(2) = Mesh(2) - (PGy - 1)*BlockSizeY - NRemY
      endif
      if (Mesh(3).le.GSplitZ) then
        LMesh(3) = Mesh(3) - (PGz - 1)*(BlockSizeZ + 1)
      else
        LMesh(3) = Mesh(3) - (PGz - 1)*BlockSizeZ - NRemZ
      endif

      end subroutine GlobalToLocalXYZMesh

      subroutine set_processorYdefault(Nodes,procYdefault)
C
C Finds a sensible default value for the Y dimension of the processor
C grid, aiming for a split of the nodes that is as close to square as
C possible. Currently only factors of 2, 3 and 5 are considered.
C
C Input :
C
C integer Nodes        : total number of processors
C
C Output :
C
C integer procYdefault : default value of Y grid parameter
C
C Written by Julian Gale, November 1999
C
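C  Example (illustrative value only): for Nodes = 12 the factor
C  search ends with Nx = 4 and Ny = 3, so procYdefault = 3 and the
C  resulting processor grid is 3 x 4 (ProcessorY x ProcessorZ).
C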
C Passed arguments
      integer
     .  Nodes, procYdefault
C Local variables
      integer
     .  Nx, Ny, Nrem
      logical
     .  factor

C Initialise values
      Nx = 1
      Ny = 1
      Nrem = Nodes
      factor = .true.

C Loop looking for factors
      do while (factor.and.Nrem.gt.1)
        factor = .false.
        if (mod(Nrem,2).eq.0) then
          Nrem = Nrem/2
          factor = .true.
          if (Nx.gt.Ny) then
            Ny = 2*Ny
          else
            Nx = 2*Nx
          endif
        endif
        if (mod(Nrem,3).eq.0) then
          Nrem = Nrem/3
          factor = .true.
          if (Nx.gt.Ny) then
            Ny = 3*Ny
          else
            Nx = 3*Nx
          endif
        endif
        if (mod(Nrem,5).eq.0) then
          Nrem = Nrem/5
          factor = .true.
          if (Nx.gt.Ny) then
            Ny = 5*Ny
          else
            Nx = 5*Nx
          endif
        endif
      enddo

C Choose default value as lowest of Nx and Ny
      if (Nx.gt.Ny) then
        procYdefault = Ny
      else
        procYdefault = Nx
      endif

      end subroutine set_processorYdefault

#ifdef MPI
      subroutine set_blocksizedefault(Nodes,nuotot,bs)
C
C Finds a sensible default value for the blocksize.
C When the number of orbitals is too small for the blocksize
C typically used, the blocksize is lowered to ensure that
C some work is done on all nodes.
C
C Input :
C
C integer Nodes        : total number of processors
C integer nuotot       : total number of orbitals
C
C Output :
C
C integer bs : default value of blocksize
C
C Written by Julian Gale, March 2001
C Modified by Alberto Garcia, October 2013
C
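C  Example (illustrative values only): for nuotot = 100 and
C  Nodes = 8 the loop starts from bs = 100/8 + 1 = 13; numroc then
C  reports 13 orbitals on the first node and 9 on the last, an
C  acceptable imbalance, so bs = 13 is kept (nodes 0-6 hold 13
C  orbitals each and node 7 holds 9).
C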
C Passed arguments
      integer, intent(in)  :: Nodes, nuotot
      integer, intent(out) :: bs

      integer  :: n1, nn

      ! Scalapack routine for block-cyclic distributions
      integer, external :: numroc

C Compare number of orbitals against sensible number
! Note that this number could be optimized...

      if (nuotot.gt.24*Nodes) then
        bs = 24
      else if (nuotot.lt.Nodes) then
        bs = 1
      else

         bs = nuotot/Nodes + 1
         do
            ! Check the number of orbitals handled
            ! by the first and last processors
            ! Avoid idle processors, and too-high
            ! imbalances

            n1 = numroc(nuotot,bs,0,0,nodes)
            nn = numroc(nuotot,bs,nodes-1,0,nodes)

            if (nn == 0) then
               bs = bs - 1
            else if (dble(n1)/nn > 2) then
               bs = bs - 1
            else
               exit
            endif
         enddo

      endif

      end subroutine set_blocksizedefault
#endif
      end module parallelsubs
