1      subroutine argos_space_dldbal(stime,syntim,waltim,nod,nlst,
2     + nen,lbbl,dlb,lpsyn,ibindx,iburen,npp)
3c
4      implicit none
5c
6#include "argos_space_common.fh"
7#include "mafdecls.fh"
8#include "msgids.fh"
9#include "bitops.fh"
10c
11      integer argos_space_btop
12      external argos_space_btop
13c
14      integer npp
15      integer nod(np),nlst(npp,2),nen(np,mbbl)
16      integer lbbl(mbbl,mbb2)
17      real*8 stime,syntim,synsum,waltim,dlb(*),factor,facx,facy,facz
18      logical lpsyn
19      logical lbsize
20c
21      integer i,j,k,n,ibusy,least,iproc,icell,jcell,itemp,jtemp
22      integer intsiz,ibindx(np),iburen(np,27,2)
23      logical lnb(27)
24      integer nfr,nto,node
25c
26c     no load balancing needed if run on single processor
27c
28      if(np.eq.1) return
29c
30c     collect the synchronization times converted into integers
31c
32      do 1 i=1,np
33      nlst(i,1)=0
34      nlst(i,2)=0
35      nod(i)=i-1
36    1 continue
37      nlst(me+1,1)=int(1.0d6*syntim)
38      nlst(npp,1)=int(1.0d6*waltim)
39      nlst(npp,2)=0
40c
41      do 2 i=1,nbbl
42      iproc=lbbl(i,1)
43      if(iproc.ne.me) then
44      n=ibindx(iproc+1)
45      if(n.gt.0) nlst(me+1,2)=ior(nlst(me+1,2),2**(n-1))
46      endif
47    2 continue
48c
49c     get the size of an integer on this machine in bits
50c
51      intsiz=ma_sizeof(mt_int,1,mt_byte)
52c
53c      if(intsiz.le.32) then
54      call ga_igop(msp_14,nlst,2*npp,'+')
55c      else
56c      do 3 i=1,np
57c      nlst(i,1)=lshift(nlst(i,1),32)+nlst(i,2)
58c    3 continue
59c      call ga_igop(msp_14,nlst,np,'+')
60c      do 4 i=1,np
61c      nlst(i,2)=iand(nlst(i,1),8589934591)
62c      nlst(i,1)=rshift(nlst(i,1),32)
63c    4 continue
64c      endif
65c
66      synsum=0.0d0
67      do 5 i=1,np
68      dlb(i)=dble(nlst(i,1))*1.0d-6
69      synsum=synsum+dlb(i)
70    5 continue
71      waltim=(dble(nlst(npp,1))*1.0d-6/dble(np))
72c
73      if(nldup.lt.0) then
74      tslow=synsum
75      nldup=nldup+1
76      else
77      if(synsum.gt.tslow) then
78      nldup=nldup+1
79      else
80      tslow=synsum
81      nldup=0
82      endif
83      endif
84c
85c     if the accumulated synchronization time is less than 0.0001 sec
86c     no load balancing is needed
87c
88      if(synsum.lt.1.0d-4) return
89c
90c     determine the busiest processor
91c     if the busiest processor has no off-processor box-box pairs
92c     load balancing will be done by resizing
93c
94      ibusy=1
95      do 6 i=2,np
96      if(abs(dlb(i)).lt.abs(dlb(ibusy))) ibusy=i
97    6 continue
98c
99c     if the busiest processor has no off-processor pairs in the
100c     cell-cell list loadbalancing will be done by resizing
101c
102      lbsize=nlst(ibusy,2).eq.0
103      if(nldup.ge.lbpair) lbsize=.true.
104c
105c     order node list
106c
107      do 7 i=1,np-1
108      do 8 j=i+1,np
109      if(dlb(nod(i)+1).gt.dlb(nod(j)+1)) then
110      n=nod(i)
111      nod(i)=nod(j)
112      nod(j)=n
113      endif
114    8 continue
115    7 continue
116c
117c     load balancing based on cell resizing
118c
119c     1  if specified as only method in input
120c     2. if busiest processor has no off-processor cell-cell pairs (lbsize true)
121c     3. if busiest alternates between two processors (lpipo true)
122c
123      if(loadb.eq.1.or.(loadb.eq.3.and.(lbsize.or.lpipo))) then
124      factor=(1.0d0-factld*(synsum/dble(np)-dlb(ibusy))/waltim)**third
125      facx=1.0d0
126      facy=1.0d0
127      facz=1.0d0
128      call argos_space_resize(ibusy-1,int_mb(i_iown),
129     + dbl_mb(i_boxs),factor,facx,facy,facz)
130      if(me.eq.0.and.lpsyn) then
131      write(lfnsyn,3000) 1,stime,synsum,waltim,ibusy-1,
132     + factor,facx,facy,facz,lpipo
133 3000 format('synchronization',/,i5,3f12.6,/,i5,4f12.6,4x,l1)
134      do 9 i=1,np
135      write(lfnsyn,3001) nod(i),dlb(nod(i)+1)
136 3001 format(i5,f12.6)
137    9 continue
138      call argos_space_wrtbxsz(lfnsyn,dbl_mb(i_boxs))
139      endif
140      ipairf=-3
141      ipairt=-4
142      lpipo=.false.
143      tsyncp=synsum
144      nldup=0
145      tslow=synsum
146      return
147      endif
148c
149c     cascade implementation
150c
151      if(me.eq.0) then
152c
153      do 12 i=1,np
154      nen(i,1)=-1
155   12 continue
156c
157c     for each processor find least busy neighbor in cell-cell list
158c
159      do 10 i=1,np
160      node=nod(i)
161      n=nlst(node+1,2)
162      do 11 j=1,27
163      lnb(j)=iand(n,1).eq.1
164      n=rshift(n,1)
165   11 continue
166      least=-1
167      do 13 j=1,27
168      iproc=iburen(node+1,j,1)
169      if(nen(iproc+1,1).eq.-1) then
170      if(iproc.ne.node.and.lnb(j)) then
171      if(least.ge.0) then
172      if(dlb(iproc+1).gt.dlb(least+1)) least=iproc
173      else
174      if(dlb(iproc+1).gt.dlb(node+1)) least=iproc
175      endif
176      endif
177      endif
178   13 continue
179c
180c     nen(i,1) contains processor id that processor i-1 will receive from
181c
182      if(least.ge.0) then
183      if(nen(least+1,1).lt.0) then
184      nen(least+1,1)=node
185      endif
186      endif
187c
188   10 continue
189c
190      endif
191c
192c
193c     broadcast the list
194c
195      call ga_brdcst(msp_16,nen(1,1),np*intsiz,0)
196c
197c     determine cell pair transfer list
198c
199c     the list contains the box-box pairs to be moved as follows:
200c
201c        nen(i,1) = ibox+1
202c        nen(1,2) = jbox+1
203c
204c     the pair will move from iproc to jproc
205c
206      do 14 i=1,np
207      if(me.eq.nen(i,1)) then
208      do 15 j=1,nbbl
209      if(lbbl(j,1).eq.i-1) then
210      nen(i,1)=lbbl(j,3)+1
211      nen(i,2)=lbbl(j,2)+1
212      goto 14
213      endif
214   15 continue
215      else
216      nen(i,1)=0
217      nen(i,2)=0
218      endif
219   14 continue
220c
221      call ga_igop(msp_17,nen,2*np,'+')
222c
223c     cascading
224c
225      call argos_space_cascad(nen,int_mb(i_bb))
226c
227      if(me.eq.0.and.lpsyn) then
228      write(lfnsyn,3002) 2,stime,synsum,waltim,nldup,tslow
229 3002 format('synchronization',/,i5,3f12.6,i5,f12.6)
230      do 16 i=1,np
231      write(lfnsyn,3003) nod(i),dlb(nod(i)+1)
232 3003 format(i5,f12.6,i7,i5)
233   16 continue
234      endif
235c
236      return
237      end
238c $Id$
239