1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id: decomp.c,v 1.9.6.2 2007-07-04 00:50:06 manoj Exp $ */
6 /***************************************************************************
7  *---
8  *--- The software in this file implements three heuristics for distributing
9  *--- multidimensional arrays: ddb_h1 and ddb_h2 are fastest, ddb_ex does
10  *--- an exhaustive search, see below for details.
11  *---
12  *--- To compile this file: cc ddb.c -lm
13  *---
14  *--- Author: Joel Malard
15  *--- Address: Pacific Northwest National Laboratory
16  *---          Battelle Boulevard, PO Box 999
17  *---          Richland, WA 99352
18  *---
19  *--- Bug et al.: jm.malard@pnl.gov
20  *---
21  ***************************************************************************/
22 #if HAVE_STDIO_H
23 #   include <stdio.h>
24 #endif
25 #if HAVE_STDLIB_H
26 #   include <stdlib.h>
27 #endif
28 #if HAVE_MATH_H
29 #   include <math.h>
30 #endif
31 #include "ga-papi.h"
32 #include "typesf2c.h"
33 
34  /*--
35  *****************************************************************************
36  *--
37  *-- void ddb_h1 and ddb_h2 implement load-balancing heuristics
38  *-- void ddb_ex implements an exhaustive search
39  */
40 void ddb(Integer ndims, Integer ardims[], Integer npes, Integer blk[],
41          Integer pedims[]);
42 void ddb_ex( long ndims, Integer ardims[], long npes, double threshold,
43              Integer blk[], Integer pedims[]);
44 void ddb_h1( long ndims, Integer ardims[], long npes, double threshold,
45              Integer blk[], Integer pedims[]);
46 void ddb_h2( Integer ndims, Integer ardims[], Integer npes, double threshold,
47              Integer bias, Integer blk[], Integer pedims[]);
48 
49 /*---------------------------------------------------------------------------
50  *-- Arguments
51  *--
52  *-- The above three procedures have similar sequences of arguments
53  *-- the only difference is that ddb_h2 takes an additional input argument
54  *-- that induces the heuristic to possibly favor distributing the
55  *-- right or the left axes of the data array.
56  *--
57  *-- ndims (input): number of dimensions in the data array and the
58  *-- process grid. There is no provision for requesting process
59  *-- with fewer dimensions as the data array.
60  *--
61  *-- ardims (input): extents of each dimension of the data array.
62  *-- This array is of size ndims. destroyed.
63  *--
64  *-- npes (input): number of processes onto which the distribution
65  *-- takes place.
66  *--
67  *-- threshold (input): minimum acceptable value of the load balance
68  *-- ratio returned by ddb_ap(), see below.
69  *--
70  *-- bias (input to ddb_h2): when set to a positive value positive
71  *-- the rightmost axes of the data array are preferentially
72  *-- distributed, similarly when bias is negative. When bias is zero
73  *-- the heuristic attempts to deal processes equally among the axes
74  *-- of the data array.
75  *--
76  *-- blk (input/output): granularity of data mapping: The number of
77  *-- consecutive elements along each dimension of the array. Upon output:
78  *-- extents of the local array assigned to the process
79  *-- that is assigned the array element with lowest global indices, e.g. A[0].
80  *-- The meaning of this array is not the same for ddb than for the underlying
81  *-- load balancing subroutines. For the subroutine ddb, any non-positive value
82  *-- in array blk is taken at face value. For example with ardim=[50,40],
83  *-- blk = [3,-1] and npes = 40 the process grid that will be computed is:
84  *-- pedims=[20,2] with 3x2 processes storing no data. This is compatible
85  *-- with the semantics of the load-balancing subroutine in the 2D GA.
86  *--
87  *-- pedims(output): number of processes along each dimension of the
88  *-- data array.
89  *--
90  *--------------------------------------------------------------------------
91  *--
92  *-- Other prototypes
93  *--
94  *-- dd_ev evaluates the load balance ratio for the data distribution
95  *-- specified by its argument list.
96  *--
97  *-- ddb_ap and dd_lk are specific to ddb_h1.
98  */
99 void ddb_ap(long ndims, double qedims[], Integer ardims[], Integer pedims[],
100             long npes, long npdivs, long pdivs[]);
101 double dd_ev(long ndims,Integer ardims[], Integer pedims[]);
102 long dd_lk(long * prt, long n, double key);
103 void dd_su(long ndims, Integer ardims[], Integer pedims[], Integer blk[]);
104 /*---------------------------------------------------------------------------
105  *--
106  *-- Dependencies:
107  *--
108  *-- ddb: ddb_h2, ddb_ex
109  *-- ddb_ex: dd_ev, dd_su
110  *-- ddb_h1: ddb_ap, dd_ev, ddb_ex, dd_lk, dd_su  & -lm
111  *-- ddb_h2: dd_ev, ddb_ex, dd_su
112  *--
113  ***************************************************************************/
114 
115 #define THRESHOLD -0.1 /* The threshold for switching to an exhaustive search*/
116 
117 /************************************************************************
118  *--
119  *--  void ddb is a wrapper ontop of some load balancing heuristics. ddb
120  *--  is called from within GA to compute process grids.
121  *--  The first argument, ndims, is the number of
122  *--  array dimensions. The resulting process grid also has ndims
123  *--  dimensions but some of these can be degenerate.
124  ************************************************************************/
ddb(Integer ndims,Integer ardims[],Integer npes,Integer blk[],Integer pedims[])125 void ddb(Integer ndims, Integer ardims[], Integer npes, Integer blk[],
126          Integer pedims[])
127 {
128     double ddb_threshold = 0.1;
129     long ddb_bias = 0;
130     long i, j;
131     Integer count = 0;
132     Integer *tardim, *tblk, *tpedim;
133     long tp, sp;
134 
135     tp = (long)npes;
136 
137     /* count how many axes have <don't care> block values.*/
138     for(i=ndims-1;i>=0;i--){
139        if(blk[i]<=0){
140           pedims[i] = -1;
141           count += 1;
142        } else {
143           sp = (long)(ardims[i]+blk[i]-1)/blk[i];
144           if(sp>tp) {
145              sp = tp; tp = 1;
146              pedims[i] = (Integer)sp;
147           } else {
148              for(j=sp;j<tp&&(tp%j!=0);j++);
149              pedims[i] = (Integer)j;
150              tp = tp / j;
151           }
152        }
153     }
154 
155     if(count>0){
156        tardim = (Integer *) calloc((size_t)count,sizeof(Integer));
157        if(tardim==NULL) {
158          fprintf(stderr,"ddb: Memory allocation failed\n");
159          for(i=0;i<ndims;i++) pedims[i] = 0;
160          return;
161        }
162        tblk = (Integer *) calloc((size_t)count,sizeof(Integer));
163        if(tblk==NULL) {
164           fprintf(stderr,"ddb: Memory allocation failed\n");
165           for(i=0;i<ndims;i++) pedims[i] = 0;
166           return;
167        }
168        tpedim = (Integer *) calloc((size_t)count,sizeof(Integer));
169        if(tpedim==NULL) {
170           fprintf(stderr,"ddb: Memory allocation failed\n");
171           for(i=0;i<ndims;i++) pedims[i] = 0;
172           return;
173        }
174 
175        for(i=0;i<count;i++) tblk[i] = 1;
176        for(i=0,j=0;j<ndims;j++)
177           if(pedims[j]<0) tardim[i++] = ardims[j];
178 
179        ddb_h2( count, tardim, (Integer)tp, ddb_threshold, ddb_bias, tblk, tpedim);
180        /* ddb_h1( count, tardim, tp, ddb_threshold, tblk, tpedim); */
181 
182        for(i=0,j=0;j<ndims;j++)
183           if(pedims[j]<0){
184              blk[j] = (tardim[i]+tpedim[i]-1)/tpedim[i];
185              i = i+1;
186           }
187 
188        for(i=0,j=0;j<ndims;j++)
189           if(pedims[j]<0) pedims[j] = tpedim[i++];
190 
191        free( tpedim );
192        free( tblk );
193        free( tardim );
194     } else {
195        for(j=0;j<ndims;j++)
196           if(pedims[j]<0) pedims[j] = 1;
197     }
198 
199 }
200 /************************************************************************
201  *--
202  *--  void ddb_ex implements a naive data mapping of a multi-dimensional
203  *--  array across a process Cartesian topology. ndims is the number of
204  *--  array dimensions. The resulting process grid also has ndims
205  *--  dimensions but some of these can be degenerate.
206  *--
207  *--  Heuristic:   Let d be the number of dimensions of the data array.
208  *--  Return that assignment p1, ..., pd that minimizes the communication
209  *--  volume measure among those that maximizes the load balancing ratio
210  *--  computed by dd_ev.
211  *--  The communication volume measure is the sum of
212  *--  all monomials (ni/pi) of degree d-1.
213  *--
214  *--  ddb_ex returns as soon as it has found a process distribution whose
215  *--  load balance ratio is at least as large as the value of threshold.
216  *--
217  *--  This procedure allocates storage for 3*ndims+npes integers.
218  *--
219  ************************************************************************/
ddb_ex(long ndims,Integer ardims[],long npes,double threshold,Integer blk[],Integer pedims[])220 void ddb_ex( long ndims, Integer ardims[], long npes, double threshold,
221              Integer blk[], Integer pedims[])
222 {
223       Integer *tdims;
224       long *pdivs;
225       long npdivs;
226       long i, j, k;
227       long bev;
228       long pc, done;
229       long *stack;
230       Integer *tard;
231       long r, cev;
232       double clb, blb;
233 
234       /*- Quick exit -*/
235       if(ndims==1) {
236            pedims[0] = npes;
237            blb = dd_ev(ndims,ardims,pedims);
238            dd_su(1,ardims,pedims,blk);
239            return;
240       }
241 
242       /*- Reset array dimensions to reflect granularity -*/
243       tard = (Integer *) calloc((size_t)ndims,sizeof(Integer));
244       if(tard==NULL) {
245          fprintf(stderr,"ddb_ex: Memory allocation failed\n");
246          for(i=0;i<ndims;i++) blk[i] = 0;
247          return;
248       }
249       for(i=0;i<ndims;i++) if (blk[i]<1) blk[i] = 1;
250       for(i=0;i<ndims;i++) tard[i] = ardims[i] / blk[i];
251       for(i=0;i<ndims;i++) if (tard[i]<1) {
252          tard[i] = 1; blk[i] = ardims[i]; }
253 
254       /*- Allocate memory for current solution -*/
255       tdims = (Integer *) calloc((size_t)ndims,sizeof(Integer));
256       if(tdims==NULL) {
257          fprintf(stderr,"ddb_ex: Memory allocation failed\n");
258          for(i=0;i<ndims;i++) blk[i] = 0;
259          return;
260       }
261 
262       /*- Allocate memory to hold divisors of npes -*/
263       npdivs = 1;
264       for(i=2;i<=npes;i++) if(npes%i==0) npdivs += 1;
265       pdivs = (long *) calloc((size_t)npdivs,sizeof(long));
266       if(pdivs==NULL) {
267          fprintf(stderr,"ddb_ex: Memory allocation failed\n");
268          for(i=0;i<ndims;i++) blk[i] = 0;
269          free(tard);
270          return;
271       }
272 
273       /*- Allocate storage for the recursion stack -*/
274       stack = (long *) calloc((size_t)ndims,sizeof(long));
275       if(stack==NULL){
276          fprintf(stderr,"%s: %s\n","ddb_ex",
277              "memory allocation failed");
278          for(i=0;i<ndims;i++) blk[i] = 0;
279          free(tard);
280          free(pdivs);
281          return;
282       }
283 
284       /*- Find all divisors of npes -*/
285       for(j=0,i=1;i<=npes;i++) if(npes%i==0) pdivs[j++] = i;
286 
287       /*- Pump priming the exhaustive search -*/
288       blb = -1.0;
289       bev = 1.0;
290       for(i=0;i<ndims;i++) bev *= tard[i];
291       pedims[0]=npes; for(i=1;i<ndims;i++) pedims[i] = 1;
292       tdims[0] = 0;
293       stack[0] = npes;
294       pc = 0;
295       done = 0;
296 
297       /*-  Recursion loop -*/
298       do {
299          if(pc==ndims-1) {
300 	   /*- Set the number of processes for the last dimension -*/
301            tdims[pc] = stack[pc];
302 
303 	   /*- Evaluate current solution  -*/
304             clb = dd_ev(ndims,tard,tdims);
305             cev = 0;
306             for(k=0; k<ndims; k++){
307                r = 1;
308                for(j=0; j<ndims; j++) {
309                   if(j!=k) r = r*(tard[j]/tdims[j]);
310                }
311                cev = cev+r;
312             }
313             if(clb>blb || (clb==blb && cev<bev)) {
314                for(j=0; j<ndims; j++) pedims[j] = tdims[j];
315                blb = clb;
316                bev = cev;
317             }
318             if(blb>threshold) break;
319             tdims[pc] = 0;
320             pc -= 1;
321          } else {
322            if( tdims[pc] == stack[pc] ) {
323 	     /*- Backtrack when current array dimension has exhausted
324               *- all remaining processes
325               */
326               done = (pc==0);
327               tdims[pc] = 0;
328               pc -= 1;
329            } else {
330 	     /*- Increment the number of processes assigned to the current
331               *- array axis.
332               */
333               for(tdims[pc]+=1; stack[pc]%tdims[pc]!=0; tdims[pc]+=1);
334               pc += 1;
335               stack[pc] = npes;
336               for(i=0;i<pc;i++) stack[pc] /= tdims[i];
337               tdims[pc] = 0;
338            }
339          }
340       } while(!done);
341 
342       dd_su(ndims,ardims,pedims,blk);
343 
344       free(tard);
345       free(stack);
346       free(tdims);
347       free(pdivs);
348 }
349 /************************************************************************
350  *--
351  *-- void ddb_h1 estimates a good load balancing scheme
352  *-- for a Cartesian process topology given the extents
353  *-- along all dimensions of an array. It does so by solving
354  *-- exactly the continous problem and then finding a 'nearby'
355  *-- approximation to the solution of the discrete problem.
356  *--
357  *-- If the value of objective function attained with this heuristic
358  *-- is less than threshold then an exhaustive search is performed.
359  *--
360  *-- This procedure allocates storage for ndims longs and npes doubles and
361  *-- may call ddb_ex.
362  *--
363  ************************************************************************/
ddb_h1(long ndims,Integer ardims[],long npes,double threshold,Integer blk[],Integer pedims[])364 void ddb_h1(long ndims, Integer ardims[], long npes, double threshold,
365            Integer blk[], Integer pedims[])
366       {
367       long h, i, j, k;
368       double * qedims;
369       long * pdivs;
370       Integer * apdims;
371 	  Integer *tard;
372       long npdivs;
373       double t, q;
374       double cb, ub, blb;
375       if(ndims==1) {
376          pedims[0] = npes;
377          blb = dd_ev(ndims,ardims,pedims);
378          dd_su(ndims,ardims,pedims,blk);
379          return;
380       }
381 
382       /*- Allocate memory to store the granularity -*/
383       tard = (Integer *) calloc((size_t)ndims,sizeof(Integer));
384       if(tard==NULL){
385          fprintf(stderr,"%s: %s\n","ddb_h1",
386              "memory allocation failed");
387          for(i=0;i<ndims;i++) blk[i] = 0;
388          return;
389       }
390       /*- Reset array dimensions to reflect granularity -*/
391       for(i=0;i<ndims;i++) if (blk[i]<1) blk[i] = 1;
392       for(i=0;i<ndims;i++) tard[i] = ardims[i] / blk[i];
393       for(i=0;i<ndims;i++) if (tard[i]<1) {
394          tard[i] = 1; blk[i] = ardims[i];}
395 
396       /*- First solve the load balancing problem exactly in
397        *- floating point arithmetic -*/
398       qedims = (double *) calloc((size_t)ndims,sizeof(double));
399       if(qedims==NULL){
400          fprintf(stderr,"%s: %s\n","ddb_h1",
401              "memory allocation failed");
402          for(i=0;i<ndims;i++) blk[i] = 0;
403          free(tard);
404          return;
405       }
406 
407       qedims[0] = (double) npes;
408       for(i=1;i<ndims;i++) qedims[0] /= (tard[i]/(double)tard[0]);
409       qedims[0] = pow(qedims[0],1.0/ndims);
410 
411       for(i=1;i<ndims;i++){
412           qedims[i] = (tard[i]/(double)tard[0])*qedims[0];
413       };
414 
415       /*- Set up the search for a integer approximation the floating point solution -*/
416       npdivs = 1;
417       for(i=2;i<=npes;i++) if(npes%i==0) npdivs += 1;
418       pdivs = (long *) calloc((size_t)npdivs,sizeof(long));
419       if(pdivs==NULL){
420          fprintf(stderr,"%s: %s\n","split",
421              "memory allocation failed");
422          for(i=0;i<ndims;i++) blk[i] = 0;
423          free(tard);
424          free(qedims);
425          return;
426       }
427       /*- Compute the discrete approximation -*/
428       for(j=0,i=1;i<=npes;i++) if(npes%i==0) pdivs[j++] = i;
429       ddb_ap(ndims,qedims,tard,pedims,npes,npdivs,pdivs) ;
430       free(qedims);
431       free(pdivs);
432 
433 
434       /*- Lookout for a permutation of the solution vector
435        *- that would improve the initial solution -*/
436       apdims = (Integer *) calloc((size_t)ndims,sizeof(Integer));
437       if(apdims==NULL){
438          fprintf(stderr,"%s: %s\n","split",
439              "memory allocation failed");
440          for(i=0;i<ndims;i++) blk[i] = 0;
441          free(tard);
442          return;
443       }
444 
445       ub = dd_ev(ndims,tard,pedims);
446       for(k=0;k<ndims;k++) apdims[k] = pedims[k];
447 
448       do {
449          for(k=0;k<ndims;k++) pedims[k] = apdims[k];
450          blb = ub;
451 
452          /*- Find the worst distributed dimension -*/
453          h = 0;
454          q = (tard[0]<pedims[0]) ? pedims[0] : tard[0]%pedims[0];
455          q /= (double)pedims[0];
456          for(k=1;k<ndims;k++){
457             t = (tard[k]<pedims[k]) ? pedims[k] : tard[k]%pedims[k];
458             t /= (double) pedims[k];
459             if(t>q) {
460               h = k; q = t;
461             }
462          }
463 
464          /*- Swap elements of apdims to improve load balance */
465          j = h;
466          for(k=0;k<ndims;k++){
467             if(k==h) continue;
468             i = apdims[h]; apdims[h] = apdims[k]; apdims[k] = i;
469             cb = dd_ev(ndims,tard,apdims);
470             if(cb>ub) {
471               j = k;
472               ub = cb;
473             }
474             i = apdims[h]; apdims[h] = apdims[k]; apdims[k] = i;
475          }
476          if(j!=h){
477             i = apdims[h]; apdims[h] = apdims[j]; apdims[j] = i;
478          }
479       } while (ub > blb);
480 
481       for(i=0;i<ndims;i++) pedims[i] = apdims[i];
482       blb = ub;
483       free(apdims);
484 
485       /*- Do an exhaustive search is the heuristic returns a solution
486        *- whose load balance ratio is less than the given threshold. -*/
487       if(blb<threshold) ddb_ex(ndims,tard,npes,threshold,blk,pedims);
488 
489       free(tard);
490 
491       dd_su(ndims,ardims,pedims,blk);
492 
493       return;
494       }
495 
496 /*---------------------------------------------------------------------------
497  *--
498  *-- void ddb_ap find an integer vector that is near in some sense
499  *-- to a real valued vector
500  *--
501  *---------------------------------------------------------------------------*/
ddb_ap(long ndims,double * qedims,Integer * ardims,Integer * pedims,long npes,long npdivs,long * pdivs)502       void ddb_ap(long ndims, double * qedims, Integer * ardims, Integer * pedims,
503                   long npes, long npdivs, long * pdivs)
504       {
505       long bq;
506       long i, k, g;
507       long idim;
508          for(idim=0;idim<ndims-1;idim++){
509             g = dd_lk(pdivs,npdivs,qedims[idim]);
510             bq = pdivs[g] ;
511             pedims[idim] = bq;
512             npes /= bq;
513             npes = (npes<1) ? 1 : npes;
514             if(idim<ndims-2){
515                qedims[idim+1] = (double) npes;
516                for(i=idim+2;i<ndims;i++) qedims[idim+1] /=
517                    (ardims[i]/(double)ardims[idim+1]);
518                qedims[idim+1] = (double) pow(qedims[idim+1],
519                    1.0/(double)(ndims-1-idim));
520                for(i=idim+2;i<ndims;i++){
521                    qedims[i] = (ardims[i]/(double)ardims[idim+1])*
522                        qedims[idim+1];
523                }
524 
525                if(bq>1) {
526                    for(k=1,i=g+1;i<npdivs;i++){
527                        if(pdivs[i]%bq==0) {
528                            pdivs[k] = pdivs[i]/bq;
529                            k += 1;
530                        }
531                    }
532                    npdivs = k;
533                }
534             }
535          }
536          pedims[ndims-1] = npes;
537       }
538 /*---------------------------------------------------------------------------
539  *--
540  *-- long dd_lk find nearest match to key
541  *--
542  *---------------------------------------------------------------------------*/
dd_lk(long * prt,long n,double key)543       long dd_lk(long *prt, long n, double key)
544       {
545       long lw, md, hgh;
546       double km,kz;
547       long h, i;
548       long k;
549       double u, v;
550 
551       if(n==1) return 0 ;
552       if(n<=5) {
553         k = 0 ;
554         km = key-prt[0]; if(km<0.0) km = -km;
555         for(i=1;i<n;i++) {
556            kz = key-prt[i]; if(kz<0.0) kz = -kz;
557            if(kz<km) {
558               km = kz ;
559               k = i ;
560            }
561         }
562         return k ;
563       }
564       lw = 0; hgh=n-1;
565 
566       if(lw==hgh) {
567          return lw;
568       }
569       do {
570          md = (lw+hgh)/2;
571          if(key>prt[md]) {
572             lw = md+1;
573          } else {
574             hgh = md;
575          }
576       } while (lw<hgh);
577       kz = prt[lw];
578       u = key-prt[lw]; if(u<0.0) u = -u;
579       h = lw;
580       if(lw>0) {
581         v = key-prt[lw-1]; if(v<0.0) v = -v;
582         if(v<u) {
583           u = v;
584           h = lw-1;
585         }
586       }
587       if(lw<n-1) {
588         v = key-prt[lw+1]; if(v<0.0) v = -v;
589         if(v<u) {
590           u = v;
591           h = lw+1;
592         }
593       }
594       return h ;
595       }
596 /************************************************************************
597  *--
598  *-- void ddb_h2 lists all prime divisors of the number of
599  *-- processes and distribute these among the array dimensions.
600  *-- If the value of objective function attained with this heuristic
601  *-- is less than threshold then an exhaustive search is performed.
602  *-- The argument bias directs the search of ddb_h2. When bias is
603  *-- positive the rightmost axes of the data array are preferentially
604  *-- distributed, similarly when bias is negative. When bias is zero
605  *-- the heuristic attempts to deal processes equally among the axes
606  *-- of the data array.
607  *--
608  *-- ddb_h2 allocates storage for ndims+npes integers and may call ddb_ex.
609  *--
610  ************************************************************************/
ddb_h2(Integer ndims,Integer ardims[],Integer npes,double threshold,Integer bias,Integer blk[],Integer pedims[])611 void ddb_h2(Integer ndims, Integer ardims[], Integer npes, double threshold, Integer bias,
612             Integer blk[], Integer pedims[])
613       {
614       long h, i, j, k;
615       Integer *tard;
616       long * pdivs;
617       long npdivs;
618       long p0;
619       double q, w;
620       double ub;
621       long istart, istep, ilook;
622 
623       /*- Allocate memory to store the granularity -*/
624       tard = (Integer *) calloc((size_t)ndims,sizeof(Integer));
625       if(tard==NULL){
626          fprintf(stderr,"%s: %s\n","ddb_h2",
627              "memory allocation failed");
628          for(i=0;i<ndims;i++) blk[i] = 0;
629          return;
630       }
631       /*- Reset array dimensions to reflect granularity -*/
632       for(i=0;i<ndims;i++) if (blk[i]<1) blk[i] = 1;
633 
634       for(i=0;i<ndims;i++) tard[i] = (ardims[i]+ blk[i]-1)/blk[i]; /* JM */
635       for(i=0;i<ndims;i++) if (tard[i]<1) {
636          tard[i] = 1; blk[i] = ardims[i];
637       }
638 
639       /*- Allocate storage to old all divisors of npes -*/
640       npdivs = 1;
641       for(i=2;i<=npes;i++) if(npes%i==0) npdivs += 1;
642       pdivs = (long *) calloc((size_t)npdivs,sizeof(long));
643       if(pdivs==NULL){
644          fprintf(stderr,"%s: %s\n","split",
645              "memory allocation failed");
646          for(i=0;i<ndims;i++) blk[i] = 0;
647          free(tard);
648          return;
649       }
650       /*- Find all divisors of npes -*/
651       for(j=0,i=1;i<=npes;i++) if(npes%i==0) pdivs[j++] = i;
652 
653       /*- Find all prime divisors of npes (with repetitions) -*/
654       if(npdivs>1) {
655          k = 1;
656          do {
657             h = k+1;
658             for(j=h;j<npdivs;j++)
659             if(pdivs[j]%pdivs[k]==0)
660                pdivs[h++] = pdivs[j]/pdivs[k];
661             npdivs = h;
662             k = k+1;
663          } while(k<npdivs);
664       }
665 
666       /*- Set istart and istep -*/
667       istep = 1;
668       istart = 0;
669       if(bias>0) {
670           istep = -1;
671           istart = ndims-1;
672       }
673       /*- Set pedims -*/
674       for(j=0;j<ndims;j++) pedims[j] = 1.0;
675       for(k=npdivs-1;k>=1;k--){
676          p0 = pdivs[k]; h = istart;
677          q = (tard[istart]<p0*pedims[istart]) ? 1.1 :
678              (tard[istart]%(p0*pedims[istart]))/(double) tard[istart];
679          for(j=1;j<ndims;j++){
680             ilook = (istart+istep*j)%ndims;
681             w = (tard[ilook]<p0*pedims[ilook]) ? 1.1 :
682                 (tard[ilook]%(p0*pedims[ilook]))/(double) tard[ilook];
683             if(w<q) { q = w; h = ilook; }
684          }
685          pedims[h] *= p0;
686          if(bias==0) istart = (istart+1)%ndims;
687       }
688       free(pdivs);
689 
690       ub = dd_ev(ndims,tard,pedims);
691 
692       /*- Do an exhaustive search is the heuristic returns a solution
693        *- whose load balance ratio is less than the given threshold. -*/
694       if(ub<threshold) {
695          ddb_ex(ndims, tard, npes, threshold, blk, pedims);
696       }
697 
698       dd_su(ndims,ardims,pedims,blk);
699 
700       for(i=0;i<ndims;i++)
701           if(pedims[i]>0){
702              blk[i] = (tard[i]+pedims[i]-1)/pedims[i];
703           } else {
704              pnga_error("process dimension is zero: ddb_h2",0);
705           }
706 
707       free(tard);
708       return;
709       }
710 /****************************************************************************
711  *--
712  *--  double dd_ev evaluates the load balancing ratio as follows:
713  *--
714  *--  Let n1, n2 ... nd be the extents of the array dimensions
715  *--  Let p1, p2 ... pd be the numbers of processes across
716  *--      the corresponding dimensions of the process grid
717  *--  Let there be npes processes available
718  *--
719  *--  Load balancing measure = (n1/p1)*...*(nd/pd)*npes
720  *--                           ------------------------
721  *--                                 n1*n2*...*nd
722  *--  The communication volume measure is the sum of
723  *--  all monomials (ni/pi) of degree d-1.
724  *--
725  ****************************************************************************/
dd_ev(long ndims,Integer ardims[],Integer pedims[])726       double dd_ev(long ndims,Integer ardims[], Integer pedims[])
727       {
728       double q, t;
729       long k;
730       q = 1.0;
731       t = 1.0;
732       for(k=0;k<ndims;k++){
733           q = (ardims[k]/pedims[k])*pedims[k];
734           t = t*(q/(double)ardims[k]);
735       }
736       return t;
737       }
738 /****************************************************************************
739  *--
740  *--  void dd_su computes the extents of the local block corresponding
741  *--  to the element with least global indices, e.g. A[0][0][0].
742  *--
743  ****************************************************************************/
dd_su(long ndims,Integer ardims[],Integer pedims[],Integer blk[])744       void dd_su(long ndims, Integer ardims[], Integer pedims[], Integer blk[])
745       {
746       long i;
747 
748       for(i=0;i<ndims;i++) {
749            blk[i] = ardims[i]/pedims[i];
750            if(blk[i]<1) blk[i] = 1;
751       }
752       }
753 /****************************************************************************
754  *---
755  *--- The End
756  *---
757  ****************************************************************************/
758 
759