1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id: decomp.c,v 1.9.6.2 2007-07-04 00:50:06 manoj Exp $ */
6 /***************************************************************************
7 *---
8 *--- The software in this file implements three heuristics for distributing
9 *--- multidimensional arrays: ddb_h1 and ddb_h2 are fastest, ddb_ex does
10 *--- an exhaustive search, see below for details.
11 *---
12 *--- To compile this file: cc ddb.c -lm
13 *---
14 *--- Author: Joel Malard
15 *--- Address: Pacific Northwest National Laboratory
16 *--- Battelle Boulevard, PO Box 999
17 *--- Richland, WA 99352
18 *---
19 *--- Bug et al.: jm.malard@pnl.gov
20 *---
21 ***************************************************************************/
22 #if HAVE_STDIO_H
23 # include <stdio.h>
24 #endif
25 #if HAVE_STDLIB_H
26 # include <stdlib.h>
27 #endif
28 #if HAVE_MATH_H
29 # include <math.h>
30 #endif
31 #include "ga-papi.h"
32 #include "typesf2c.h"
33
34 /*--
35 *****************************************************************************
36 *--
37 *-- void ddb_h1 and ddb_h2 implement load-balancing heuristics
38 *-- void ddb_ex implements an exhaustive search
39 */
40 void ddb(Integer ndims, Integer ardims[], Integer npes, Integer blk[],
41 Integer pedims[]);
42 void ddb_ex( long ndims, Integer ardims[], long npes, double threshold,
43 Integer blk[], Integer pedims[]);
44 void ddb_h1( long ndims, Integer ardims[], long npes, double threshold,
45 Integer blk[], Integer pedims[]);
46 void ddb_h2( Integer ndims, Integer ardims[], Integer npes, double threshold,
47 Integer bias, Integer blk[], Integer pedims[]);
48
49 /*---------------------------------------------------------------------------
50 *-- Arguments
51 *--
52 *-- The above three procedures have similar sequences of arguments
53 *-- the only difference is that ddb_h2 takes an additional input argument
54 *-- that induces the heuristic to possibly favor distributing the
55 *-- right or the left axes of the data array.
56 *--
57 *-- ndims (input): number of dimensions in the data array and the
58 *-- process grid. There is no provision for requesting process
59 *-- with fewer dimensions as the data array.
60 *--
61 *-- ardims (input): extents of each dimension of the data array.
62 *-- This array is of size ndims. destroyed.
63 *--
64 *-- npes (input): number of processes onto which the distribution
65 *-- takes place.
66 *--
67 *-- threshold (input): minimum acceptable value of the load balance
68 *-- ratio returned by ddb_ap(), see below.
69 *--
70 *-- bias (input to ddb_h2): when set to a positive value positive
71 *-- the rightmost axes of the data array are preferentially
72 *-- distributed, similarly when bias is negative. When bias is zero
73 *-- the heuristic attempts to deal processes equally among the axes
74 *-- of the data array.
75 *--
76 *-- blk (input/output): granularity of data mapping: The number of
77 *-- consecutive elements along each dimension of the array. Upon output:
78 *-- extents of the local array assigned to the process
79 *-- that is assigned the array element with lowest global indices, e.g. A[0].
80 *-- The meaning of this array is not the same for ddb than for the underlying
81 *-- load balancing subroutines. For the subroutine ddb, any non-positive value
82 *-- in array blk is taken at face value. For example with ardim=[50,40],
83 *-- blk = [3,-1] and npes = 40 the process grid that will be computed is:
84 *-- pedims=[20,2] with 3x2 processes storing no data. This is compatible
85 *-- with the semantics of the load-balancing subroutine in the 2D GA.
86 *--
87 *-- pedims(output): number of processes along each dimension of the
88 *-- data array.
89 *--
90 *--------------------------------------------------------------------------
91 *--
92 *-- Other prototypes
93 *--
94 *-- dd_ev evaluates the load balance ratio for the data distribution
95 *-- specified by its argument list.
96 *--
97 *-- ddb_ap and dd_lk are specific to ddb_h1.
98 */
99 void ddb_ap(long ndims, double qedims[], Integer ardims[], Integer pedims[],
100 long npes, long npdivs, long pdivs[]);
101 double dd_ev(long ndims,Integer ardims[], Integer pedims[]);
102 long dd_lk(long * prt, long n, double key);
103 void dd_su(long ndims, Integer ardims[], Integer pedims[], Integer blk[]);
104 /*---------------------------------------------------------------------------
105 *--
106 *-- Dependencies:
107 *--
108 *-- ddb: ddb_h2, ddb_ex
109 *-- ddb_ex: dd_ev, dd_su
110 *-- ddb_h1: ddb_ap, dd_ev, ddb_ex, dd_lk, dd_su & -lm
111 *-- ddb_h2: dd_ev, ddb_ex, dd_su
112 *--
113 ***************************************************************************/
114
115 #define THRESHOLD -0.1 /* The threshold for switching to an exhaustive search*/
116
117 /************************************************************************
118 *--
119 *-- void ddb is a wrapper ontop of some load balancing heuristics. ddb
120 *-- is called from within GA to compute process grids.
121 *-- The first argument, ndims, is the number of
122 *-- array dimensions. The resulting process grid also has ndims
123 *-- dimensions but some of these can be degenerate.
124 ************************************************************************/
ddb(Integer ndims,Integer ardims[],Integer npes,Integer blk[],Integer pedims[])125 void ddb(Integer ndims, Integer ardims[], Integer npes, Integer blk[],
126 Integer pedims[])
127 {
128 double ddb_threshold = 0.1;
129 long ddb_bias = 0;
130 long i, j;
131 Integer count = 0;
132 Integer *tardim, *tblk, *tpedim;
133 long tp, sp;
134
135 tp = (long)npes;
136
137 /* count how many axes have <don't care> block values.*/
138 for(i=ndims-1;i>=0;i--){
139 if(blk[i]<=0){
140 pedims[i] = -1;
141 count += 1;
142 } else {
143 sp = (long)(ardims[i]+blk[i]-1)/blk[i];
144 if(sp>tp) {
145 sp = tp; tp = 1;
146 pedims[i] = (Integer)sp;
147 } else {
148 for(j=sp;j<tp&&(tp%j!=0);j++);
149 pedims[i] = (Integer)j;
150 tp = tp / j;
151 }
152 }
153 }
154
155 if(count>0){
156 tardim = (Integer *) calloc((size_t)count,sizeof(Integer));
157 if(tardim==NULL) {
158 fprintf(stderr,"ddb: Memory allocation failed\n");
159 for(i=0;i<ndims;i++) pedims[i] = 0;
160 return;
161 }
162 tblk = (Integer *) calloc((size_t)count,sizeof(Integer));
163 if(tblk==NULL) {
164 fprintf(stderr,"ddb: Memory allocation failed\n");
165 for(i=0;i<ndims;i++) pedims[i] = 0;
166 return;
167 }
168 tpedim = (Integer *) calloc((size_t)count,sizeof(Integer));
169 if(tpedim==NULL) {
170 fprintf(stderr,"ddb: Memory allocation failed\n");
171 for(i=0;i<ndims;i++) pedims[i] = 0;
172 return;
173 }
174
175 for(i=0;i<count;i++) tblk[i] = 1;
176 for(i=0,j=0;j<ndims;j++)
177 if(pedims[j]<0) tardim[i++] = ardims[j];
178
179 ddb_h2( count, tardim, (Integer)tp, ddb_threshold, ddb_bias, tblk, tpedim);
180 /* ddb_h1( count, tardim, tp, ddb_threshold, tblk, tpedim); */
181
182 for(i=0,j=0;j<ndims;j++)
183 if(pedims[j]<0){
184 blk[j] = (tardim[i]+tpedim[i]-1)/tpedim[i];
185 i = i+1;
186 }
187
188 for(i=0,j=0;j<ndims;j++)
189 if(pedims[j]<0) pedims[j] = tpedim[i++];
190
191 free( tpedim );
192 free( tblk );
193 free( tardim );
194 } else {
195 for(j=0;j<ndims;j++)
196 if(pedims[j]<0) pedims[j] = 1;
197 }
198
199 }
200 /************************************************************************
201 *--
202 *-- void ddb_ex implements a naive data mapping of a multi-dimensional
203 *-- array across a process Cartesian topology. ndims is the number of
204 *-- array dimensions. The resulting process grid also has ndims
205 *-- dimensions but some of these can be degenerate.
206 *--
207 *-- Heuristic: Let d be the number of dimensions of the data array.
208 *-- Return that assignment p1, ..., pd that minimizes the communication
209 *-- volume measure among those that maximizes the load balancing ratio
210 *-- computed by dd_ev.
211 *-- The communication volume measure is the sum of
212 *-- all monomials (ni/pi) of degree d-1.
213 *--
214 *-- ddb_ex returns as soon as it has found a process distribution whose
215 *-- load balance ratio is at least as large as the value of threshold.
216 *--
217 *-- This procedure allocates storage for 3*ndims+npes integers.
218 *--
219 ************************************************************************/
ddb_ex(long ndims,Integer ardims[],long npes,double threshold,Integer blk[],Integer pedims[])220 void ddb_ex( long ndims, Integer ardims[], long npes, double threshold,
221 Integer blk[], Integer pedims[])
222 {
223 Integer *tdims;
224 long *pdivs;
225 long npdivs;
226 long i, j, k;
227 long bev;
228 long pc, done;
229 long *stack;
230 Integer *tard;
231 long r, cev;
232 double clb, blb;
233
234 /*- Quick exit -*/
235 if(ndims==1) {
236 pedims[0] = npes;
237 blb = dd_ev(ndims,ardims,pedims);
238 dd_su(1,ardims,pedims,blk);
239 return;
240 }
241
242 /*- Reset array dimensions to reflect granularity -*/
243 tard = (Integer *) calloc((size_t)ndims,sizeof(Integer));
244 if(tard==NULL) {
245 fprintf(stderr,"ddb_ex: Memory allocation failed\n");
246 for(i=0;i<ndims;i++) blk[i] = 0;
247 return;
248 }
249 for(i=0;i<ndims;i++) if (blk[i]<1) blk[i] = 1;
250 for(i=0;i<ndims;i++) tard[i] = ardims[i] / blk[i];
251 for(i=0;i<ndims;i++) if (tard[i]<1) {
252 tard[i] = 1; blk[i] = ardims[i]; }
253
254 /*- Allocate memory for current solution -*/
255 tdims = (Integer *) calloc((size_t)ndims,sizeof(Integer));
256 if(tdims==NULL) {
257 fprintf(stderr,"ddb_ex: Memory allocation failed\n");
258 for(i=0;i<ndims;i++) blk[i] = 0;
259 return;
260 }
261
262 /*- Allocate memory to hold divisors of npes -*/
263 npdivs = 1;
264 for(i=2;i<=npes;i++) if(npes%i==0) npdivs += 1;
265 pdivs = (long *) calloc((size_t)npdivs,sizeof(long));
266 if(pdivs==NULL) {
267 fprintf(stderr,"ddb_ex: Memory allocation failed\n");
268 for(i=0;i<ndims;i++) blk[i] = 0;
269 free(tard);
270 return;
271 }
272
273 /*- Allocate storage for the recursion stack -*/
274 stack = (long *) calloc((size_t)ndims,sizeof(long));
275 if(stack==NULL){
276 fprintf(stderr,"%s: %s\n","ddb_ex",
277 "memory allocation failed");
278 for(i=0;i<ndims;i++) blk[i] = 0;
279 free(tard);
280 free(pdivs);
281 return;
282 }
283
284 /*- Find all divisors of npes -*/
285 for(j=0,i=1;i<=npes;i++) if(npes%i==0) pdivs[j++] = i;
286
287 /*- Pump priming the exhaustive search -*/
288 blb = -1.0;
289 bev = 1.0;
290 for(i=0;i<ndims;i++) bev *= tard[i];
291 pedims[0]=npes; for(i=1;i<ndims;i++) pedims[i] = 1;
292 tdims[0] = 0;
293 stack[0] = npes;
294 pc = 0;
295 done = 0;
296
297 /*- Recursion loop -*/
298 do {
299 if(pc==ndims-1) {
300 /*- Set the number of processes for the last dimension -*/
301 tdims[pc] = stack[pc];
302
303 /*- Evaluate current solution -*/
304 clb = dd_ev(ndims,tard,tdims);
305 cev = 0;
306 for(k=0; k<ndims; k++){
307 r = 1;
308 for(j=0; j<ndims; j++) {
309 if(j!=k) r = r*(tard[j]/tdims[j]);
310 }
311 cev = cev+r;
312 }
313 if(clb>blb || (clb==blb && cev<bev)) {
314 for(j=0; j<ndims; j++) pedims[j] = tdims[j];
315 blb = clb;
316 bev = cev;
317 }
318 if(blb>threshold) break;
319 tdims[pc] = 0;
320 pc -= 1;
321 } else {
322 if( tdims[pc] == stack[pc] ) {
323 /*- Backtrack when current array dimension has exhausted
324 *- all remaining processes
325 */
326 done = (pc==0);
327 tdims[pc] = 0;
328 pc -= 1;
329 } else {
330 /*- Increment the number of processes assigned to the current
331 *- array axis.
332 */
333 for(tdims[pc]+=1; stack[pc]%tdims[pc]!=0; tdims[pc]+=1);
334 pc += 1;
335 stack[pc] = npes;
336 for(i=0;i<pc;i++) stack[pc] /= tdims[i];
337 tdims[pc] = 0;
338 }
339 }
340 } while(!done);
341
342 dd_su(ndims,ardims,pedims,blk);
343
344 free(tard);
345 free(stack);
346 free(tdims);
347 free(pdivs);
348 }
349 /************************************************************************
350 *--
351 *-- void ddb_h1 estimates a good load balancing scheme
352 *-- for a Cartesian process topology given the extents
353 *-- along all dimensions of an array. It does so by solving
354 *-- exactly the continous problem and then finding a 'nearby'
355 *-- approximation to the solution of the discrete problem.
356 *--
357 *-- If the value of objective function attained with this heuristic
358 *-- is less than threshold then an exhaustive search is performed.
359 *--
360 *-- This procedure allocates storage for ndims longs and npes doubles and
361 *-- may call ddb_ex.
362 *--
363 ************************************************************************/
ddb_h1(long ndims,Integer ardims[],long npes,double threshold,Integer blk[],Integer pedims[])364 void ddb_h1(long ndims, Integer ardims[], long npes, double threshold,
365 Integer blk[], Integer pedims[])
366 {
367 long h, i, j, k;
368 double * qedims;
369 long * pdivs;
370 Integer * apdims;
371 Integer *tard;
372 long npdivs;
373 double t, q;
374 double cb, ub, blb;
375 if(ndims==1) {
376 pedims[0] = npes;
377 blb = dd_ev(ndims,ardims,pedims);
378 dd_su(ndims,ardims,pedims,blk);
379 return;
380 }
381
382 /*- Allocate memory to store the granularity -*/
383 tard = (Integer *) calloc((size_t)ndims,sizeof(Integer));
384 if(tard==NULL){
385 fprintf(stderr,"%s: %s\n","ddb_h1",
386 "memory allocation failed");
387 for(i=0;i<ndims;i++) blk[i] = 0;
388 return;
389 }
390 /*- Reset array dimensions to reflect granularity -*/
391 for(i=0;i<ndims;i++) if (blk[i]<1) blk[i] = 1;
392 for(i=0;i<ndims;i++) tard[i] = ardims[i] / blk[i];
393 for(i=0;i<ndims;i++) if (tard[i]<1) {
394 tard[i] = 1; blk[i] = ardims[i];}
395
396 /*- First solve the load balancing problem exactly in
397 *- floating point arithmetic -*/
398 qedims = (double *) calloc((size_t)ndims,sizeof(double));
399 if(qedims==NULL){
400 fprintf(stderr,"%s: %s\n","ddb_h1",
401 "memory allocation failed");
402 for(i=0;i<ndims;i++) blk[i] = 0;
403 free(tard);
404 return;
405 }
406
407 qedims[0] = (double) npes;
408 for(i=1;i<ndims;i++) qedims[0] /= (tard[i]/(double)tard[0]);
409 qedims[0] = pow(qedims[0],1.0/ndims);
410
411 for(i=1;i<ndims;i++){
412 qedims[i] = (tard[i]/(double)tard[0])*qedims[0];
413 };
414
415 /*- Set up the search for a integer approximation the floating point solution -*/
416 npdivs = 1;
417 for(i=2;i<=npes;i++) if(npes%i==0) npdivs += 1;
418 pdivs = (long *) calloc((size_t)npdivs,sizeof(long));
419 if(pdivs==NULL){
420 fprintf(stderr,"%s: %s\n","split",
421 "memory allocation failed");
422 for(i=0;i<ndims;i++) blk[i] = 0;
423 free(tard);
424 free(qedims);
425 return;
426 }
427 /*- Compute the discrete approximation -*/
428 for(j=0,i=1;i<=npes;i++) if(npes%i==0) pdivs[j++] = i;
429 ddb_ap(ndims,qedims,tard,pedims,npes,npdivs,pdivs) ;
430 free(qedims);
431 free(pdivs);
432
433
434 /*- Lookout for a permutation of the solution vector
435 *- that would improve the initial solution -*/
436 apdims = (Integer *) calloc((size_t)ndims,sizeof(Integer));
437 if(apdims==NULL){
438 fprintf(stderr,"%s: %s\n","split",
439 "memory allocation failed");
440 for(i=0;i<ndims;i++) blk[i] = 0;
441 free(tard);
442 return;
443 }
444
445 ub = dd_ev(ndims,tard,pedims);
446 for(k=0;k<ndims;k++) apdims[k] = pedims[k];
447
448 do {
449 for(k=0;k<ndims;k++) pedims[k] = apdims[k];
450 blb = ub;
451
452 /*- Find the worst distributed dimension -*/
453 h = 0;
454 q = (tard[0]<pedims[0]) ? pedims[0] : tard[0]%pedims[0];
455 q /= (double)pedims[0];
456 for(k=1;k<ndims;k++){
457 t = (tard[k]<pedims[k]) ? pedims[k] : tard[k]%pedims[k];
458 t /= (double) pedims[k];
459 if(t>q) {
460 h = k; q = t;
461 }
462 }
463
464 /*- Swap elements of apdims to improve load balance */
465 j = h;
466 for(k=0;k<ndims;k++){
467 if(k==h) continue;
468 i = apdims[h]; apdims[h] = apdims[k]; apdims[k] = i;
469 cb = dd_ev(ndims,tard,apdims);
470 if(cb>ub) {
471 j = k;
472 ub = cb;
473 }
474 i = apdims[h]; apdims[h] = apdims[k]; apdims[k] = i;
475 }
476 if(j!=h){
477 i = apdims[h]; apdims[h] = apdims[j]; apdims[j] = i;
478 }
479 } while (ub > blb);
480
481 for(i=0;i<ndims;i++) pedims[i] = apdims[i];
482 blb = ub;
483 free(apdims);
484
485 /*- Do an exhaustive search is the heuristic returns a solution
486 *- whose load balance ratio is less than the given threshold. -*/
487 if(blb<threshold) ddb_ex(ndims,tard,npes,threshold,blk,pedims);
488
489 free(tard);
490
491 dd_su(ndims,ardims,pedims,blk);
492
493 return;
494 }
495
496 /*---------------------------------------------------------------------------
497 *--
498 *-- void ddb_ap find an integer vector that is near in some sense
499 *-- to a real valued vector
500 *--
501 *---------------------------------------------------------------------------*/
ddb_ap(long ndims,double * qedims,Integer * ardims,Integer * pedims,long npes,long npdivs,long * pdivs)502 void ddb_ap(long ndims, double * qedims, Integer * ardims, Integer * pedims,
503 long npes, long npdivs, long * pdivs)
504 {
505 long bq;
506 long i, k, g;
507 long idim;
508 for(idim=0;idim<ndims-1;idim++){
509 g = dd_lk(pdivs,npdivs,qedims[idim]);
510 bq = pdivs[g] ;
511 pedims[idim] = bq;
512 npes /= bq;
513 npes = (npes<1) ? 1 : npes;
514 if(idim<ndims-2){
515 qedims[idim+1] = (double) npes;
516 for(i=idim+2;i<ndims;i++) qedims[idim+1] /=
517 (ardims[i]/(double)ardims[idim+1]);
518 qedims[idim+1] = (double) pow(qedims[idim+1],
519 1.0/(double)(ndims-1-idim));
520 for(i=idim+2;i<ndims;i++){
521 qedims[i] = (ardims[i]/(double)ardims[idim+1])*
522 qedims[idim+1];
523 }
524
525 if(bq>1) {
526 for(k=1,i=g+1;i<npdivs;i++){
527 if(pdivs[i]%bq==0) {
528 pdivs[k] = pdivs[i]/bq;
529 k += 1;
530 }
531 }
532 npdivs = k;
533 }
534 }
535 }
536 pedims[ndims-1] = npes;
537 }
538 /*---------------------------------------------------------------------------
539 *--
540 *-- long dd_lk find nearest match to key
541 *--
542 *---------------------------------------------------------------------------*/
dd_lk(long * prt,long n,double key)543 long dd_lk(long *prt, long n, double key)
544 {
545 long lw, md, hgh;
546 double km,kz;
547 long h, i;
548 long k;
549 double u, v;
550
551 if(n==1) return 0 ;
552 if(n<=5) {
553 k = 0 ;
554 km = key-prt[0]; if(km<0.0) km = -km;
555 for(i=1;i<n;i++) {
556 kz = key-prt[i]; if(kz<0.0) kz = -kz;
557 if(kz<km) {
558 km = kz ;
559 k = i ;
560 }
561 }
562 return k ;
563 }
564 lw = 0; hgh=n-1;
565
566 if(lw==hgh) {
567 return lw;
568 }
569 do {
570 md = (lw+hgh)/2;
571 if(key>prt[md]) {
572 lw = md+1;
573 } else {
574 hgh = md;
575 }
576 } while (lw<hgh);
577 kz = prt[lw];
578 u = key-prt[lw]; if(u<0.0) u = -u;
579 h = lw;
580 if(lw>0) {
581 v = key-prt[lw-1]; if(v<0.0) v = -v;
582 if(v<u) {
583 u = v;
584 h = lw-1;
585 }
586 }
587 if(lw<n-1) {
588 v = key-prt[lw+1]; if(v<0.0) v = -v;
589 if(v<u) {
590 u = v;
591 h = lw+1;
592 }
593 }
594 return h ;
595 }
596 /************************************************************************
597 *--
598 *-- void ddb_h2 lists all prime divisors of the number of
599 *-- processes and distribute these among the array dimensions.
600 *-- If the value of objective function attained with this heuristic
601 *-- is less than threshold then an exhaustive search is performed.
602 *-- The argument bias directs the search of ddb_h2. When bias is
603 *-- positive the rightmost axes of the data array are preferentially
604 *-- distributed, similarly when bias is negative. When bias is zero
605 *-- the heuristic attempts to deal processes equally among the axes
606 *-- of the data array.
607 *--
608 *-- ddb_h2 allocates storage for ndims+npes integers and may call ddb_ex.
609 *--
610 ************************************************************************/
ddb_h2(Integer ndims,Integer ardims[],Integer npes,double threshold,Integer bias,Integer blk[],Integer pedims[])611 void ddb_h2(Integer ndims, Integer ardims[], Integer npes, double threshold, Integer bias,
612 Integer blk[], Integer pedims[])
613 {
614 long h, i, j, k;
615 Integer *tard;
616 long * pdivs;
617 long npdivs;
618 long p0;
619 double q, w;
620 double ub;
621 long istart, istep, ilook;
622
623 /*- Allocate memory to store the granularity -*/
624 tard = (Integer *) calloc((size_t)ndims,sizeof(Integer));
625 if(tard==NULL){
626 fprintf(stderr,"%s: %s\n","ddb_h2",
627 "memory allocation failed");
628 for(i=0;i<ndims;i++) blk[i] = 0;
629 return;
630 }
631 /*- Reset array dimensions to reflect granularity -*/
632 for(i=0;i<ndims;i++) if (blk[i]<1) blk[i] = 1;
633
634 for(i=0;i<ndims;i++) tard[i] = (ardims[i]+ blk[i]-1)/blk[i]; /* JM */
635 for(i=0;i<ndims;i++) if (tard[i]<1) {
636 tard[i] = 1; blk[i] = ardims[i];
637 }
638
639 /*- Allocate storage to old all divisors of npes -*/
640 npdivs = 1;
641 for(i=2;i<=npes;i++) if(npes%i==0) npdivs += 1;
642 pdivs = (long *) calloc((size_t)npdivs,sizeof(long));
643 if(pdivs==NULL){
644 fprintf(stderr,"%s: %s\n","split",
645 "memory allocation failed");
646 for(i=0;i<ndims;i++) blk[i] = 0;
647 free(tard);
648 return;
649 }
650 /*- Find all divisors of npes -*/
651 for(j=0,i=1;i<=npes;i++) if(npes%i==0) pdivs[j++] = i;
652
653 /*- Find all prime divisors of npes (with repetitions) -*/
654 if(npdivs>1) {
655 k = 1;
656 do {
657 h = k+1;
658 for(j=h;j<npdivs;j++)
659 if(pdivs[j]%pdivs[k]==0)
660 pdivs[h++] = pdivs[j]/pdivs[k];
661 npdivs = h;
662 k = k+1;
663 } while(k<npdivs);
664 }
665
666 /*- Set istart and istep -*/
667 istep = 1;
668 istart = 0;
669 if(bias>0) {
670 istep = -1;
671 istart = ndims-1;
672 }
673 /*- Set pedims -*/
674 for(j=0;j<ndims;j++) pedims[j] = 1.0;
675 for(k=npdivs-1;k>=1;k--){
676 p0 = pdivs[k]; h = istart;
677 q = (tard[istart]<p0*pedims[istart]) ? 1.1 :
678 (tard[istart]%(p0*pedims[istart]))/(double) tard[istart];
679 for(j=1;j<ndims;j++){
680 ilook = (istart+istep*j)%ndims;
681 w = (tard[ilook]<p0*pedims[ilook]) ? 1.1 :
682 (tard[ilook]%(p0*pedims[ilook]))/(double) tard[ilook];
683 if(w<q) { q = w; h = ilook; }
684 }
685 pedims[h] *= p0;
686 if(bias==0) istart = (istart+1)%ndims;
687 }
688 free(pdivs);
689
690 ub = dd_ev(ndims,tard,pedims);
691
692 /*- Do an exhaustive search is the heuristic returns a solution
693 *- whose load balance ratio is less than the given threshold. -*/
694 if(ub<threshold) {
695 ddb_ex(ndims, tard, npes, threshold, blk, pedims);
696 }
697
698 dd_su(ndims,ardims,pedims,blk);
699
700 for(i=0;i<ndims;i++)
701 if(pedims[i]>0){
702 blk[i] = (tard[i]+pedims[i]-1)/pedims[i];
703 } else {
704 pnga_error("process dimension is zero: ddb_h2",0);
705 }
706
707 free(tard);
708 return;
709 }
710 /****************************************************************************
711 *--
712 *-- double dd_ev evaluates the load balancing ratio as follows:
713 *--
714 *-- Let n1, n2 ... nd be the extents of the array dimensions
715 *-- Let p1, p2 ... pd be the numbers of processes across
716 *-- the corresponding dimensions of the process grid
717 *-- Let there be npes processes available
718 *--
719 *-- Load balancing measure = (n1/p1)*...*(nd/pd)*npes
720 *-- ------------------------
721 *-- n1*n2*...*nd
722 *-- The communication volume measure is the sum of
723 *-- all monomials (ni/pi) of degree d-1.
724 *--
725 ****************************************************************************/
dd_ev(long ndims,Integer ardims[],Integer pedims[])726 double dd_ev(long ndims,Integer ardims[], Integer pedims[])
727 {
728 double q, t;
729 long k;
730 q = 1.0;
731 t = 1.0;
732 for(k=0;k<ndims;k++){
733 q = (ardims[k]/pedims[k])*pedims[k];
734 t = t*(q/(double)ardims[k]);
735 }
736 return t;
737 }
738 /****************************************************************************
739 *--
740 *-- void dd_su computes the extents of the local block corresponding
741 *-- to the element with least global indices, e.g. A[0][0][0].
742 *--
743 ****************************************************************************/
dd_su(long ndims,Integer ardims[],Integer pedims[],Integer blk[])744 void dd_su(long ndims, Integer ardims[], Integer pedims[], Integer blk[])
745 {
746 long i;
747
748 for(i=0;i<ndims;i++) {
749 blk[i] = ardims[i]/pedims[i];
750 if(blk[i]<1) blk[i] = 1;
751 }
752 }
753 /****************************************************************************
754 *---
755 *--- The End
756 *---
757 ****************************************************************************/
758
759