1 /*
2 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2015 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2015 Research Organization for Information Science
13 * and Technology (RIST). All rights reserved.
14 * $COPYRIGHT$
15 *
16 * Additional copyrights may follow
17 *
18 * $HEADER$
19 */
20
21 #include "ompi_config.h"
22
23 #include "mpi.h"
24 #include "opal/util/bit_ops.h"
25 #include "ompi/constants.h"
26 #include "ompi/communicator/communicator.h"
27 #include "ompi/mca/coll/base/coll_tags.h"
28 #include "ompi/mca/coll/base/coll_base_functions.h"
29 #include "coll_base_topo.h"
30
31 /*
32 * Some static helpers.
33 */
pown(int fanout,int num)34 static int pown( int fanout, int num )
35 {
36 int j, p = 1;
37 if( num < 0 ) return 0;
38 if (1==num) return fanout;
39 if (2==fanout) {
40 return p<<num;
41 }
42 else {
43 for( j = 0; j < num; j++ ) { p*= fanout; }
44 }
45 return p;
46 }
47
calculate_level(int fanout,int rank)48 static int calculate_level( int fanout, int rank )
49 {
50 int level, num;
51 if( rank < 0 ) return -1;
52 for( level = 0, num = 0; num <= rank; level++ ) {
53 num += pown(fanout, level);
54 }
55 return level-1;
56 }
57
calculate_num_nodes_up_to_level(int fanout,int level)58 static int calculate_num_nodes_up_to_level( int fanout, int level )
59 {
60 /* just use geometric progression formula for sum:
61 a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
62 return ((pown(fanout,level) - 1)/(fanout - 1));
63 }
64
65 /*
66 * And now the building functions.
67 *
68 * An example for fanout = 2, comm_size = 7
69 *
70 * 0 <-- delta = 1 (fanout^0)
71 * / \
72 * 1 2 <-- delta = 2 (fanout^1)
73 * / \ / \
74 * 3 5 4 6 <-- delta = 4 (fanout^2)
75 */
76
77 ompi_coll_tree_t*
ompi_coll_base_topo_build_tree(int fanout,struct ompi_communicator_t * comm,int root)78 ompi_coll_base_topo_build_tree( int fanout,
79 struct ompi_communicator_t* comm,
80 int root )
81 {
82 int rank, size, schild, sparent, shiftedrank, i;
83 int level; /* location of my rank in the tree structure of size */
84 int delta; /* number of nodes on my level */
85 int slimit; /* total number of nodes on levels above me */
86 ompi_coll_tree_t* tree;
87
88 OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root));
89
90 if (fanout<1) {
91 OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout));
92 return NULL;
93 }
94 if (fanout>MAXTREEFANOUT) {
95 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
96 return NULL;
97 }
98
99 /*
100 * Get size and rank of the process in this communicator
101 */
102 size = ompi_comm_size(comm);
103 rank = ompi_comm_rank(comm);
104
105 tree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
106 if (!tree) {
107 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory"));
108 return NULL;
109 }
110
111 tree->tree_root = MPI_UNDEFINED;
112 tree->tree_nextsize = MPI_UNDEFINED;
113
114 /*
115 * Set root
116 */
117 tree->tree_root = root;
118
119 /*
120 * Initialize tree
121 */
122 tree->tree_fanout = fanout;
123 tree->tree_bmtree = 0;
124 tree->tree_root = root;
125 tree->tree_prev = -1;
126 tree->tree_nextsize = 0;
127 for( i = 0; i < fanout; i++ ) {
128 tree->tree_next[i] = -1;
129 }
130
131 /* return if we have less than 2 processes */
132 if( size < 2 ) {
133 return tree;
134 }
135
136 /*
137 * Shift all ranks by root, so that the algorithm can be
138 * designed as if root would be always 0
139 * shiftedrank should be used in calculating distances
140 * and position in tree
141 */
142 shiftedrank = rank - root;
143 if( shiftedrank < 0 ) {
144 shiftedrank += size;
145 }
146
147 /* calculate my level */
148 level = calculate_level( fanout, shiftedrank );
149 delta = pown( fanout, level );
150
151 /* find my children */
152 for( i = 0; i < fanout; i++ ) {
153 schild = shiftedrank + delta * (i+1);
154 if( schild < size ) {
155 tree->tree_next[i] = (schild+root)%size;
156 tree->tree_nextsize = tree->tree_nextsize + 1;
157 } else {
158 break;
159 }
160 }
161
162 /* find my parent */
163 slimit = calculate_num_nodes_up_to_level( fanout, level );
164 sparent = shiftedrank;
165 if( sparent < fanout ) {
166 sparent = 0;
167 } else {
168 while( sparent >= slimit ) {
169 sparent -= delta/fanout;
170 }
171 }
172 tree->tree_prev = (sparent+root)%size;
173
174 return tree;
175 }
176
177 /*
178 * Constructs in-order binary tree which can be used for non-commutative reduce
179 * operations.
180 * Root of this tree is always rank (size-1) and fanout is 2.
181 * Here are some of the examples of this tree:
182 * size == 2 size == 3 size == 4 size == 9
183 * 1 2 3 8
184 * / / \ / \ / \
185 * 0 1 0 2 1 7 3
186 * / / \ / \
187 * 0 6 5 2 1
188 * / /
189 * 4 0
190 */
191 ompi_coll_tree_t*
ompi_coll_base_topo_build_in_order_bintree(struct ompi_communicator_t * comm)192 ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
193 {
194 int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
195 ompi_coll_tree_t* tree;
196
197 /*
198 * Get size and rank of the process in this communicator
199 */
200 size = ompi_comm_size(comm);
201 rank = ompi_comm_rank(comm);
202
203 tree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
204 if (!tree) {
205 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
206 "coll:base:topo_build_tree PANIC::out of memory"));
207 return NULL;
208 }
209
210 tree->tree_root = MPI_UNDEFINED;
211 tree->tree_nextsize = MPI_UNDEFINED;
212
213 /*
214 * Initialize tree
215 */
216 tree->tree_fanout = 2;
217 tree->tree_bmtree = 0;
218 tree->tree_root = size - 1;
219 tree->tree_prev = -1;
220 tree->tree_nextsize = 0;
221 tree->tree_next[0] = -1;
222 tree->tree_next[1] = -1;
223 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
224 "coll:base:topo_build_in_order_tree Building fo %d rt %d",
225 tree->tree_fanout, tree->tree_root));
226
227 /*
228 * Build the tree
229 */
230 myrank = rank;
231 parent = size - 1;
232 delta = 0;
233
234 while ( 1 ) {
235 /* Compute the size of the right subtree */
236 rightsize = size >> 1;
237
238 /* Determine the left and right child of this parent */
239 lchild = -1;
240 rchild = -1;
241 if (size - 1 > 0) {
242 lchild = parent - 1;
243 if (lchild > 0) {
244 rchild = rightsize - 1;
245 }
246 }
247
248 /* The following cases are possible: myrank can be
249 - a parent,
250 - belong to the left subtree, or
251 - belong to the right subtee
252 Each of the cases need to be handled differently.
253 */
254
255 if (myrank == parent) {
256 /* I am the parent:
257 - compute real ranks of my children, and exit the loop. */
258 if (lchild >= 0) tree->tree_next[0] = lchild + delta;
259 if (rchild >= 0) tree->tree_next[1] = rchild + delta;
260 break;
261 }
262 if (myrank > rchild) {
263 /* I belong to the left subtree:
264 - If I am the left child, compute real rank of my parent
265 - Iterate down through tree:
266 compute new size, shift ranks down, and update delta.
267 */
268 if (myrank == lchild) {
269 tree->tree_prev = parent + delta;
270 }
271 size = size - rightsize - 1;
272 delta = delta + rightsize;
273 myrank = myrank - rightsize;
274 parent = size - 1;
275
276 } else {
277 /* I belong to the right subtree:
278 - If I am the right child, compute real rank of my parent
279 - Iterate down through tree:
280 compute new size and parent,
281 but the delta and rank do not need to change.
282 */
283 if (myrank == rchild) {
284 tree->tree_prev = parent + delta;
285 }
286 size = rightsize;
287 parent = rchild;
288 }
289 }
290
291 if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
292 if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
293
294 return tree;
295 }
296
ompi_coll_base_topo_destroy_tree(ompi_coll_tree_t ** tree)297 int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree )
298 {
299 ompi_coll_tree_t *ptr;
300
301 if ((!tree)||(!*tree)) {
302 return OMPI_SUCCESS;
303 }
304
305 ptr = *tree;
306
307 free (ptr);
308 *tree = NULL; /* mark tree as gone */
309
310 return OMPI_SUCCESS;
311 }
312
313 /*
314 *
315 * Here are some of the examples of this tree:
316 * size == 2 size = 4 size = 8
317 * 0 0 0
318 * / | \ / | \
319 * 1 2 1 4 2 1
320 * | | |\
321 * 3 6 5 3
322 * |
323 * 7
324 */
325 ompi_coll_tree_t*
ompi_coll_base_topo_build_bmtree(struct ompi_communicator_t * comm,int root)326 ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
327 int root )
328 {
329 int childs = 0, rank, size, mask = 1, index, remote, i;
330 ompi_coll_tree_t *bmtree;
331
332 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root));
333
334 /*
335 * Get size and rank of the process in this communicator
336 */
337 size = ompi_comm_size(comm);
338 rank = ompi_comm_rank(comm);
339
340 index = rank -root;
341
342 bmtree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
343 if (!bmtree) {
344 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
345 return NULL;
346 }
347
348 bmtree->tree_bmtree = 1;
349
350 bmtree->tree_root = MPI_UNDEFINED;
351 bmtree->tree_nextsize = MPI_UNDEFINED;
352 for( i = 0;i < MAXTREEFANOUT; i++ ) {
353 bmtree->tree_next[i] = -1;
354 }
355
356 if( index < 0 ) index += size;
357
358 mask = opal_next_poweroftwo(index);
359
360 /* Now I can compute my father rank */
361 if( root == rank ) {
362 bmtree->tree_prev = root;
363 } else {
364 remote = (index ^ (mask >> 1)) + root;
365 if( remote >= size ) remote -= size;
366 bmtree->tree_prev = remote;
367 }
368 /* And now let's fill my childs */
369 while( mask < size ) {
370 remote = (index ^ mask);
371 if( remote >= size ) break;
372 remote += root;
373 if( remote >= size ) remote -= size;
374 if (childs==MAXTREEFANOUT) {
375 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
376 free(bmtree);
377 return NULL;
378 }
379 bmtree->tree_next[childs] = remote;
380 mask <<= 1;
381 childs++;
382 }
383 bmtree->tree_nextsize = childs;
384 bmtree->tree_root = root;
385 return bmtree;
386 }
387
388 /*
389 * Constructs in-order binomial tree which can be used for gather/scatter
390 * operations.
391 *
392 * Here are some of the examples of this tree:
393 * size == 2 size = 4 size = 8
394 * 0 0 0
395 * / / | / | \
396 * 1 1 2 1 2 4
397 * | | | \
398 * 3 3 5 6
399 * |
400 * 7
401 */
402 ompi_coll_tree_t*
ompi_coll_base_topo_build_in_order_bmtree(struct ompi_communicator_t * comm,int root)403 ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
404 int root )
405 {
406 int childs = 0, rank, vrank, size, mask = 1, remote, i;
407 ompi_coll_tree_t *bmtree;
408
409 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root));
410
411 /*
412 * Get size and rank of the process in this communicator
413 */
414 size = ompi_comm_size(comm);
415 rank = ompi_comm_rank(comm);
416
417 vrank = (rank - root + size) % size;
418
419 bmtree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
420 if (!bmtree) {
421 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
422 return NULL;
423 }
424
425 bmtree->tree_bmtree = 1;
426 bmtree->tree_root = MPI_UNDEFINED;
427 bmtree->tree_nextsize = MPI_UNDEFINED;
428 for(i=0;i<MAXTREEFANOUT;i++) {
429 bmtree->tree_next[i] = -1;
430 }
431
432 if (root == rank) {
433 bmtree->tree_prev = root;
434 }
435
436 while (mask < size) {
437 remote = vrank ^ mask;
438 if (remote < vrank) {
439 bmtree->tree_prev = (remote + root) % size;
440 break;
441 } else if (remote < size) {
442 bmtree->tree_next[childs] = (remote + root) % size;
443 childs++;
444 if (childs==MAXTREEFANOUT) {
445 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
446 "coll:base:topo:build_bmtree max fanout incorrect %d needed %d",
447 MAXTREEFANOUT, childs));
448 free(bmtree);
449 return NULL;
450 }
451 }
452 mask <<= 1;
453 }
454 bmtree->tree_nextsize = childs;
455 bmtree->tree_root = root;
456
457 return bmtree;
458 }
459
460 /*
461 * ompi_coll_base_topo_build_kmtree: Build k-nomial tree for Bcast
462 *
463 * Example, comm_size=10
464 * radix=2 radix=3 radix=4
465 * 0 0 0
466 * / / \ \ / / | \ \ / / \ \ \
467 * 8 4 2 1 9 3 6 1 2 4 8 1 2 3
468 * | |\ | |\ |\ /|\ |
469 * 9 6 5 3 4 5 7 8 5 6 7 9
470 * |
471 * 7
472 */
473 ompi_coll_tree_t*
ompi_coll_base_topo_build_kmtree(struct ompi_communicator_t * comm,int root,int radix)474 ompi_coll_base_topo_build_kmtree(struct ompi_communicator_t* comm,
475 int root, int radix)
476 {
477 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
478 "coll:base:topo:build_kmtree root %d, radix %d", root, radix));
479 int comm_size = ompi_comm_size(comm);
480 int rank = ompi_comm_rank(comm);
481
482 /* nchilds <= (radix - 1) * \ceil(\log_{radix}(comm_size)) */
483 int log_radix = 0;
484 for (int i = 1; i < comm_size; i *= radix)
485 log_radix++;
486 int nchilds_max = (radix - 1) * log_radix;
487
488 int vrank = (rank - root + comm_size) % comm_size;
489 ompi_coll_tree_t *kmtree = malloc(COLL_TREE_SIZE(nchilds_max));
490 if (NULL == kmtree) {
491 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
492 "coll:base:topo:build_kmtree PANIC out of memory"));
493 return NULL;
494 }
495
496 kmtree->tree_bmtree = 0;
497 kmtree->tree_root = root;
498 kmtree->tree_prev = MPI_PROC_NULL;
499 kmtree->tree_nextsize = 0;
500
501 /* Setup parent */
502 int mask = 0x1;
503 while (mask < comm_size) {
504 if (vrank % (radix * mask)) {
505 kmtree->tree_prev = vrank / (radix * mask) * (radix * mask);
506 kmtree->tree_prev = (kmtree->tree_prev + root) % comm_size;
507 break;
508 }
509 mask *= radix;
510 }
511
512 /* Setup childs */
513 mask /= radix;
514 int nchilds = 0;
515 while (mask > 0) {
516 for (int r = 1; r < radix; r++) {
517 int child = vrank + mask * r;
518 if (child < comm_size) {
519 child = (child + root) % comm_size;
520 kmtree->tree_next[nchilds] = child;
521 nchilds++;
522 }
523 }
524 mask /= radix;
525 }
526 kmtree->tree_nextsize = nchilds;
527 return kmtree;
528 }
529
530 ompi_coll_tree_t*
ompi_coll_base_topo_build_chain(int fanout,struct ompi_communicator_t * comm,int root)531 ompi_coll_base_topo_build_chain( int fanout,
532 struct ompi_communicator_t* comm,
533 int root )
534 {
535 int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
536 ompi_coll_tree_t *chain;
537
538 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root));
539
540 /*
541 * Get size and rank of the process in this communicator
542 */
543 size = ompi_comm_size(comm);
544 rank = ompi_comm_rank(comm);
545
546 if( fanout < 1 ) {
547 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
548 fanout = 1;
549 }
550 if (fanout>MAXTREEFANOUT) {
551 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
552 fanout = MAXTREEFANOUT;
553 }
554
555 /*
556 * Allocate space for topology arrays if needed
557 */
558 chain = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
559 if (!chain) {
560 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory"));
561 fflush(stdout);
562 return NULL;
563 }
564 chain->tree_root = MPI_UNDEFINED;
565 chain->tree_nextsize = -1;
566 for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
567
568 /*
569 * Set root & numchain
570 */
571 chain->tree_root = root;
572 if( (size - 1) < fanout ) {
573 chain->tree_nextsize = size-1;
574 fanout = size-1;
575 } else {
576 chain->tree_nextsize = fanout;
577 }
578
579 /*
580 * Shift ranks
581 */
582 srank = rank - root;
583 if (srank < 0) srank += size;
584
585 /*
586 * Special case - fanout == 1
587 */
588 if( fanout == 1 ) {
589 if( srank == 0 ) chain->tree_prev = -1;
590 else chain->tree_prev = (srank-1+root)%size;
591
592 if( (srank + 1) >= size) {
593 chain->tree_next[0] = -1;
594 chain->tree_nextsize = 0;
595 } else {
596 chain->tree_next[0] = (srank+1+root)%size;
597 chain->tree_nextsize = 1;
598 }
599 return chain;
600 }
601
602 /* Let's handle the case where there is just one node in the communicator */
603 if( size == 1 ) {
604 chain->tree_next[0] = -1;
605 chain->tree_nextsize = 0;
606 chain->tree_prev = -1;
607 return chain;
608 }
609 /*
610 * Calculate maximum chain length
611 */
612 maxchainlen = (size-1) / fanout;
613 if( (size-1) % fanout != 0 ) {
614 maxchainlen++;
615 mark = (size-1)%fanout;
616 } else {
617 mark = fanout+1;
618 }
619
620 /*
621 * Find your own place in the list of shifted ranks
622 */
623 if( srank != 0 ) {
624 int column;
625 if( srank-1 < (mark * maxchainlen) ) {
626 column = (srank-1)/maxchainlen;
627 head = 1+column*maxchainlen;
628 len = maxchainlen;
629 } else {
630 column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
631 head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
632 len = maxchainlen-1;
633 }
634
635 if( srank == head ) {
636 chain->tree_prev = 0; /*root*/
637 } else {
638 chain->tree_prev = srank-1; /* rank -1 */
639 }
640 if( srank == (head + len - 1) ) {
641 chain->tree_next[0] = -1;
642 chain->tree_nextsize = 0;
643 } else {
644 if( (srank + 1) < size ) {
645 chain->tree_next[0] = srank+1;
646 chain->tree_nextsize = 1;
647 } else {
648 chain->tree_next[0] = -1;
649 chain->tree_nextsize = 0;
650 }
651 }
652 chain->tree_prev = (chain->tree_prev+root)%size;
653 if( chain->tree_next[0] != -1 ) {
654 chain->tree_next[0] = (chain->tree_next[0]+root)%size;
655 }
656 } else {
657 /*
658 * Unshift values
659 */
660 chain->tree_prev = -1;
661 chain->tree_next[0] = (root+1)%size;
662 for( i = 1; i < fanout; i++ ) {
663 chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen;
664 if( i > mark ) {
665 chain->tree_next[i]--;
666 }
667 chain->tree_next[i] %= size;
668 }
669 chain->tree_nextsize = fanout;
670 }
671
672 return chain;
673 }
674
ompi_coll_base_topo_dump_tree(ompi_coll_tree_t * tree,int rank)675 int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
676 {
677 int i;
678
679 OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d"
680 " fanout %d BM %1d nextsize %d prev %d",
681 rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
682 tree->tree_nextsize, tree->tree_prev));
683 if( tree->tree_nextsize ) {
684 for( i = 0; i < tree->tree_nextsize; i++ )
685 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i]));
686 }
687 return (0);
688 }
689
690