1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  *                         University Research and Technology
4  *                         Corporation.  All rights reserved.
5  * Copyright (c) 2004-2015 The University of Tennessee and The University
6  *                         of Tennessee Research Foundation.  All rights
7  *                         reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  *                         University of Stuttgart.  All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  *                         All rights reserved.
12  * Copyright (c) 2015      Research Organization for Information Science
13  *                         and Technology (RIST). All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 
21 #include "ompi_config.h"
22 
23 #include "mpi.h"
24 #include "opal/util/bit_ops.h"
25 #include "ompi/constants.h"
26 #include "ompi/communicator/communicator.h"
27 #include "ompi/mca/coll/base/coll_tags.h"
28 #include "ompi/mca/coll/base/coll_base_functions.h"
29 #include "coll_base_topo.h"
30 
31 /*
32  * Some static helpers.
33  */
pown(int fanout,int num)34 static int pown( int fanout, int num )
35 {
36     int j, p = 1;
37     if( num < 0 ) return 0;
38     if (1==num) return fanout;
39     if (2==fanout) {
40         return p<<num;
41     }
42     else {
43         for( j = 0; j < num; j++ ) { p*= fanout; }
44     }
45     return p;
46 }
47 
calculate_level(int fanout,int rank)48 static int calculate_level( int fanout, int rank )
49 {
50     int level, num;
51     if( rank < 0 ) return -1;
52     for( level = 0, num = 0; num <= rank; level++ ) {
53         num += pown(fanout, level);
54     }
55     return level-1;
56 }
57 
calculate_num_nodes_up_to_level(int fanout,int level)58 static int calculate_num_nodes_up_to_level( int fanout, int level )
59 {
60     /* just use geometric progression formula for sum:
61        a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
62     return ((pown(fanout,level) - 1)/(fanout - 1));
63 }
64 
65 /*
66  * And now the building functions.
67  *
68  * An example for fanout = 2, comm_size = 7
69  *
70  *              0           <-- delta = 1 (fanout^0)
71  *            /   \
72  *           1     2        <-- delta = 2 (fanout^1)
73  *          / \   / \
74  *         3   5 4   6      <-- delta = 4 (fanout^2)
75  */
76 
77 ompi_coll_tree_t*
ompi_coll_base_topo_build_tree(int fanout,struct ompi_communicator_t * comm,int root)78 ompi_coll_base_topo_build_tree( int fanout,
79                                  struct ompi_communicator_t* comm,
80                                  int root )
81 {
82     int rank, size, schild, sparent, shiftedrank, i;
83     int level; /* location of my rank in the tree structure of size */
84     int delta; /* number of nodes on my level */
85     int slimit; /* total number of nodes on levels above me */
86     ompi_coll_tree_t* tree;
87 
88     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root));
89 
90     if (fanout<1) {
91         OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout));
92         return NULL;
93     }
94     if (fanout>MAXTREEFANOUT) {
95         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
96         return NULL;
97     }
98 
99     /*
100      * Get size and rank of the process in this communicator
101      */
102     size = ompi_comm_size(comm);
103     rank = ompi_comm_rank(comm);
104 
105     tree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
106     if (!tree) {
107         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory"));
108         return NULL;
109     }
110 
111     tree->tree_root     = MPI_UNDEFINED;
112     tree->tree_nextsize = MPI_UNDEFINED;
113 
114     /*
115      * Set root
116      */
117     tree->tree_root = root;
118 
119     /*
120      * Initialize tree
121      */
122     tree->tree_fanout   = fanout;
123     tree->tree_bmtree   = 0;
124     tree->tree_root     = root;
125     tree->tree_prev     = -1;
126     tree->tree_nextsize = 0;
127     for( i = 0; i < fanout; i++ ) {
128         tree->tree_next[i] = -1;
129     }
130 
131     /* return if we have less than 2 processes */
132     if( size < 2 ) {
133         return tree;
134     }
135 
136     /*
137      * Shift all ranks by root, so that the algorithm can be
138      * designed as if root would be always 0
139      * shiftedrank should be used in calculating distances
140      * and position in tree
141      */
142     shiftedrank = rank - root;
143     if( shiftedrank < 0 ) {
144         shiftedrank += size;
145     }
146 
147     /* calculate my level */
148     level = calculate_level( fanout, shiftedrank );
149     delta = pown( fanout, level );
150 
151     /* find my children */
152     for( i = 0; i < fanout; i++ ) {
153         schild = shiftedrank + delta * (i+1);
154         if( schild < size ) {
155             tree->tree_next[i] = (schild+root)%size;
156             tree->tree_nextsize = tree->tree_nextsize + 1;
157         } else {
158             break;
159         }
160     }
161 
162     /* find my parent */
163     slimit = calculate_num_nodes_up_to_level( fanout, level );
164     sparent = shiftedrank;
165     if( sparent < fanout ) {
166         sparent = 0;
167     } else {
168         while( sparent >= slimit ) {
169             sparent -= delta/fanout;
170         }
171     }
172     tree->tree_prev = (sparent+root)%size;
173 
174     return tree;
175 }
176 
177 /*
178  * Constructs in-order binary tree which can be used for non-commutative reduce
179  * operations.
180  * Root of this tree is always rank (size-1) and fanout is 2.
181  * Here are some of the examples of this tree:
182  * size == 2     size == 3     size == 4                size == 9
183  *      1             2             3                        8
184  *     /             / \          /   \                    /   \
185  *    0             1  0         2     1                  7     3
186  *                                    /                 /  \   / \
187  *                                   0                 6    5 2   1
188  *                                                         /     /
189  *                                                        4     0
190  */
191 ompi_coll_tree_t*
ompi_coll_base_topo_build_in_order_bintree(struct ompi_communicator_t * comm)192 ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
193 {
194     int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
195     ompi_coll_tree_t* tree;
196 
197     /*
198      * Get size and rank of the process in this communicator
199      */
200     size = ompi_comm_size(comm);
201     rank = ompi_comm_rank(comm);
202 
203     tree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
204     if (!tree) {
205         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
206                      "coll:base:topo_build_tree PANIC::out of memory"));
207         return NULL;
208     }
209 
210     tree->tree_root     = MPI_UNDEFINED;
211     tree->tree_nextsize = MPI_UNDEFINED;
212 
213     /*
214      * Initialize tree
215      */
216     tree->tree_fanout   = 2;
217     tree->tree_bmtree   = 0;
218     tree->tree_root     = size - 1;
219     tree->tree_prev     = -1;
220     tree->tree_nextsize = 0;
221     tree->tree_next[0]  = -1;
222     tree->tree_next[1]  = -1;
223     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
224                  "coll:base:topo_build_in_order_tree Building fo %d rt %d",
225                  tree->tree_fanout, tree->tree_root));
226 
227     /*
228      * Build the tree
229      */
230     myrank = rank;
231     parent = size - 1;
232     delta = 0;
233 
234     while ( 1 ) {
235         /* Compute the size of the right subtree */
236         rightsize = size >> 1;
237 
238         /* Determine the left and right child of this parent  */
239         lchild = -1;
240         rchild = -1;
241         if (size - 1 > 0) {
242             lchild = parent - 1;
243             if (lchild > 0) {
244                 rchild = rightsize - 1;
245             }
246         }
247 
248         /* The following cases are possible: myrank can be
249            - a parent,
250            - belong to the left subtree, or
251            - belong to the right subtee
252            Each of the cases need to be handled differently.
253         */
254 
255         if (myrank == parent) {
256             /* I am the parent:
257                - compute real ranks of my children, and exit the loop. */
258             if (lchild >= 0) tree->tree_next[0] = lchild + delta;
259             if (rchild >= 0) tree->tree_next[1] = rchild + delta;
260             break;
261         }
262         if (myrank > rchild) {
263             /* I belong to the left subtree:
264                - If I am the left child, compute real rank of my parent
265                - Iterate down through tree:
266                compute new size, shift ranks down, and update delta.
267             */
268             if (myrank == lchild) {
269                 tree->tree_prev = parent + delta;
270             }
271             size = size - rightsize - 1;
272             delta = delta + rightsize;
273             myrank = myrank - rightsize;
274             parent = size - 1;
275 
276         } else {
277             /* I belong to the right subtree:
278                - If I am the right child, compute real rank of my parent
279                - Iterate down through tree:
280                compute new size and parent,
281                but the delta and rank do not need to change.
282             */
283             if (myrank == rchild) {
284                 tree->tree_prev = parent + delta;
285             }
286             size = rightsize;
287             parent = rchild;
288         }
289     }
290 
291     if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
292     if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
293 
294     return tree;
295 }
296 
ompi_coll_base_topo_destroy_tree(ompi_coll_tree_t ** tree)297 int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree )
298 {
299     ompi_coll_tree_t *ptr;
300 
301     if ((!tree)||(!*tree)) {
302         return OMPI_SUCCESS;
303     }
304 
305     ptr = *tree;
306 
307     free (ptr);
308     *tree = NULL;   /* mark tree as gone */
309 
310     return OMPI_SUCCESS;
311 }
312 
313 /*
314  *
315  * Here are some of the examples of this tree:
316  * size == 2                   size = 4                 size = 8
317  *      0                           0                        0
318  *     /                            | \                    / | \
319  *    1                             2  1                  4  2  1
320  *                                     |                     |  |\
321  *                                     3                     6  5 3
322  *                                                                |
323  *                                                                7
324  */
325 ompi_coll_tree_t*
ompi_coll_base_topo_build_bmtree(struct ompi_communicator_t * comm,int root)326 ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
327                                    int root )
328 {
329     int childs = 0, rank, size, mask = 1, index, remote, i;
330     ompi_coll_tree_t *bmtree;
331 
332     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root));
333 
334     /*
335      * Get size and rank of the process in this communicator
336      */
337     size = ompi_comm_size(comm);
338     rank = ompi_comm_rank(comm);
339 
340     index = rank -root;
341 
342     bmtree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
343     if (!bmtree) {
344         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
345         return NULL;
346     }
347 
348     bmtree->tree_bmtree   = 1;
349 
350     bmtree->tree_root     = MPI_UNDEFINED;
351     bmtree->tree_nextsize = MPI_UNDEFINED;
352     for( i = 0;i < MAXTREEFANOUT; i++ ) {
353         bmtree->tree_next[i] = -1;
354     }
355 
356     if( index < 0 ) index += size;
357 
358     mask = opal_next_poweroftwo(index);
359 
360     /* Now I can compute my father rank */
361     if( root == rank ) {
362         bmtree->tree_prev = root;
363     } else {
364         remote = (index ^ (mask >> 1)) + root;
365         if( remote >= size ) remote -= size;
366         bmtree->tree_prev = remote;
367     }
368     /* And now let's fill my childs */
369     while( mask < size ) {
370         remote = (index ^ mask);
371         if( remote >= size ) break;
372         remote += root;
373         if( remote >= size ) remote -= size;
374         if (childs==MAXTREEFANOUT) {
375             OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
376             free(bmtree);
377             return NULL;
378         }
379         bmtree->tree_next[childs] = remote;
380         mask <<= 1;
381         childs++;
382     }
383     bmtree->tree_nextsize = childs;
384     bmtree->tree_root     = root;
385     return bmtree;
386 }
387 
388 /*
389  * Constructs in-order binomial tree which can be used for gather/scatter
390  * operations.
391  *
392  * Here are some of the examples of this tree:
393  * size == 2                   size = 4                 size = 8
394  *      0                           0                        0
395  *     /                          / |                      / | \
396  *    1                          1  2                     1  2  4
397  *                                  |                        |  | \
398  *                                  3                        3  5  6
399  *                                                                 |
400  *                                                                 7
401  */
402 ompi_coll_tree_t*
ompi_coll_base_topo_build_in_order_bmtree(struct ompi_communicator_t * comm,int root)403 ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
404                                             int root )
405 {
406     int childs = 0, rank, vrank, size, mask = 1, remote, i;
407     ompi_coll_tree_t *bmtree;
408 
409     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root));
410 
411     /*
412      * Get size and rank of the process in this communicator
413      */
414     size = ompi_comm_size(comm);
415     rank = ompi_comm_rank(comm);
416 
417     vrank = (rank - root + size) % size;
418 
419     bmtree = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
420     if (!bmtree) {
421         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
422         return NULL;
423     }
424 
425     bmtree->tree_bmtree   = 1;
426     bmtree->tree_root     = MPI_UNDEFINED;
427     bmtree->tree_nextsize = MPI_UNDEFINED;
428     for(i=0;i<MAXTREEFANOUT;i++) {
429         bmtree->tree_next[i] = -1;
430     }
431 
432     if (root == rank) {
433         bmtree->tree_prev = root;
434     }
435 
436     while (mask < size) {
437         remote = vrank ^ mask;
438         if (remote < vrank) {
439             bmtree->tree_prev = (remote + root) % size;
440             break;
441         } else if (remote < size) {
442             bmtree->tree_next[childs] = (remote + root) % size;
443             childs++;
444             if (childs==MAXTREEFANOUT) {
445                 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
446                              "coll:base:topo:build_bmtree max fanout incorrect %d needed %d",
447                              MAXTREEFANOUT, childs));
448                 free(bmtree);
449                 return NULL;
450             }
451         }
452         mask <<= 1;
453     }
454     bmtree->tree_nextsize = childs;
455     bmtree->tree_root     = root;
456 
457     return bmtree;
458 }
459 
460 /*
461  * ompi_coll_base_topo_build_kmtree: Build k-nomial tree for Bcast
462  *
463  * Example, comm_size=10
464  *    radix=2         radix=3             radix=4
465  *       0               0                   0
466  *    / / \ \       / /  |  \ \         /   / \ \ \
467  *   8 4   2 1     9 3   6   1 2       4   8  1 2 3
468  *   | |\  |         |\  |\           /|\  |
469  *   9 6 5 3         4 5 7 8         5 6 7 9
470  *     |
471  *     7
472  */
473 ompi_coll_tree_t*
ompi_coll_base_topo_build_kmtree(struct ompi_communicator_t * comm,int root,int radix)474 ompi_coll_base_topo_build_kmtree(struct ompi_communicator_t* comm,
475                                  int root, int radix)
476 {
477     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
478                  "coll:base:topo:build_kmtree root %d, radix %d", root, radix));
479     int comm_size = ompi_comm_size(comm);
480     int rank = ompi_comm_rank(comm);
481 
482     /* nchilds <= (radix - 1) * \ceil(\log_{radix}(comm_size)) */
483     int log_radix = 0;
484     for (int i = 1; i < comm_size; i *= radix)
485         log_radix++;
486     int nchilds_max = (radix - 1) * log_radix;
487 
488     int vrank = (rank - root + comm_size) % comm_size;
489     ompi_coll_tree_t *kmtree = malloc(COLL_TREE_SIZE(nchilds_max));
490     if (NULL == kmtree) {
491         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
492                      "coll:base:topo:build_kmtree PANIC out of memory"));
493         return NULL;
494     }
495 
496     kmtree->tree_bmtree = 0;
497     kmtree->tree_root = root;
498     kmtree->tree_prev = MPI_PROC_NULL;
499     kmtree->tree_nextsize = 0;
500 
501     /* Setup parent */
502     int mask = 0x1;
503     while (mask < comm_size) {
504         if (vrank % (radix * mask)) {
505             kmtree->tree_prev = vrank / (radix * mask) * (radix * mask);
506             kmtree->tree_prev = (kmtree->tree_prev + root) % comm_size;
507             break;
508         }
509         mask *= radix;
510     }
511 
512     /* Setup childs */
513     mask /= radix;
514     int nchilds = 0;
515     while (mask > 0) {
516         for (int r = 1; r < radix; r++) {
517             int child = vrank + mask * r;
518             if (child < comm_size) {
519                 child = (child + root) % comm_size;
520                 kmtree->tree_next[nchilds] = child;
521                 nchilds++;
522             }
523         }
524         mask /= radix;
525     }
526     kmtree->tree_nextsize = nchilds;
527     return kmtree;
528 }
529 
530 ompi_coll_tree_t*
ompi_coll_base_topo_build_chain(int fanout,struct ompi_communicator_t * comm,int root)531 ompi_coll_base_topo_build_chain( int fanout,
532                                   struct ompi_communicator_t* comm,
533                                   int root )
534 {
535     int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
536     ompi_coll_tree_t *chain;
537 
538     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root));
539 
540     /*
541      * Get size and rank of the process in this communicator
542      */
543     size = ompi_comm_size(comm);
544     rank = ompi_comm_rank(comm);
545 
546     if( fanout < 1 ) {
547         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
548         fanout = 1;
549     }
550     if (fanout>MAXTREEFANOUT) {
551         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
552         fanout = MAXTREEFANOUT;
553     }
554 
555     /*
556      * Allocate space for topology arrays if needed
557      */
558     chain = (ompi_coll_tree_t*)malloc(COLL_TREE_SIZE(MAXTREEFANOUT));
559     if (!chain) {
560         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory"));
561         fflush(stdout);
562         return NULL;
563     }
564     chain->tree_root     = MPI_UNDEFINED;
565     chain->tree_nextsize = -1;
566     for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
567 
568     /*
569      * Set root & numchain
570      */
571     chain->tree_root = root;
572     if( (size - 1) < fanout ) {
573         chain->tree_nextsize = size-1;
574         fanout = size-1;
575     } else {
576         chain->tree_nextsize = fanout;
577     }
578 
579     /*
580      * Shift ranks
581      */
582     srank = rank - root;
583     if (srank < 0) srank += size;
584 
585     /*
586      * Special case - fanout == 1
587      */
588     if( fanout == 1 ) {
589         if( srank == 0 ) chain->tree_prev = -1;
590         else chain->tree_prev = (srank-1+root)%size;
591 
592         if( (srank + 1) >= size) {
593             chain->tree_next[0] = -1;
594             chain->tree_nextsize = 0;
595         } else {
596             chain->tree_next[0] = (srank+1+root)%size;
597             chain->tree_nextsize = 1;
598         }
599         return chain;
600     }
601 
602     /* Let's handle the case where there is just one node in the communicator */
603     if( size == 1 ) {
604         chain->tree_next[0] = -1;
605         chain->tree_nextsize = 0;
606         chain->tree_prev = -1;
607         return chain;
608     }
609     /*
610      * Calculate maximum chain length
611      */
612     maxchainlen = (size-1) / fanout;
613     if( (size-1) % fanout != 0 ) {
614         maxchainlen++;
615         mark = (size-1)%fanout;
616     } else {
617         mark = fanout+1;
618     }
619 
620     /*
621      * Find your own place in the list of shifted ranks
622      */
623     if( srank != 0 ) {
624         int column;
625         if( srank-1 < (mark * maxchainlen) ) {
626             column = (srank-1)/maxchainlen;
627             head = 1+column*maxchainlen;
628             len = maxchainlen;
629         } else {
630             column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
631             head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
632             len = maxchainlen-1;
633         }
634 
635         if( srank == head ) {
636             chain->tree_prev = 0; /*root*/
637         } else {
638             chain->tree_prev = srank-1; /* rank -1 */
639         }
640         if( srank == (head + len - 1) ) {
641             chain->tree_next[0] = -1;
642             chain->tree_nextsize = 0;
643         } else {
644             if( (srank + 1) < size ) {
645                 chain->tree_next[0] = srank+1;
646                 chain->tree_nextsize = 1;
647             } else {
648                 chain->tree_next[0] = -1;
649                 chain->tree_nextsize = 0;
650             }
651         }
652         chain->tree_prev = (chain->tree_prev+root)%size;
653         if( chain->tree_next[0] != -1 ) {
654             chain->tree_next[0] = (chain->tree_next[0]+root)%size;
655         }
656     } else {
657         /*
658          * Unshift values
659          */
660         chain->tree_prev = -1;
661         chain->tree_next[0] = (root+1)%size;
662         for( i = 1; i < fanout; i++ ) {
663             chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen;
664             if( i > mark ) {
665                 chain->tree_next[i]--;
666             }
667             chain->tree_next[i] %= size;
668         }
669         chain->tree_nextsize = fanout;
670     }
671 
672     return chain;
673 }
674 
ompi_coll_base_topo_dump_tree(ompi_coll_tree_t * tree,int rank)675 int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
676 {
677     int i;
678 
679     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d"
680                  " fanout %d BM %1d nextsize %d prev %d",
681                  rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
682                  tree->tree_nextsize, tree->tree_prev));
683     if( tree->tree_nextsize ) {
684         for( i = 0; i < tree->tree_nextsize; i++ )
685             OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i]));
686     }
687     return (0);
688 }
689 
690