1 /*
2  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
3  *                         University Research and Technology
4  *                         Corporation.  All rights reserved.
5  * Copyright (c) 2004-2015 The University of Tennessee and The University
6  *                         of Tennessee Research Foundation.  All rights
7  *                         reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  *                         University of Stuttgart.  All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  *                         All rights reserved.
12  * Copyright (c) 2015      Research Organization for Information Science
13  *                         and Technology (RIST). All rights reserved.
14  * $COPYRIGHT$
15  *
16  * Additional copyrights may follow
17  *
18  * $HEADER$
19  */
20 
21 #include "ompi_config.h"
22 
23 #include "mpi.h"
24 #include "opal/util/bit_ops.h"
25 #include "ompi/constants.h"
26 #include "ompi/communicator/communicator.h"
27 #include "ompi/mca/coll/base/coll_tags.h"
28 #include "ompi/mca/coll/base/coll_base_functions.h"
29 #include "coll_base_topo.h"
30 
31 /*
32  * Some static helpers.
33  */
pown(int fanout,int num)34 static int pown( int fanout, int num )
35 {
36     int j, p = 1;
37     if( num < 0 ) return 0;
38     if (1==num) return fanout;
39     if (2==fanout) {
40         return p<<num;
41     }
42     else {
43         for( j = 0; j < num; j++ ) { p*= fanout; }
44     }
45     return p;
46 }
47 
calculate_level(int fanout,int rank)48 static int calculate_level( int fanout, int rank )
49 {
50     int level, num;
51     if( rank < 0 ) return -1;
52     for( level = 0, num = 0; num <= rank; level++ ) {
53         num += pown(fanout, level);
54     }
55     return level-1;
56 }
57 
calculate_num_nodes_up_to_level(int fanout,int level)58 static int calculate_num_nodes_up_to_level( int fanout, int level )
59 {
60     /* just use geometric progression formula for sum:
61        a^0+a^1+...a^(n-1) = (a^n-1)/(a-1) */
62     return ((pown(fanout,level) - 1)/(fanout - 1));
63 }
64 
65 /*
66  * And now the building functions.
67  *
68  * An example for fanout = 2, comm_size = 7
69  *
70  *              0           <-- delta = 1 (fanout^0)
71  *            /   \
72  *           1     2        <-- delta = 2 (fanout^1)
73  *          / \   / \
74  *         3   5 4   6      <-- delta = 4 (fanout^2)
75  */
76 
77 ompi_coll_tree_t*
ompi_coll_base_topo_build_tree(int fanout,struct ompi_communicator_t * comm,int root)78 ompi_coll_base_topo_build_tree( int fanout,
79                                  struct ompi_communicator_t* comm,
80                                  int root )
81 {
82     int rank, size, schild, sparent, shiftedrank, i;
83     int level; /* location of my rank in the tree structure of size */
84     int delta; /* number of nodes on my level */
85     int slimit; /* total number of nodes on levels above me */
86     ompi_coll_tree_t* tree;
87 
88     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree Building fo %d rt %d", fanout, root));
89 
90     if (fanout<1) {
91         OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo_build_tree invalid fanout %d", fanout));
92         return NULL;
93     }
94     if (fanout>MAXTREEFANOUT) {
95         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree invalid fanout %d bigger than max %d", fanout, MAXTREEFANOUT));
96         return NULL;
97     }
98 
99     /*
100      * Get size and rank of the process in this communicator
101      */
102     size = ompi_comm_size(comm);
103     rank = ompi_comm_rank(comm);
104 
105     tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
106     if (!tree) {
107         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo_build_tree PANIC::out of memory"));
108         return NULL;
109     }
110 
111     tree->tree_root     = MPI_UNDEFINED;
112     tree->tree_nextsize = MPI_UNDEFINED;
113 
114     /*
115      * Set root
116      */
117     tree->tree_root = root;
118 
119     /*
120      * Initialize tree
121      */
122     tree->tree_fanout   = fanout;
123     tree->tree_bmtree   = 0;
124     tree->tree_root     = root;
125     tree->tree_prev     = -1;
126     tree->tree_nextsize = 0;
127     for( i = 0; i < fanout; i++ ) {
128         tree->tree_next[i] = -1;
129     }
130 
131     /* return if we have less than 2 processes */
132     if( size < 2 ) {
133         return tree;
134     }
135 
136     /*
137      * Shift all ranks by root, so that the algorithm can be
138      * designed as if root would be always 0
139      * shiftedrank should be used in calculating distances
140      * and position in tree
141      */
142     shiftedrank = rank - root;
143     if( shiftedrank < 0 ) {
144         shiftedrank += size;
145     }
146 
147     /* calculate my level */
148     level = calculate_level( fanout, shiftedrank );
149     delta = pown( fanout, level );
150 
151     /* find my children */
152     for( i = 0; i < fanout; i++ ) {
153         schild = shiftedrank + delta * (i+1);
154         if( schild < size ) {
155             tree->tree_next[i] = (schild+root)%size;
156             tree->tree_nextsize = tree->tree_nextsize + 1;
157         } else {
158             break;
159         }
160     }
161 
162     /* find my parent */
163     slimit = calculate_num_nodes_up_to_level( fanout, level );
164     sparent = shiftedrank;
165     if( sparent < fanout ) {
166         sparent = 0;
167     } else {
168         while( sparent >= slimit ) {
169             sparent -= delta/fanout;
170         }
171     }
172     tree->tree_prev = (sparent+root)%size;
173 
174     return tree;
175 }
176 
177 /*
178  * Constructs in-order binary tree which can be used for non-commutative reduce
179  * operations.
180  * Root of this tree is always rank (size-1) and fanout is 2.
181  * Here are some of the examples of this tree:
182  * size == 2     size == 3     size == 4                size == 9
183  *      1             2             3                        8
184  *     /             / \          /   \                    /   \
185  *    0             1  0         2     1                  7     3
186  *                                    /                 /  \   / \
187  *                                   0                 6    5 2   1
188  *                                                         /     /
189  *                                                        4     0
190  */
191 ompi_coll_tree_t*
ompi_coll_base_topo_build_in_order_bintree(struct ompi_communicator_t * comm)192 ompi_coll_base_topo_build_in_order_bintree( struct ompi_communicator_t* comm )
193 {
194     int rank, size, myrank, rightsize, delta, parent, lchild, rchild;
195     ompi_coll_tree_t* tree;
196 
197     /*
198      * Get size and rank of the process in this communicator
199      */
200     size = ompi_comm_size(comm);
201     rank = ompi_comm_rank(comm);
202 
203     tree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
204     if (!tree) {
205         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
206                      "coll:base:topo_build_tree PANIC::out of memory"));
207         return NULL;
208     }
209 
210     tree->tree_root     = MPI_UNDEFINED;
211     tree->tree_nextsize = MPI_UNDEFINED;
212 
213     /*
214      * Initialize tree
215      */
216     tree->tree_fanout   = 2;
217     tree->tree_bmtree   = 0;
218     tree->tree_root     = size - 1;
219     tree->tree_prev     = -1;
220     tree->tree_nextsize = 0;
221     tree->tree_next[0]  = -1;
222     tree->tree_next[1]  = -1;
223     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
224                  "coll:base:topo_build_in_order_tree Building fo %d rt %d",
225                  tree->tree_fanout, tree->tree_root));
226 
227     /*
228      * Build the tree
229      */
230     myrank = rank;
231     parent = size - 1;
232     delta = 0;
233 
234     while ( 1 ) {
235         /* Compute the size of the right subtree */
236         rightsize = size >> 1;
237 
238         /* Determine the left and right child of this parent  */
239         lchild = -1;
240         rchild = -1;
241         if (size - 1 > 0) {
242             lchild = parent - 1;
243             if (lchild > 0) {
244                 rchild = rightsize - 1;
245             }
246         }
247 
248         /* The following cases are possible: myrank can be
249            - a parent,
250            - belong to the left subtree, or
251            - belong to the right subtee
252            Each of the cases need to be handled differently.
253         */
254 
255         if (myrank == parent) {
256             /* I am the parent:
257                - compute real ranks of my children, and exit the loop. */
258             if (lchild >= 0) tree->tree_next[0] = lchild + delta;
259             if (rchild >= 0) tree->tree_next[1] = rchild + delta;
260             break;
261         }
262         if (myrank > rchild) {
263             /* I belong to the left subtree:
264                - If I am the left child, compute real rank of my parent
265                - Iterate down through tree:
266                compute new size, shift ranks down, and update delta.
267             */
268             if (myrank == lchild) {
269                 tree->tree_prev = parent + delta;
270             }
271             size = size - rightsize - 1;
272             delta = delta + rightsize;
273             myrank = myrank - rightsize;
274             parent = size - 1;
275 
276         } else {
277             /* I belong to the right subtree:
278                - If I am the right child, compute real rank of my parent
279                - Iterate down through tree:
280                compute new size and parent,
281                but the delta and rank do not need to change.
282             */
283             if (myrank == rchild) {
284                 tree->tree_prev = parent + delta;
285             }
286             size = rightsize;
287             parent = rchild;
288         }
289     }
290 
291     if (tree->tree_next[0] >= 0) { tree->tree_nextsize = 1; }
292     if (tree->tree_next[1] >= 0) { tree->tree_nextsize += 1; }
293 
294     return tree;
295 }
296 
ompi_coll_base_topo_destroy_tree(ompi_coll_tree_t ** tree)297 int ompi_coll_base_topo_destroy_tree( ompi_coll_tree_t** tree )
298 {
299     ompi_coll_tree_t *ptr;
300 
301     if ((!tree)||(!*tree)) {
302         return OMPI_SUCCESS;
303     }
304 
305     ptr = *tree;
306 
307     free (ptr);
308     *tree = NULL;   /* mark tree as gone */
309 
310     return OMPI_SUCCESS;
311 }
312 
313 /*
314  *
315  * Here are some of the examples of this tree:
316  * size == 2                   size = 4                 size = 8
317  *      0                           0                        0
318  *     /                            | \                    / | \
319  *    1                             2  1                  4  2  1
320  *                                     |                     |  |\
321  *                                     3                     6  5 3
322  *                                                                |
323  *                                                                7
324  */
325 ompi_coll_tree_t*
ompi_coll_base_topo_build_bmtree(struct ompi_communicator_t * comm,int root)326 ompi_coll_base_topo_build_bmtree( struct ompi_communicator_t* comm,
327                                    int root )
328 {
329     int childs = 0, rank, size, mask = 1, index, remote, i;
330     ompi_coll_tree_t *bmtree;
331 
332     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree rt %d", root));
333 
334     /*
335      * Get size and rank of the process in this communicator
336      */
337     size = ompi_comm_size(comm);
338     rank = ompi_comm_rank(comm);
339 
340     index = rank -root;
341 
342     bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
343     if (!bmtree) {
344         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
345         return NULL;
346     }
347 
348     bmtree->tree_bmtree   = 1;
349 
350     bmtree->tree_root     = MPI_UNDEFINED;
351     bmtree->tree_nextsize = MPI_UNDEFINED;
352     for( i = 0;i < MAXTREEFANOUT; i++ ) {
353         bmtree->tree_next[i] = -1;
354     }
355 
356     if( index < 0 ) index += size;
357 
358     mask = opal_next_poweroftwo(index);
359 
360     /* Now I can compute my father rank */
361     if( root == rank ) {
362         bmtree->tree_prev = root;
363     } else {
364         remote = (index ^ (mask >> 1)) + root;
365         if( remote >= size ) remote -= size;
366         bmtree->tree_prev = remote;
367     }
368     /* And now let's fill my childs */
369     while( mask < size ) {
370         remote = (index ^ mask);
371         if( remote >= size ) break;
372         remote += root;
373         if( remote >= size ) remote -= size;
374         if (childs==MAXTREEFANOUT) {
375             OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree max fanout incorrect %d needed %d", MAXTREEFANOUT, childs));
376             free(bmtree);
377             return NULL;
378         }
379         bmtree->tree_next[childs] = remote;
380         mask <<= 1;
381         childs++;
382     }
383     bmtree->tree_nextsize = childs;
384     bmtree->tree_root     = root;
385     return bmtree;
386 }
387 
388 /*
389  * Constructs in-order binomial tree which can be used for gather/scatter
390  * operations.
391  *
392  * Here are some of the examples of this tree:
393  * size == 2                   size = 4                 size = 8
394  *      0                           0                        0
395  *     /                          / |                      / | \
396  *    1                          1  2                     1  2  4
397  *                                  |                        |  | \
398  *                                  3                        3  5  6
399  *                                                                 |
400  *                                                                 7
401  */
402 ompi_coll_tree_t*
ompi_coll_base_topo_build_in_order_bmtree(struct ompi_communicator_t * comm,int root)403 ompi_coll_base_topo_build_in_order_bmtree( struct ompi_communicator_t* comm,
404                                             int root )
405 {
406     int childs = 0, rank, vrank, size, mask = 1, remote, i;
407     ompi_coll_tree_t *bmtree;
408 
409     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_in_order_bmtree rt %d", root));
410 
411     /*
412      * Get size and rank of the process in this communicator
413      */
414     size = ompi_comm_size(comm);
415     rank = ompi_comm_rank(comm);
416 
417     vrank = (rank - root + size) % size;
418 
419     bmtree = (ompi_coll_tree_t*)malloc(sizeof(ompi_coll_tree_t));
420     if (!bmtree) {
421         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_bmtree PANIC out of memory"));
422         return NULL;
423     }
424 
425     bmtree->tree_bmtree   = 1;
426     bmtree->tree_root     = MPI_UNDEFINED;
427     bmtree->tree_nextsize = MPI_UNDEFINED;
428     for(i=0;i<MAXTREEFANOUT;i++) {
429         bmtree->tree_next[i] = -1;
430     }
431 
432     if (root == rank) {
433         bmtree->tree_prev = root;
434     }
435 
436     while (mask < size) {
437         remote = vrank ^ mask;
438         if (remote < vrank) {
439             bmtree->tree_prev = (remote + root) % size;
440             break;
441         } else if (remote < size) {
442             bmtree->tree_next[childs] = (remote + root) % size;
443             childs++;
444             if (childs==MAXTREEFANOUT) {
445                 OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
446                              "coll:base:topo:build_bmtree max fanout incorrect %d needed %d",
447                              MAXTREEFANOUT, childs));
448                 free(bmtree);
449                 return NULL;
450             }
451         }
452         mask <<= 1;
453     }
454     bmtree->tree_nextsize = childs;
455     bmtree->tree_root     = root;
456 
457     return bmtree;
458 }
459 
460 
461 ompi_coll_tree_t*
ompi_coll_base_topo_build_chain(int fanout,struct ompi_communicator_t * comm,int root)462 ompi_coll_base_topo_build_chain( int fanout,
463                                   struct ompi_communicator_t* comm,
464                                   int root )
465 {
466     int i, maxchainlen, mark, head, len, rank, size, srank /* shifted rank */;
467     ompi_coll_tree_t *chain;
468 
469     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain fo %d rt %d", fanout, root));
470 
471     /*
472      * Get size and rank of the process in this communicator
473      */
474     size = ompi_comm_size(comm);
475     rank = ompi_comm_rank(comm);
476 
477     if( fanout < 1 ) {
478         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout of ZERO, forcing to 1 (pipeline)!"));
479         fanout = 1;
480     }
481     if (fanout>MAXTREEFANOUT) {
482         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain WARNING invalid fanout %d bigger than max %d, forcing to max!", fanout, MAXTREEFANOUT));
483         fanout = MAXTREEFANOUT;
484     }
485 
486     /*
487      * Allocate space for topology arrays if needed
488      */
489     chain = (ompi_coll_tree_t*)malloc( sizeof(ompi_coll_tree_t) );
490     if (!chain) {
491         OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"coll:base:topo:build_chain PANIC out of memory"));
492         fflush(stdout);
493         return NULL;
494     }
495     chain->tree_root     = MPI_UNDEFINED;
496     chain->tree_nextsize = -1;
497     for(i=0;i<fanout;i++) chain->tree_next[i] = -1;
498 
499     /*
500      * Set root & numchain
501      */
502     chain->tree_root = root;
503     if( (size - 1) < fanout ) {
504         chain->tree_nextsize = size-1;
505         fanout = size-1;
506     } else {
507         chain->tree_nextsize = fanout;
508     }
509 
510     /*
511      * Shift ranks
512      */
513     srank = rank - root;
514     if (srank < 0) srank += size;
515 
516     /*
517      * Special case - fanout == 1
518      */
519     if( fanout == 1 ) {
520         if( srank == 0 ) chain->tree_prev = -1;
521         else chain->tree_prev = (srank-1+root)%size;
522 
523         if( (srank + 1) >= size) {
524             chain->tree_next[0] = -1;
525             chain->tree_nextsize = 0;
526         } else {
527             chain->tree_next[0] = (srank+1+root)%size;
528             chain->tree_nextsize = 1;
529         }
530         return chain;
531     }
532 
533     /* Let's handle the case where there is just one node in the communicator */
534     if( size == 1 ) {
535         chain->tree_next[0] = -1;
536         chain->tree_nextsize = 0;
537         chain->tree_prev = -1;
538         return chain;
539     }
540     /*
541      * Calculate maximum chain length
542      */
543     maxchainlen = (size-1) / fanout;
544     if( (size-1) % fanout != 0 ) {
545         maxchainlen++;
546         mark = (size-1)%fanout;
547     } else {
548         mark = fanout+1;
549     }
550 
551     /*
552      * Find your own place in the list of shifted ranks
553      */
554     if( srank != 0 ) {
555         int column;
556         if( srank-1 < (mark * maxchainlen) ) {
557             column = (srank-1)/maxchainlen;
558             head = 1+column*maxchainlen;
559             len = maxchainlen;
560         } else {
561             column = mark + (srank-1-mark*maxchainlen)/(maxchainlen-1);
562             head = mark*maxchainlen+1+(column-mark)*(maxchainlen-1);
563             len = maxchainlen-1;
564         }
565 
566         if( srank == head ) {
567             chain->tree_prev = 0; /*root*/
568         } else {
569             chain->tree_prev = srank-1; /* rank -1 */
570         }
571         if( srank == (head + len - 1) ) {
572             chain->tree_next[0] = -1;
573             chain->tree_nextsize = 0;
574         } else {
575             if( (srank + 1) < size ) {
576                 chain->tree_next[0] = srank+1;
577                 chain->tree_nextsize = 1;
578             } else {
579                 chain->tree_next[0] = -1;
580                 chain->tree_nextsize = 0;
581             }
582         }
583         chain->tree_prev = (chain->tree_prev+root)%size;
584         if( chain->tree_next[0] != -1 ) {
585             chain->tree_next[0] = (chain->tree_next[0]+root)%size;
586         }
587     } else {
588         /*
589          * Unshift values
590          */
591         chain->tree_prev = -1;
592         chain->tree_next[0] = (root+1)%size;
593         for( i = 1; i < fanout; i++ ) {
594             chain->tree_next[i] = chain->tree_next[i-1] + maxchainlen;
595             if( i > mark ) {
596                 chain->tree_next[i]--;
597             }
598             chain->tree_next[i] %= size;
599         }
600         chain->tree_nextsize = fanout;
601     }
602 
603     return chain;
604 }
605 
ompi_coll_base_topo_dump_tree(ompi_coll_tree_t * tree,int rank)606 int ompi_coll_base_topo_dump_tree (ompi_coll_tree_t* tree, int rank)
607 {
608     int i;
609 
610     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:topo:topo_dump_tree %1d tree root %d"
611                  " fanout %d BM %1d nextsize %d prev %d",
612                  rank, tree->tree_root, tree->tree_bmtree, tree->tree_fanout,
613                  tree->tree_nextsize, tree->tree_prev));
614     if( tree->tree_nextsize ) {
615         for( i = 0; i < tree->tree_nextsize; i++ )
616             OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"[%1d] %d", i, tree->tree_next[i]));
617     }
618     return (0);
619 }
620 
621