1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2011-2017 The University of Tennessee and The University
4  *                         of Tennessee Research Foundation.  All rights
5  *                         reserved.
6  * Copyright (c) 2011-2016 INRIA.  All rights reserved.
7  * Copyright (c) 2012-2017 Bordeaux Polytechnic Institute
8  * Copyright (c) 2015-2016 Intel, Inc.  All rights reserved.
9  * Copyright (c) 2015-2017 Research Organization for Information Science
10  *                         and Technology (RIST). All rights reserved.
11  * Copyright (c) 2016      Los Alamos National Security, LLC. All rights
12  *                         reserved.
13  * Copyright (c) 2017      Cisco Systems, Inc.  All rights reserved
14  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
15  * $COPYRIGHT$
16  *
17  * Additional copyrights may follow
18  *
19  * $HEADER$
20  */
21 
22 #include "ompi_config.h"
23 
24 #include "opal/constants.h"
25 #include "opal/mca/hwloc/hwloc-internal.h"
26 
27 #include "ompi/mca/topo/treematch/topo_treematch.h"
28 #include "ompi/mca/topo/treematch/treematch/treematch.h"
29 #include "ompi/mca/topo/treematch/treematch/tm_mapping.h"
30 #include "ompi/mca/topo/base/base.h"
31 
32 #include "ompi/communicator/communicator.h"
33 #include "ompi/info/info.h"
34 
35 #include "ompi/mca/pml/pml.h"
36 
37 #include "opal/mca/pmix/pmix.h"
38 
39 /* #define __DEBUG__ 1 */
40 
41 /**
 * This function is an allreduce between all processes to detect oversubscription.
43  * On each node, the local_procs will be a different array, that contains only the
44  * local processes. Thus, that process will compute the node oversubscription and will
45  * bring this value to the operation, while every other process on the node will
46  * contribute 0.
 * Doing an AllReduce might be overkill for this situation, but it should remain
 * more scalable than a star reduction (between the roots of each node (nodes_roots),
 * followed by a bcast to all processes).
50  */
/*
 * Allreduce over comm_old to detect node-level oversubscription.
 * On each node only the local leader (local_procs[0]) contributes 1 when the
 * node hosts more processes than objects; every other process contributes 0.
 * Returns the summed count of oversubscribed nodes on success, or the
 * (negative) OMPI error code if the allreduce fails.
 * NOTE(review): num_nodes and nodes_roots are currently unused here; they are
 * kept for interface stability with the callers in this file.
 */
static int check_oversubscribing(int rank,
                                 int num_nodes,
                                 int num_objs_in_node,
                                 int num_procs_in_node,
                                 int *nodes_roots,
                                 int *local_procs,
                                 ompi_communicator_t *comm_old)
{
    int total_oversub = 0, my_contrib = 0, rc;

    /* Only the node-local leader evaluates the oversubscription condition. */
    if ((rank == local_procs[0]) && (num_objs_in_node < num_procs_in_node)) {
        my_contrib = 1;
    }

    rc = comm_old->c_coll->coll_allreduce(&my_contrib, &total_oversub, 1, MPI_INT,
                                          MPI_SUM, comm_old,
                                          comm_old->c_coll->coll_allreduce_module);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    return total_oversub;
}
73 
74 #ifdef __DEBUG__
/*
 * Debug helper: emit an int array through opal_output_verbose.
 * prolog is printed once as a header, line_prolog before each element.
 * A -1 output_id means the output stream is disabled; nothing is printed.
 * Fix: the index was printed with %lu while being a size_t, which is
 * undefined behavior where size_t != unsigned long (e.g. 64-bit Windows);
 * cast explicitly to unsigned long to match the format specifier.
 */
static void dump_int_array( int level, int output_id, const char* prolog, const char* line_prolog, const int* array, size_t length )
{
    size_t i;
    if( -1 == output_id ) return;

    opal_output_verbose(level, output_id, "%s : ", prolog);
    for(i = 0; i < length ; i++)
        opal_output_verbose(level, output_id, "%s [%lu:%i] ", line_prolog, (unsigned long)i, array[i]);
    opal_output_verbose(level, output_id, "\n");
}
/*
 * Debug helper: emit a double array through opal_output_verbose.
 * Mirrors dump_int_array but formats elements with %lf.
 * A -1 output_id means the output stream is disabled; nothing is printed.
 * Fix: cast the size_t index to unsigned long so it matches the %lu
 * specifier on all platforms (UB otherwise where size_t != unsigned long).
 */
static void dump_double_array( int level, int output_id, const char* prolog, const char* line_prolog, const double* array, size_t length )
{
    size_t i;

    if( -1 == output_id ) return;
    opal_output_verbose(level, output_id, "%s : ", prolog);
    for(i = 0; i < length ; i++)
        opal_output_verbose(level, output_id, "%s [%lu:%lf] ", line_prolog, (unsigned long)i, array[i]);
    opal_output_verbose(level, output_id, "\n");
}
95 #endif
96 
mca_topo_treematch_dist_graph_create(mca_topo_base_module_t * topo_module,ompi_communicator_t * comm_old,int n,const int nodes[],const int degrees[],const int targets[],const int weights[],struct opal_info_t * info,int reorder,ompi_communicator_t ** newcomm)97 int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
98                                          ompi_communicator_t *comm_old,
99                                          int n, const int nodes[],
100                                          const int degrees[], const int targets[],
101                                          const int weights[],
102                                          struct opal_info_t *info, int reorder,
103                                          ompi_communicator_t **newcomm)
104 {
105     int err;
106 
107     if (OMPI_SUCCESS != (err = mca_topo_base_dist_graph_distribute(topo_module, comm_old, n, nodes,
108                                                                    degrees, targets, weights,
109                                                                    &(topo_module->mtc.dist_graph))))
110         return err;
111 
112     if(!reorder) {  /* No reorder. Create a new communicator, then   */
113                     /* jump out to attach the dist_graph and return */
114     fallback:
115 
116         if( OMPI_SUCCESS == (err = ompi_comm_create(comm_old,
117                                                     comm_old->c_local_group,
118                                                     newcomm))){
119             /* Attach the dist_graph to the newly created communicator */
120             (*newcomm)->c_flags        |= OMPI_COMM_DIST_GRAPH;
121             (*newcomm)->c_topo          = topo_module;
122             (*newcomm)->c_topo->reorder = reorder;
123         }
124         return err;
125     }  /* reorder == yes */
126 
127     mca_topo_base_comm_dist_graph_2_2_0_t *topo = NULL;
128     ompi_proc_t *proc = NULL;
129     MPI_Request  *reqs = NULL;
130     hwloc_cpuset_t set = NULL;
131     hwloc_obj_t object, root_obj;
132     hwloc_obj_t *tracker = NULL;
133     double *local_pattern = NULL;
134     int *vpids, *colors = NULL;
135     int *lindex_to_grank = NULL;
136     int *nodes_roots = NULL, *k = NULL;
137     int *localrank_to_objnum  = NULL;
138     int depth, effective_depth, obj_rank = -1;
139     int num_objs_in_node = 0, num_pus_in_node = 0;
140     int numlevels = 0, num_nodes = 0, num_procs_in_node = 0;
141     int rank, size, newrank = -1, hwloc_err, i, j, idx;
142     int oversubscribing_objs = 0, oversubscribed_pus = 0;
143     uint32_t val, *pval;
144 
145     /* We need to know if the processes are bound. We assume all
146      * processes are in the same state: all bound or none. */
147     if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
148         goto fallback;
149     }
150     root_obj = hwloc_get_root_obj(opal_hwloc_topology);
151     if (NULL == root_obj) goto fallback;
152 
153     topo = topo_module->mtc.dist_graph;
154     rank = ompi_comm_rank(comm_old);
155     size = ompi_comm_size(comm_old);
156 
157     OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
158                          "Process rank is : %i\n",rank));
159     /**
160      * In order to decrease the number of loops let's use a trick:
161      * build the lindex_to_grank in the vpids array, and only allocate
162      * it upon completion of the most costly loop.
163      */
164     vpids = (int *)malloc(size * sizeof(int));
165     colors = (int *)malloc(size * sizeof(int));
166     for(i = 0 ; i < size ; i++) {
167         proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
168         if (( i == rank ) ||
169             (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)))
170             vpids[num_procs_in_node++] = i;
171 
172         pval = &val;
173         OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
174         if( OPAL_SUCCESS != err ) {
175             opal_output(0, "Unable to extract peer %s nodeid from the modex.\n",
176                         OMPI_NAME_PRINT(&(proc->super)));
177             colors[i] = -1;
178             continue;
179         }
180         colors[i] = (int)val;
181     }
182     lindex_to_grank = (int *)malloc(num_procs_in_node * sizeof(int));
183     memcpy(lindex_to_grank, vpids, num_procs_in_node * sizeof(int));
184     memcpy(vpids, colors, size * sizeof(int));
185 
186 #ifdef __DEBUG__
187     if ( 0 == rank ) {
188         dump_int_array(10, ompi_topo_base_framework.framework_output,
189                        "lindex_to_grank : ", "", lindex_to_grank, num_procs_in_node);
190         dump_int_array(10, ompi_topo_base_framework.framework_output,
191                        "Vpids : ", "", colors, size);
192     }
193 #endif
194     /* clean-up dupes in the array */
195     for(i = 0; i < size ; i++) {
196         if ( -1 == vpids[i] ) continue;
197         num_nodes++;  /* compute number of nodes */
198         for(j = i+1; j < size; j++)
199             if( vpids[i] == vpids[j] )
200                 vpids[j] = -1;
201     }
202     if( 0 == num_nodes ) {
203         /* No useful info has been retrieved from the runtime. Fallback
204          * and create a duplicate of the original communicator */
205         free(vpids);
206         free(colors);
207         goto fallback; /* return with success */
208     }
209     /* compute local roots ranks in comm_old */
210     /* Only the global root needs to do this */
211     if(0 == rank) {
212         nodes_roots = (int *)calloc(num_nodes, sizeof(int));
213         for(i = idx = 0; i < size; i++)
214             if( vpids[i] != -1 )
215                 nodes_roots[idx++] = i;
216         OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
217                              "num nodes is %i\n", num_nodes));
218 #ifdef __DEBUG__
219         dump_int_array(10, ompi_topo_base_framework.framework_output,
220                        "Root nodes are :\n", "root ", nodes_roots, num_nodes);
221 #endif
222     }
223     free(vpids);
224 
225     /* if cpubind returns an error, it will be full anyway */
226     set = hwloc_bitmap_alloc_full();
227     hwloc_get_cpubind(opal_hwloc_topology, set, 0);
228     num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
229 
230     /**
231      * In all situations (including heterogeneous environments) all processes must execute
232      * all the calls that involve collective communications, so we have to lay the logic
233      * accordingly.
234      */
235 
236     if(hwloc_bitmap_isincluded(root_obj->cpuset,set)) { /* processes are not bound on the machine */
237         if (0 == rank)
238             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
239                                  ">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n"));
240 
241         /* we try to bind to cores or above objects if enough are present */
242         /* Not sure that cores are present in ALL nodes */
243         depth = hwloc_get_type_or_above_depth(opal_hwloc_topology, HWLOC_OBJ_CORE);
244         num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, depth);
245     } else {    /* the processes are already bound */
246         object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
247         obj_rank = object->logical_index;
248         effective_depth = object->depth;
249         num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
250     }
251     if( (0 == num_objs_in_node) || (0 == num_pus_in_node) ) {  /* deal with bozo cases: COVERITY 1418505 */
252         free(colors);
253         goto fallback; /* return with success */
254     }
255     /* Check for oversubscribing */
256     oversubscribing_objs = check_oversubscribing(rank, num_nodes,
257                                                  num_objs_in_node, num_procs_in_node,
258                                                  nodes_roots, lindex_to_grank, comm_old);
259 
260     if(oversubscribing_objs) {
261         if(hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
262             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
263                                  "Oversubscribing OBJ/CORES resources => Trying to use PUs \n"));
264 
265             oversubscribed_pus = check_oversubscribing(rank, num_nodes,
266                                                        num_pus_in_node, num_procs_in_node,
267                                                        nodes_roots, lindex_to_grank, comm_old);
268             /* Update the data used to compute the correct binding */
269             if (!oversubscribed_pus) {
270                 obj_rank = ompi_process_info.my_local_rank%num_pus_in_node;
271                 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
272                 num_objs_in_node = num_pus_in_node;
273                 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
274                                      "Process %i not bound : binding on PU#%i \n", rank, obj_rank));
275             }
276         } else {
277             /* Bound processes will participate with the same data as before */
278             oversubscribed_pus = check_oversubscribing(rank, num_nodes,
279                                                        num_objs_in_node, num_procs_in_node,
280                                                        nodes_roots, lindex_to_grank, comm_old);
281         }
282     }
283 
284     if( !oversubscribing_objs && !oversubscribed_pus ) {
285         if( hwloc_bitmap_isincluded(root_obj->cpuset, set) ) { /* processes are not bound on the machine */
286             obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
287             effective_depth = depth;
288             object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
289             if( NULL == object) {
290                 free(colors);
291                 hwloc_bitmap_free(set);
292                 goto fallback;  /* return with success */
293             }
294 
295             hwloc_bitmap_copy(set, object->cpuset);
296             hwloc_bitmap_singlify(set); /* we don't want the process to move */
297             hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
298             if( -1 == hwloc_err) {
299                 /* This is a local issue. Either we agree with the rest of the processes to stop the
300                  * reordering or we have to complete the entire process. Let's complete.
301                  */
302                 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
303                                      "Process %i failed to bind on OBJ#%i \n", rank, obj_rank));
304             } else
305                 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
306                                      "Process %i not bound : binding on OBJ#%i \n",rank, obj_rank));
307         } else {
308             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
309                                  "Process %i bound  on OBJ #%i \n"
310                                  "=====> Num obj in node : %i | num pus in node : %i\n",
311                                  rank, obj_rank,
312                                  num_objs_in_node, num_pus_in_node));
313         }
314     } else {
315         OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
316                              "Oversubscribing PUs resources => Rank Reordering Impossible \n"));
317         free(colors);
318         hwloc_bitmap_free(set);
319         goto fallback;  /* return with success */
320     }
321 
322     reqs = (MPI_Request *)calloc(num_procs_in_node-1, sizeof(MPI_Request));
323     if( rank == lindex_to_grank[0] ) {  /* local leader clean the hierarchy */
324         int array_size = effective_depth + 1;
325         int *myhierarchy = (int *)calloc(array_size, sizeof(int));
326 
327         numlevels = 1;
328         myhierarchy[0] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, 0);
329         for (i = 1; i < array_size ; i++) {
330             myhierarchy[i] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, i);
331             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
332                                  "hierarchy[%i] = %i\n", i, myhierarchy[i]));
333             if ((myhierarchy[i] != 0) && (myhierarchy[i] != myhierarchy[i-1]))
334                 numlevels++;
335         }
336 
337         tracker = (hwloc_obj_t *)calloc(numlevels, sizeof(hwloc_obj_t));
338         for(idx = 0, i = 1; i < array_size; i++) {
339             if(myhierarchy[i] != myhierarchy[i-1])
340                 tracker[idx++] = hwloc_get_obj_by_depth(opal_hwloc_topology, i-1, 0);
341         }
342         tracker[idx] = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, 0);
343         free(myhierarchy);
344 
345         OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
346                              ">>>>>>>>>>>>>>>>>>>>> Effective depth is : %i (total depth %i)| num_levels %i\n",
347                              effective_depth, hwloc_topology_get_depth(opal_hwloc_topology), numlevels));
348         for(i = 0 ; i < numlevels ; i++) {
349             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
350                                  "tracker[%i] : arity %i | depth %i\n",
351                                  i, tracker[i]->arity, tracker[i]->depth));
352         }
353         /* get the obj number */
354         localrank_to_objnum = (int *)calloc(num_procs_in_node, sizeof(int));
355         localrank_to_objnum[0] = obj_rank;
356 
357         for(i = 1;  i < num_procs_in_node; i++) {
358             if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&localrank_to_objnum[i], 1, MPI_INT,
359                                                            lindex_to_grank[i], -111, comm_old, &reqs[i-1])))) {
360                 free(reqs); reqs = NULL;
361                 goto release_and_return;
362             }
363         }
364         if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_procs_in_node-1,
365                                                           reqs, MPI_STATUSES_IGNORE))) {
366             free(reqs); reqs = NULL;
367             goto release_and_return;
368         }
369     } else {
370         /* sending my core number to my local master on the node */
371         if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&obj_rank, 1, MPI_INT, lindex_to_grank[0],
372                                                      -111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
373             free(reqs); reqs = NULL;
374             goto release_and_return;
375         }
376     }
377     free(reqs); reqs = NULL;
378 
379     /* Centralized Reordering */
380     if (0 == mca_topo_treematch_component.reorder_mode) {
381         int *k = NULL;
382         int *obj_mapping = NULL;
383         int num_objs_total = 0;
384 
385         /* Gather comm pattern
386          * If weights have been provided take them in account. Otherwise rely
387          * solely on HWLOC information.
388          */
389         if( 0 == rank ) {
390 
391             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
392                                  "========== Centralized Reordering ========= \n"));
393             local_pattern = (double *)calloc(size*size,sizeof(double));
394         } else {
395             local_pattern = (double *)calloc(size,sizeof(double));
396         }
397         if( true == topo->weighted ) {
398             for(i = 0; i < topo->indegree ; i++)
399                 local_pattern[topo->in[i]] += topo->inw[i];
400             for(i = 0; i < topo->outdegree ; i++)
401                 local_pattern[topo->out[i]] += topo->outw[i];
402         }
403         err = comm_old->c_coll->coll_gather( (0 == rank ? MPI_IN_PLACE : local_pattern), size, MPI_DOUBLE,
404                                              local_pattern, size, MPI_DOUBLE,  /* ignored on non-root */
405                                              0, comm_old, comm_old->c_coll->coll_gather_module);
406         if (OMPI_SUCCESS != err) {
407             goto release_and_return;
408         }
409 
410         if( rank == lindex_to_grank[0] ) {
411             tm_topology_t *tm_topology = NULL;
412             int *obj_to_rank_in_comm = NULL;
413             int *hierarchies = NULL;
414             int  min;
415 
416             /* create a table that derives the rank in comm_old from the object number */
417             obj_to_rank_in_comm = (int *)malloc(num_objs_in_node*sizeof(int));
418             for(i = 0 ; i < num_objs_in_node ; i++) {
419                 obj_to_rank_in_comm[i] = -1;
420                 object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, i);
421                 for( j = 0; j < num_procs_in_node ; j++ )
422                     if(localrank_to_objnum[j] == (int)(object->logical_index)) {
423                         obj_to_rank_in_comm[i] = lindex_to_grank[j];
424                         break;
425                     }
426             }
427 
428             /* the global master gathers info from local_masters */
429             if ( 0 == rank ) {
430                 if ( num_nodes > 1 ) {
431                     int *objs_per_node = NULL, displ;
432 
433                     objs_per_node = (int *)calloc(num_nodes, sizeof(int));
434                     reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
435                     objs_per_node[0] = num_objs_in_node;
436                     for(i = 1; i < num_nodes ; i++)
437                         if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(objs_per_node + i, 1, MPI_INT,
438                                                                        nodes_roots[i], -112, comm_old, &reqs[i-1])))) {
439                             free(obj_to_rank_in_comm);
440                             free(objs_per_node);
441                             goto release_and_return;
442                         }
443 
444                     if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
445                                                                       reqs, MPI_STATUSES_IGNORE))) {
446                         free(objs_per_node);
447                         goto release_and_return;
448                     }
449 
450                     for(i = 0; i < num_nodes; i++)
451                         num_objs_total += objs_per_node[i];
452                     obj_mapping = (int *)calloc(num_objs_total,sizeof(int));
453 
454                     memcpy(obj_mapping, obj_to_rank_in_comm, objs_per_node[0]*sizeof(int));
455                     displ = objs_per_node[0];
456                     for(i = 1; i < num_nodes ; i++) {
457                         if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displ, objs_per_node[i], MPI_INT,
458                                                                        nodes_roots[i], -113, comm_old, &reqs[i-1])))) {
459                             free(obj_to_rank_in_comm);
460                             free(objs_per_node);
461                             free(obj_mapping);
462                             goto release_and_return;
463                         }
464                         displ += objs_per_node[i];
465                     }
466                     if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
467                                                                       reqs, MPI_STATUSES_IGNORE))) {
468                         free(obj_to_rank_in_comm);
469                         free(objs_per_node);
470                         free(obj_mapping);
471                         goto release_and_return;
472                     }
473                     free(objs_per_node);
474                 } else {
475                     /* if num_nodes == 1, then it's easy to get the obj mapping */
476                     num_objs_total = num_objs_in_node;
477                     obj_mapping = (int *)calloc(num_objs_total, sizeof(int));
478                     memcpy(obj_mapping, obj_to_rank_in_comm, num_objs_total*sizeof(int));
479                 }
480 #ifdef __DEBUG__
481                 dump_int_array(10, ompi_topo_base_framework.framework_output,
482                                "Obj mapping : ", "", obj_mapping, num_objs_total );
483 #endif
484             } else {
485                 if ( num_nodes > 1 ) {
486                     if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&num_objs_in_node, 1, MPI_INT,
487                                                                  0, -112, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
488                         free(obj_to_rank_in_comm);
489                         goto release_and_return;
490                     }
491                     if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(obj_to_rank_in_comm, num_objs_in_node, MPI_INT,
492                                                                  0, -113, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
493                         free(obj_to_rank_in_comm);
494                         goto release_and_return;
495                     }
496                 }
497             }
498             free(obj_to_rank_in_comm);
499 
500             assert(numlevels < TM_MAX_LEVELS);
501             if( 0 == rank ) {
502                 hierarchies = (int *)malloc(num_nodes*(TM_MAX_LEVELS+1)*sizeof(int));
503             } else {
504                 hierarchies = (int *)malloc((TM_MAX_LEVELS+1)*sizeof(int));
505             }
506 
507             hierarchies[0] = numlevels;
508 
509             for(i = 0 ; i < hierarchies[0]; i++)
510                 hierarchies[i+1] = tracker[i]->arity;
511             for(; i < (TM_MAX_LEVELS+1); i++)  /* fill up everything else with -1 */
512                 hierarchies[i] = -1;
513 
514             /* gather hierarchies iff more than 1 node! */
515             if ( num_nodes > 1 ) {
516                 if( rank != 0 ) {
517                     if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(hierarchies,(TM_MAX_LEVELS+1), MPI_INT, 0,
518                                                                  -114, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
519                         free(hierarchies);
520                         goto release_and_return;
521                     }
522                 } else {
523                     for(i = 1; i < num_nodes ; i++)
524                         if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(hierarchies+i*(TM_MAX_LEVELS+1), (TM_MAX_LEVELS+1), MPI_INT,
525                                                                        nodes_roots[i], -114, comm_old, &reqs[i-1])))) {
526                             free(obj_mapping);
527                             free(hierarchies);
528                             goto release_and_return;
529                         }
530                     if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
531                                                                       reqs, MPI_STATUSES_IGNORE))) {
532                         free(obj_mapping);
533                         free(hierarchies);
534                         goto release_and_return;
535                     }
536                     free(reqs); reqs = NULL;
537                 }
538             }
539 
540             if ( 0 == rank ) {
541                 tm_tree_t *comm_tree = NULL;
542                 tm_solution_t *sol = NULL;
543                 tm_affinity_mat_t *aff_mat = NULL;
544                 double **comm_pattern = NULL;
545 
546 #ifdef __DEBUG__
547                 dump_int_array(10, ompi_topo_base_framework.framework_output,
548                                "hierarchies : ", "", hierarchies, num_nodes*(TM_MAX_LEVELS+1));
549 #endif
550                 tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
551                 tm_topology->nb_levels = hierarchies[0];
552 
553                 /* extract min depth */
554                 for(i = 1 ; i < num_nodes ; i++)
555                     if (hierarchies[i*(TM_MAX_LEVELS+1)] < tm_topology->nb_levels)
556                         tm_topology->nb_levels = hierarchies[i*(TM_MAX_LEVELS+1)];
557 
558                 /* Crush levels in hierarchies too long (ie > tm_topology->nb_levels)*/
559                 for(i = 0; i < num_nodes ; i++) {
560                     int *base_ptr = hierarchies + i*(TM_MAX_LEVELS+1);
561                     int  suppl = *base_ptr - tm_topology->nb_levels;
562                     for(j = 1 ; j <= suppl ; j++)
563                         *(base_ptr + tm_topology->nb_levels) *= *(base_ptr + tm_topology->nb_levels + j);
564                 }
565                 if( num_nodes > 1) {
566                     /* We aggregate all topos => +1 level!*/
567                     tm_topology->nb_levels += 1;
568                     tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
569                     tm_topology->arity[0] = num_nodes;
570                     for(i = 1; i < tm_topology->nb_levels; i++) {  /* compute the minimum for each level */
571                         min = hierarchies[i];
572                         for(j = 1; j < num_nodes ; j++)
573                             if( hierarchies[j*(TM_MAX_LEVELS+1) + i] < min)
574                                 min = hierarchies[j*(TM_MAX_LEVELS+1) + i];
575                         tm_topology->arity[i] = min;
576                     }
577                 } else {
578                     tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
579                     for(i = 0; i < tm_topology->nb_levels; i++)
580                         tm_topology->arity[i] = hierarchies[i+1];
581                 }
582                 free(hierarchies);
583 
584                 for(i = 0; i < tm_topology->nb_levels; i++) {
585                     OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
586                                          "topo_arity[%i] = %i\n", i, tm_topology->arity[i]));
587                 }
588 
589                 /* compute the number of processing elements */
590                 tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
591                 tm_topology->nb_nodes[0] = 1;
592                 for(i = 1 ; i < tm_topology->nb_levels; i++)
593                     tm_topology->nb_nodes[i] = tm_topology->nb_nodes[i-1] * tm_topology->arity[i-1];
594 
595                 /* Build process id tab */
596                 tm_topology->node_id  = (int **)calloc(tm_topology->nb_levels, sizeof(int*));
597                 tm_topology->node_rank = (int **)malloc(sizeof(int *) * tm_topology->nb_levels);
598                 for(i = 0; i < tm_topology->nb_levels; i++) {
599                     tm_topology->node_id[i] = (int *)calloc(tm_topology->nb_nodes[i], sizeof(int));
600                     tm_topology->node_rank[i] = (int * )calloc(tm_topology->nb_nodes[i], sizeof(int));
601                     /*note : we make the hypothesis that logical indexes in hwloc range from
602                       0 to N, are contiguous and crescent.  */
603 
604                     for( j = 0 ; j < (int)tm_topology->nb_nodes[i] ; j++ ) {
605                         tm_topology->node_id[i][j] = j;
606                         tm_topology->node_rank[i][j] = j;
607 
608                         /* Should use object->logical_index */
609                         /* obj = hwloc_get_obj_by_depth(topo,i,j%num_objs_in_node);
610                            id = obj->logical_index + (num_objs_in_node)*(j/num_obj_in_node)*/
611                         /*
612                            int id = core_numbering[j%nb_core_per_nodes] + (nb_core_per_nodes)*(j/nb_core_per_nodes);
613                            topology->node_id[i][j] = id;
614                            topology->node_rank[i][id] = j;
615                         */
616                     }
617                 }
618                 /* unused for now*/
619                 tm_topology->cost = (double*)calloc(tm_topology->nb_levels,sizeof(double));
620 
621                 tm_topology->nb_proc_units = num_objs_total;
622 
623                 tm_topology->nb_constraints = 0;
624                 for(i = 0; i < tm_topology->nb_proc_units ; i++)
625                     if (obj_mapping[i] != -1)
626                         tm_topology->nb_constraints++;
627                 tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints,sizeof(int));
628                 for(idx = 0, i = 0; i < tm_topology->nb_proc_units ; i++)
629                     if (obj_mapping[i] != -1)
630                         tm_topology->constraints[idx++] = obj_mapping[i];
631 
632                 tm_topology->oversub_fact = 1;
633 
634 #ifdef __DEBUG__
635                 assert(num_objs_total == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
636 
637                 for(i = 0; i < tm_topology->nb_levels ; i++) {
638                     opal_output_verbose(10, ompi_topo_base_framework.framework_output,
639                                         "tm topo node_id for level [%i] : ",i);
640                     dump_int_array(10, ompi_topo_base_framework.framework_output,
641                                    "", "", obj_mapping, tm_topology->nb_nodes[i]);
642                 }
643                 tm_display_topology(tm_topology);
644 #endif
645 
646                 comm_pattern = (double **)malloc(size*sizeof(double *));
647                 for(i = 0 ; i < size ; i++)
648                     comm_pattern[i] = local_pattern + i * size;
649                 /* matrix needs to be symmetric */
650                 for( i = 0; i < size ; i++ )
651                     for( j = i; j < size ; j++ ) {
652                         comm_pattern[i][j] = (comm_pattern[i][j] + comm_pattern[j][i]) / 2;
653                         comm_pattern[j][i] = comm_pattern[i][j];
654                     }
655 #ifdef __DEBUG__
656                 opal_output_verbose(10, ompi_topo_base_framework.framework_output,
657                                     "==== COMM PATTERN ====\n");
658                 for( i = 0 ; i < size ; i++) {
659                     dump_double_array(10, ompi_topo_base_framework.framework_output,
660                                       "", "", comm_pattern[i], size);
661                 }
662 #endif
663                 tm_optimize_topology(&tm_topology);
664                 aff_mat = tm_build_affinity_mat(comm_pattern,size);
665                 comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
666                 sol = tm_compute_mapping(tm_topology, comm_tree);
667 
668                 k = (int *)calloc(sol->k_length, sizeof(int));
669                 for(idx = 0 ; idx < (int)sol->k_length ; idx++)
670                     k[idx] = sol->k[idx][0];
671 
672 #ifdef __DEBUG__
673                 opal_output_verbose(10, ompi_topo_base_framework.framework_output,
674                                     "====> nb levels : %i\n",tm_topology->nb_levels);
675                 dump_int_array(10, ompi_topo_base_framework.framework_output,
676                                "Rank permutation sigma/k : ", "", k, num_objs_total);
677                 assert(size == (int)sol->sigma_length);
678                 dump_int_array(10, ompi_topo_base_framework.framework_output,
679                                "Matching : ", "",sol->sigma, sol->sigma_length);
680 #endif
681                 free(obj_mapping);
682                 free(comm_pattern);
683                 free(aff_mat->sum_row);
684                 free(aff_mat);
685                 tm_free_solution(sol);
686                 tm_free_tree(comm_tree);
687                 tm_free_topology(tm_topology);
688             }
689         }
690 
691         /* Todo : Bcast + group creation */
692         /* scatter the ranks */
693         if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_scatter(k, 1, MPI_INT,
694                                                                   &newrank, 1, MPI_INT,
695                                                                   0, comm_old,
696                                                                   comm_old->c_coll->coll_scatter_module))) {
697             if (NULL != k) free(k);
698             goto release_and_return;
699         }
700 
701         if ( 0 == rank )
702             free(k);
703 
704         /* this needs to be optimized but will do for now */
705         if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank, newcomm, false))) {
706             goto release_and_return;
707         }
708         /* end of TODO */
709 
710         /* Attach the dist_graph to the newly created communicator */
711         (*newcomm)->c_flags        |= OMPI_COMM_DIST_GRAPH;
712         (*newcomm)->c_topo          = topo_module;
713         (*newcomm)->c_topo->reorder = reorder;
714 
715     } else { /* partially distributed reordering */
716         int *grank_to_lrank = NULL, *lrank_to_grank = NULL, *marked = NULL;
717         int node_position = 0, offset = 0, pos = 0;
718         ompi_communicator_t *localcomm = NULL;
719 
720         if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, colors[rank], rank,
721                                                    &localcomm, false))) {
722             goto release_and_return;
723         }
724 
725         lrank_to_grank = (int *)calloc(num_procs_in_node, sizeof(int));
726         if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_allgather(&rank, 1, MPI_INT,
727                                                                      lrank_to_grank, 1, MPI_INT,
728                                                                      localcomm, localcomm->c_coll->coll_allgather_module))) {
729             free(lrank_to_grank);
730             ompi_comm_free(&localcomm);
731             goto release_and_return;
732         }
733 
734         grank_to_lrank = (int *)malloc(size * sizeof(int));
735         for(i = 0 ; i < size ; grank_to_lrank[i++] = -1);
736         for(i = 0 ; i < num_procs_in_node ; i++)
737             grank_to_lrank[lrank_to_grank[i]] = i;
738 
739         /* Discover the local patterns */
740         if (rank == lindex_to_grank[0]) {
741             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
742                                  "========== Partially Distributed Reordering ========= \n"));
743             local_pattern = (double *)calloc(num_procs_in_node * num_procs_in_node, sizeof(double));
744         } else {
745             local_pattern = (double *)calloc(num_procs_in_node, sizeof(double));
746         }
747         /* Extract the local communication pattern */
748         if( true == topo->weighted ) {
749             for(i = 0; i < topo->indegree; i++)
750                 if (grank_to_lrank[topo->in[i]] != -1)
751                     local_pattern[grank_to_lrank[topo->in[i]]] += topo->inw[i];
752             for(i = 0; i < topo->outdegree; i++)
753                 if (grank_to_lrank[topo->out[i]] != -1)
754                     local_pattern[grank_to_lrank[topo->out[i]]] += topo->outw[i];
755         }
756         if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_gather((rank == lindex_to_grank[0] ? MPI_IN_PLACE : local_pattern),
757                                                                   num_procs_in_node, MPI_DOUBLE,
758                                                                   local_pattern, num_procs_in_node, MPI_DOUBLE,
759                                                                   0, localcomm, localcomm->c_coll->coll_gather_module))) {
760             free(lrank_to_grank);
761             ompi_comm_free(&localcomm);
762             free(grank_to_lrank);
763             goto release_and_return;
764         }
765 
766         /* The root has now the entire information, so let's crunch it */
767         if (rank == lindex_to_grank[0]) {
768             tm_topology_t  *tm_topology = NULL;
769             tm_tree_t *comm_tree = NULL;
770             tm_solution_t *sol = NULL;
771             tm_affinity_mat_t *aff_mat = NULL;
772             double **comm_pattern = NULL;
773 
774             comm_pattern = (double **)malloc(num_procs_in_node*sizeof(double *));
775             for( i = 0; i < num_procs_in_node; i++ ) {
776                 comm_pattern[i] = local_pattern + i * num_procs_in_node;
777             }
778             /* Matrix needs to be symmetric. Beware: as comm_patterns
779              * refers to local_pattern we indirectly alter the content
780              * of local_pattern */
781             for( i = 0; i < num_procs_in_node ; i++ )
782                 for( j = i; j < num_procs_in_node ; j++ ) {
783                     comm_pattern[i][j] = (comm_pattern[i][j] + comm_pattern[j][i]) / 2;
784                     comm_pattern[j][i] = comm_pattern[i][j];
785                 }
786 
787 #ifdef __DEBUG__
788             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
789                                  "========== COMM PATTERN ============= \n"));
790             for(i = 0 ; i < num_procs_in_node ; i++){
791                 opal_output_verbose(10, ompi_topo_base_framework.framework_output," %i : ",i);
792                 dump_double_array(10, ompi_topo_base_framework.framework_output,
793                                   "", "", comm_pattern[i], num_procs_in_node);
794             }
795             opal_output_verbose(10, ompi_topo_base_framework.framework_output,
796                                 "======================= \n");
797 #endif
798 
799             tm_topology  = (tm_topology_t *)malloc(sizeof(tm_topology_t));
800             tm_topology->nb_levels = numlevels;
801             tm_topology->arity     = (int *)calloc(tm_topology->nb_levels, sizeof(int));
802             tm_topology->nb_nodes  = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
803             tm_topology->node_id   = (int **)malloc(tm_topology->nb_levels*sizeof(int *));
804             tm_topology->node_rank = (int **)malloc(tm_topology->nb_levels*sizeof(int *));
805 
806             for(i = 0 ; i < tm_topology->nb_levels ; i++){
807                 int nb_objs = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, tracker[i]->depth);
808                 tm_topology->nb_nodes[i] = nb_objs;
809                 tm_topology->arity[i]    = tracker[i]->arity;
810                 tm_topology->node_id[i]  = (int *)calloc(tm_topology->nb_nodes[i], sizeof(int));
811                 tm_topology->node_rank[i] = (int * )calloc(tm_topology->nb_nodes[i], sizeof(int));
812                 for(j = 0; j < (int)tm_topology->nb_nodes[i] ; j++){
813                     tm_topology->node_id[i][j] = j;
814                     tm_topology->node_rank[i][j] = j;
815                 }
816             }
817 
818             /* unused for now*/
819             tm_topology->cost = (double*)calloc(tm_topology->nb_levels,sizeof(double));
820 
821             tm_topology->nb_proc_units = num_objs_in_node;
822             //tm_topology->nb_proc_units = num_procs_in_node;
823             tm_topology->nb_constraints = 0;
824             for(i = 0; i < num_procs_in_node ; i++)
825                 if (localrank_to_objnum[i] != -1)
826                     tm_topology->nb_constraints++;
827 
828             tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints,sizeof(int));
829             for(idx = 0,i = 0; i < num_procs_in_node ; i++)
830                 if (localrank_to_objnum[i] != -1)
831                     tm_topology->constraints[idx++] = localrank_to_objnum[i];
832 
833             tm_topology->oversub_fact = 1;
834 
835 #ifdef __DEBUG__
836             assert(num_objs_in_node == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
837             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
838                                  "Levels in topo : %i | num procs in node : %i\n",
839                                  tm_topology->nb_levels,num_procs_in_node));
840             for(i = 0; i < tm_topology->nb_levels ; i++) {
841                 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
842                                      "Nb objs for level %i : %lu | arity %i\n ",
843                                      i, tm_topology->nb_nodes[i],tm_topology->arity[i]));
844                 dump_int_array(10, ompi_topo_base_framework.framework_output,
845                                "", "Obj id ", tm_topology->node_id[i], tm_topology->nb_nodes[i]);
846             }
847             tm_display_topology(tm_topology);
848 #endif
849             tm_optimize_topology(&tm_topology);
850             aff_mat = tm_build_affinity_mat(comm_pattern,num_procs_in_node);
851             comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
852             sol = tm_compute_mapping(tm_topology, comm_tree);
853 
854             assert((int)sol->k_length == num_objs_in_node);
855 
856             k = (int *)calloc(sol->k_length, sizeof(int));
857             for(idx = 0 ; idx < (int)sol->k_length ; idx++)
858                 k[idx] = sol->k[idx][0];
859 
860 #ifdef __DEBUG__
861             OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
862                                  "====> nb levels : %i\n",tm_topology->nb_levels));
863             dump_int_array(10, ompi_topo_base_framework.framework_output,
864                            "Rank permutation sigma/k : ", "", k, num_procs_in_node);
865             assert(num_procs_in_node == (int)sol->sigma_length);
866             dump_int_array(10, ompi_topo_base_framework.framework_output,
867                            "Matching : ", "", sol->sigma, sol->sigma_length);
868 #endif
869 
870             free(aff_mat->sum_row);
871             free(aff_mat);
872             free(comm_pattern);
873             tm_free_solution(sol);
874             tm_free_tree(comm_tree);
875             tm_free_topology(tm_topology);
876         }
877 
878         /* Todo : Bcast + group creation */
879         /* scatter the ranks */
880         if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_scatter(k, 1, MPI_INT,
881                                                                    &newrank, 1, MPI_INT,
882                                                                    0, localcomm,
883                                                                    localcomm->c_coll->coll_scatter_module))) {
884             if (NULL != k) free(k);
885             ompi_comm_free(&localcomm);
886             free(lrank_to_grank);
887             free(grank_to_lrank);
888             goto release_and_return;
889         }
890 
891         /* compute the offset of newrank before the split */
892         /* use the colors array, not the vpids */
893         marked = (int *)malloc((num_nodes-1)*sizeof(int));
894         for(idx = 0 ; idx < num_nodes - 1 ; idx++)
895             marked[idx] = -1;
896 
897         while( (node_position != rank) && (colors[node_position] != colors[rank])) {
898             /* Have we already counted the current color ? */
899             for(idx = 0; idx < pos; idx++)
900                 if( marked[idx] == colors[node_position] )
901                     goto next_iter;  /* yes, let's skip the rest */
902             /* How many elements of this color are here ? none before the current position */
903             for(; idx < size; idx++)
904                 if(colors[idx] == colors[node_position])
905                     offset++;
906             marked[pos++] = colors[node_position];
907           next_iter:
908             node_position++;
909         }
910         newrank += offset;
911         free(marked);
912 
913         if (rank == lindex_to_grank[0])
914             free(k);
915 
916         /* this needs to be optimized but will do for now */
917         if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank, newcomm, false))) {
918             ompi_comm_free(&localcomm);
919             free(lrank_to_grank);
920             free(grank_to_lrank);
921             goto release_and_return;
922         }
923         /* end of TODO */
924 
925         /* Attach the dist_graph to the newly created communicator */
926         (*newcomm)->c_flags        |= OMPI_COMM_DIST_GRAPH;
927         (*newcomm)->c_topo          = topo_module;
928         (*newcomm)->c_topo->reorder = reorder;
929 
930         free(grank_to_lrank);
931         free(lrank_to_grank);
932     } /* distributed reordering end */
933 
934   release_and_return:
935     if (NULL != reqs ) free(reqs);
936     if (NULL != tracker) free(tracker);
937     if (NULL != local_pattern) free(local_pattern);
938     free(colors);
939     if (NULL != lindex_to_grank) free(lindex_to_grank);
940     if (NULL != nodes_roots) free(nodes_roots);  /* only on root */
941     if (NULL != localrank_to_objnum) free(localrank_to_objnum);
942     if( NULL != set) hwloc_bitmap_free(set);
943     /* As the reordering is optional, if we encountered an error during the reordering,
944      * we can safely return with just a duplicate of the original communicator associated
945      * with the topology. */
946     if( OMPI_SUCCESS != err ) goto fallback;
947     return OMPI_SUCCESS;
948 }
949