1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2011-2017 The University of Tennessee and The University
4 * of Tennessee Research Foundation. All rights
5 * reserved.
6 * Copyright (c) 2011-2016 INRIA. All rights reserved.
7 * Copyright (c) 2012-2017 Bordeaux Polytechnic Institute
8 * Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
9 * Copyright (c) 2015-2017 Research Organization for Information Science
10 * and Technology (RIST). All rights reserved.
11 * Copyright (c) 2016 Los Alamos National Security, LLC. All rights
12 * reserved.
13 * Copyright (c) 2017 Cisco Systems, Inc. All rights reserved
14 * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
15 * $COPYRIGHT$
16 *
17 * Additional copyrights may follow
18 *
19 * $HEADER$
20 */
21
22 #include "ompi_config.h"
23
24 #include "opal/constants.h"
25 #include "opal/mca/hwloc/hwloc-internal.h"
26
27 #include "ompi/mca/topo/treematch/topo_treematch.h"
28 #include "ompi/mca/topo/treematch/treematch/treematch.h"
29 #include "ompi/mca/topo/treematch/treematch/tm_mapping.h"
30 #include "ompi/mca/topo/base/base.h"
31
32 #include "ompi/communicator/communicator.h"
33 #include "ompi/info/info.h"
34
35 #include "ompi/mca/pml/pml.h"
36
37 #include "opal/mca/pmix/pmix.h"
38
39 /* #define __DEBUG__ 1 */
40
41 /**
 * This function is an allreduce between all processes to detect oversubscription.
43 * On each node, the local_procs will be a different array, that contains only the
44 * local processes. Thus, that process will compute the node oversubscription and will
45 * bring this value to the operation, while every other process on the node will
46 * contribute 0.
 * Doing an AllReduce might be overkill for this situation, but it should remain
 * more scalable than a star reduction (between the roots of each node (nodes_roots),
 * followed by a bcast to all processes).
50 */
static int check_oversubscribing(int rank,
                                 int num_nodes,
                                 int num_objs_in_node,
                                 int num_procs_in_node,
                                 int *nodes_roots,
                                 int *local_procs,
                                 ompi_communicator_t *comm_old)
{
    /* Per-node contribution: 1 if this node has more local processes than
     * available objects, 0 otherwise. Only the node-local root (the first
     * entry of local_procs) contributes a non-zero value, so the global sum
     * counts the number of oversubscribed nodes. */
    int total_oversub = 0, my_contribution = 0, rc;

    if ((rank == local_procs[0]) && (num_objs_in_node < num_procs_in_node)) {
        my_contribution = 1;
    }

    rc = comm_old->c_coll->coll_allreduce(&my_contribution, &total_oversub, 1, MPI_INT,
                                          MPI_SUM, comm_old,
                                          comm_old->c_coll->coll_allreduce_module);
    if (OMPI_SUCCESS != rc) {
        return rc;  /* propagate the collective's error code */
    }

    /* Non-zero iff at least one node is oversubscribed. */
    return total_oversub;
}
73
74 #ifdef __DEBUG__
/**
 * Debug helper: dump an integer array through the verbose output stream.
 *
 * @param level        verbosity level passed to opal_output_verbose
 * @param output_id    output stream id; -1 means "disabled", nothing is printed
 * @param prolog       string printed once before the array
 * @param line_prolog  string printed before each element
 * @param array        the integers to print (length elements)
 * @param length       number of elements in array
 */
static void dump_int_array( int level, int output_id, const char* prolog, const char* line_prolog, const int* array, size_t length )
{
    size_t i;
    if( -1 == output_id ) return;

    opal_output_verbose(level, output_id, "%s : ", prolog);
    for(i = 0; i < length ; i++)
        /* %zu is the correct conversion for size_t (%lu is UB on LLP64 platforms) */
        opal_output_verbose(level, output_id, "%s [%zu:%i] ", line_prolog, i, array[i]);
    opal_output_verbose(level, output_id, "\n");
}
/**
 * Debug helper: dump a double array through the verbose output stream.
 *
 * @param level        verbosity level passed to opal_output_verbose
 * @param output_id    output stream id; -1 means "disabled", nothing is printed
 * @param prolog       string printed once before the array
 * @param line_prolog  string printed before each element
 * @param array        the doubles to print (length elements)
 * @param length       number of elements in array
 */
static void dump_double_array( int level, int output_id, const char* prolog, const char* line_prolog, const double* array, size_t length )
{
    size_t i;

    if( -1 == output_id ) return;
    opal_output_verbose(level, output_id, "%s : ", prolog);
    for(i = 0; i < length ; i++)
        /* %zu is the correct conversion for size_t (%lu is UB on LLP64 platforms) */
        opal_output_verbose(level, output_id, "%s [%zu:%lf] ", line_prolog, i, array[i]);
    opal_output_verbose(level, output_id, "\n");
}
95 #endif
96
mca_topo_treematch_dist_graph_create(mca_topo_base_module_t * topo_module,ompi_communicator_t * comm_old,int n,const int nodes[],const int degrees[],const int targets[],const int weights[],struct opal_info_t * info,int reorder,ompi_communicator_t ** newcomm)97 int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
98 ompi_communicator_t *comm_old,
99 int n, const int nodes[],
100 const int degrees[], const int targets[],
101 const int weights[],
102 struct opal_info_t *info, int reorder,
103 ompi_communicator_t **newcomm)
104 {
105 int err;
106
107 if (OMPI_SUCCESS != (err = mca_topo_base_dist_graph_distribute(topo_module, comm_old, n, nodes,
108 degrees, targets, weights,
109 &(topo_module->mtc.dist_graph))))
110 return err;
111
112 if(!reorder) { /* No reorder. Create a new communicator, then */
113 /* jump out to attach the dist_graph and return */
114 fallback:
115
116 if( OMPI_SUCCESS == (err = ompi_comm_create(comm_old,
117 comm_old->c_local_group,
118 newcomm))){
119 /* Attach the dist_graph to the newly created communicator */
120 (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
121 (*newcomm)->c_topo = topo_module;
122 (*newcomm)->c_topo->reorder = reorder;
123 }
124 return err;
125 } /* reorder == yes */
126
127 mca_topo_base_comm_dist_graph_2_2_0_t *topo = NULL;
128 ompi_proc_t *proc = NULL;
129 MPI_Request *reqs = NULL;
130 hwloc_cpuset_t set = NULL;
131 hwloc_obj_t object, root_obj;
132 hwloc_obj_t *tracker = NULL;
133 double *local_pattern = NULL;
134 int *vpids, *colors = NULL;
135 int *lindex_to_grank = NULL;
136 int *nodes_roots = NULL, *k = NULL;
137 int *localrank_to_objnum = NULL;
138 int depth, effective_depth, obj_rank = -1;
139 int num_objs_in_node = 0, num_pus_in_node = 0;
140 int numlevels = 0, num_nodes = 0, num_procs_in_node = 0;
141 int rank, size, newrank = -1, hwloc_err, i, j, idx;
142 int oversubscribing_objs = 0, oversubscribed_pus = 0;
143 uint32_t val, *pval;
144
145 /* We need to know if the processes are bound. We assume all
146 * processes are in the same state: all bound or none. */
147 if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
148 goto fallback;
149 }
150 root_obj = hwloc_get_root_obj(opal_hwloc_topology);
151 if (NULL == root_obj) goto fallback;
152
153 topo = topo_module->mtc.dist_graph;
154 rank = ompi_comm_rank(comm_old);
155 size = ompi_comm_size(comm_old);
156
157 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
158 "Process rank is : %i\n",rank));
159 /**
160 * In order to decrease the number of loops let's use a trick:
161 * build the lindex_to_grank in the vpids array, and only allocate
162 * it upon completion of the most costly loop.
163 */
164 vpids = (int *)malloc(size * sizeof(int));
165 colors = (int *)malloc(size * sizeof(int));
166 for(i = 0 ; i < size ; i++) {
167 proc = ompi_group_peer_lookup(comm_old->c_local_group, i);
168 if (( i == rank ) ||
169 (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags)))
170 vpids[num_procs_in_node++] = i;
171
172 pval = &val;
173 OPAL_MODEX_RECV_VALUE(err, OPAL_PMIX_NODEID, &(proc->super.proc_name), &pval, OPAL_UINT32);
174 if( OPAL_SUCCESS != err ) {
175 opal_output(0, "Unable to extract peer %s nodeid from the modex.\n",
176 OMPI_NAME_PRINT(&(proc->super)));
177 colors[i] = -1;
178 continue;
179 }
180 colors[i] = (int)val;
181 }
182 lindex_to_grank = (int *)malloc(num_procs_in_node * sizeof(int));
183 memcpy(lindex_to_grank, vpids, num_procs_in_node * sizeof(int));
184 memcpy(vpids, colors, size * sizeof(int));
185
186 #ifdef __DEBUG__
187 if ( 0 == rank ) {
188 dump_int_array(10, ompi_topo_base_framework.framework_output,
189 "lindex_to_grank : ", "", lindex_to_grank, num_procs_in_node);
190 dump_int_array(10, ompi_topo_base_framework.framework_output,
191 "Vpids : ", "", colors, size);
192 }
193 #endif
194 /* clean-up dupes in the array */
195 for(i = 0; i < size ; i++) {
196 if ( -1 == vpids[i] ) continue;
197 num_nodes++; /* compute number of nodes */
198 for(j = i+1; j < size; j++)
199 if( vpids[i] == vpids[j] )
200 vpids[j] = -1;
201 }
202 if( 0 == num_nodes ) {
203 /* No useful info has been retrieved from the runtime. Fallback
204 * and create a duplicate of the original communicator */
205 free(vpids);
206 free(colors);
207 goto fallback; /* return with success */
208 }
209 /* compute local roots ranks in comm_old */
210 /* Only the global root needs to do this */
211 if(0 == rank) {
212 nodes_roots = (int *)calloc(num_nodes, sizeof(int));
213 for(i = idx = 0; i < size; i++)
214 if( vpids[i] != -1 )
215 nodes_roots[idx++] = i;
216 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
217 "num nodes is %i\n", num_nodes));
218 #ifdef __DEBUG__
219 dump_int_array(10, ompi_topo_base_framework.framework_output,
220 "Root nodes are :\n", "root ", nodes_roots, num_nodes);
221 #endif
222 }
223 free(vpids);
224
225 /* if cpubind returns an error, it will be full anyway */
226 set = hwloc_bitmap_alloc_full();
227 hwloc_get_cpubind(opal_hwloc_topology, set, 0);
228 num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU);
229
230 /**
231 * In all situations (including heterogeneous environments) all processes must execute
232 * all the calls that involve collective communications, so we have to lay the logic
233 * accordingly.
234 */
235
236 if(hwloc_bitmap_isincluded(root_obj->cpuset,set)) { /* processes are not bound on the machine */
237 if (0 == rank)
238 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
239 ">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n"));
240
241 /* we try to bind to cores or above objects if enough are present */
242 /* Not sure that cores are present in ALL nodes */
243 depth = hwloc_get_type_or_above_depth(opal_hwloc_topology, HWLOC_OBJ_CORE);
244 num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, depth);
245 } else { /* the processes are already bound */
246 object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology, set);
247 obj_rank = object->logical_index;
248 effective_depth = object->depth;
249 num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth);
250 }
251 if( (0 == num_objs_in_node) || (0 == num_pus_in_node) ) { /* deal with bozo cases: COVERITY 1418505 */
252 free(colors);
253 goto fallback; /* return with success */
254 }
255 /* Check for oversubscribing */
256 oversubscribing_objs = check_oversubscribing(rank, num_nodes,
257 num_objs_in_node, num_procs_in_node,
258 nodes_roots, lindex_to_grank, comm_old);
259
260 if(oversubscribing_objs) {
261 if(hwloc_bitmap_isincluded(root_obj->cpuset, set)) { /* processes are not bound on the machine */
262 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
263 "Oversubscribing OBJ/CORES resources => Trying to use PUs \n"));
264
265 oversubscribed_pus = check_oversubscribing(rank, num_nodes,
266 num_pus_in_node, num_procs_in_node,
267 nodes_roots, lindex_to_grank, comm_old);
268 /* Update the data used to compute the correct binding */
269 if (!oversubscribed_pus) {
270 obj_rank = ompi_process_info.my_local_rank%num_pus_in_node;
271 effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1;
272 num_objs_in_node = num_pus_in_node;
273 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
274 "Process %i not bound : binding on PU#%i \n", rank, obj_rank));
275 }
276 } else {
277 /* Bound processes will participate with the same data as before */
278 oversubscribed_pus = check_oversubscribing(rank, num_nodes,
279 num_objs_in_node, num_procs_in_node,
280 nodes_roots, lindex_to_grank, comm_old);
281 }
282 }
283
284 if( !oversubscribing_objs && !oversubscribed_pus ) {
285 if( hwloc_bitmap_isincluded(root_obj->cpuset, set) ) { /* processes are not bound on the machine */
286 obj_rank = ompi_process_info.my_local_rank%num_objs_in_node;
287 effective_depth = depth;
288 object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, obj_rank);
289 if( NULL == object) {
290 free(colors);
291 hwloc_bitmap_free(set);
292 goto fallback; /* return with success */
293 }
294
295 hwloc_bitmap_copy(set, object->cpuset);
296 hwloc_bitmap_singlify(set); /* we don't want the process to move */
297 hwloc_err = hwloc_set_cpubind(opal_hwloc_topology, set, 0);
298 if( -1 == hwloc_err) {
299 /* This is a local issue. Either we agree with the rest of the processes to stop the
300 * reordering or we have to complete the entire process. Let's complete.
301 */
302 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
303 "Process %i failed to bind on OBJ#%i \n", rank, obj_rank));
304 } else
305 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
306 "Process %i not bound : binding on OBJ#%i \n",rank, obj_rank));
307 } else {
308 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
309 "Process %i bound on OBJ #%i \n"
310 "=====> Num obj in node : %i | num pus in node : %i\n",
311 rank, obj_rank,
312 num_objs_in_node, num_pus_in_node));
313 }
314 } else {
315 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
316 "Oversubscribing PUs resources => Rank Reordering Impossible \n"));
317 free(colors);
318 hwloc_bitmap_free(set);
319 goto fallback; /* return with success */
320 }
321
322 reqs = (MPI_Request *)calloc(num_procs_in_node-1, sizeof(MPI_Request));
323 if( rank == lindex_to_grank[0] ) { /* local leader clean the hierarchy */
324 int array_size = effective_depth + 1;
325 int *myhierarchy = (int *)calloc(array_size, sizeof(int));
326
327 numlevels = 1;
328 myhierarchy[0] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, 0);
329 for (i = 1; i < array_size ; i++) {
330 myhierarchy[i] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, i);
331 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
332 "hierarchy[%i] = %i\n", i, myhierarchy[i]));
333 if ((myhierarchy[i] != 0) && (myhierarchy[i] != myhierarchy[i-1]))
334 numlevels++;
335 }
336
337 tracker = (hwloc_obj_t *)calloc(numlevels, sizeof(hwloc_obj_t));
338 for(idx = 0, i = 1; i < array_size; i++) {
339 if(myhierarchy[i] != myhierarchy[i-1])
340 tracker[idx++] = hwloc_get_obj_by_depth(opal_hwloc_topology, i-1, 0);
341 }
342 tracker[idx] = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, 0);
343 free(myhierarchy);
344
345 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
346 ">>>>>>>>>>>>>>>>>>>>> Effective depth is : %i (total depth %i)| num_levels %i\n",
347 effective_depth, hwloc_topology_get_depth(opal_hwloc_topology), numlevels));
348 for(i = 0 ; i < numlevels ; i++) {
349 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
350 "tracker[%i] : arity %i | depth %i\n",
351 i, tracker[i]->arity, tracker[i]->depth));
352 }
353 /* get the obj number */
354 localrank_to_objnum = (int *)calloc(num_procs_in_node, sizeof(int));
355 localrank_to_objnum[0] = obj_rank;
356
357 for(i = 1; i < num_procs_in_node; i++) {
358 if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&localrank_to_objnum[i], 1, MPI_INT,
359 lindex_to_grank[i], -111, comm_old, &reqs[i-1])))) {
360 free(reqs); reqs = NULL;
361 goto release_and_return;
362 }
363 }
364 if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_procs_in_node-1,
365 reqs, MPI_STATUSES_IGNORE))) {
366 free(reqs); reqs = NULL;
367 goto release_and_return;
368 }
369 } else {
370 /* sending my core number to my local master on the node */
371 if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&obj_rank, 1, MPI_INT, lindex_to_grank[0],
372 -111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
373 free(reqs); reqs = NULL;
374 goto release_and_return;
375 }
376 }
377 free(reqs); reqs = NULL;
378
379 /* Centralized Reordering */
380 if (0 == mca_topo_treematch_component.reorder_mode) {
381 int *k = NULL;
382 int *obj_mapping = NULL;
383 int num_objs_total = 0;
384
385 /* Gather comm pattern
386 * If weights have been provided take them in account. Otherwise rely
387 * solely on HWLOC information.
388 */
389 if( 0 == rank ) {
390
391 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
392 "========== Centralized Reordering ========= \n"));
393 local_pattern = (double *)calloc(size*size,sizeof(double));
394 } else {
395 local_pattern = (double *)calloc(size,sizeof(double));
396 }
397 if( true == topo->weighted ) {
398 for(i = 0; i < topo->indegree ; i++)
399 local_pattern[topo->in[i]] += topo->inw[i];
400 for(i = 0; i < topo->outdegree ; i++)
401 local_pattern[topo->out[i]] += topo->outw[i];
402 }
403 err = comm_old->c_coll->coll_gather( (0 == rank ? MPI_IN_PLACE : local_pattern), size, MPI_DOUBLE,
404 local_pattern, size, MPI_DOUBLE, /* ignored on non-root */
405 0, comm_old, comm_old->c_coll->coll_gather_module);
406 if (OMPI_SUCCESS != err) {
407 goto release_and_return;
408 }
409
410 if( rank == lindex_to_grank[0] ) {
411 tm_topology_t *tm_topology = NULL;
412 int *obj_to_rank_in_comm = NULL;
413 int *hierarchies = NULL;
414 int min;
415
416 /* create a table that derives the rank in comm_old from the object number */
417 obj_to_rank_in_comm = (int *)malloc(num_objs_in_node*sizeof(int));
418 for(i = 0 ; i < num_objs_in_node ; i++) {
419 obj_to_rank_in_comm[i] = -1;
420 object = hwloc_get_obj_by_depth(opal_hwloc_topology, effective_depth, i);
421 for( j = 0; j < num_procs_in_node ; j++ )
422 if(localrank_to_objnum[j] == (int)(object->logical_index)) {
423 obj_to_rank_in_comm[i] = lindex_to_grank[j];
424 break;
425 }
426 }
427
428 /* the global master gathers info from local_masters */
429 if ( 0 == rank ) {
430 if ( num_nodes > 1 ) {
431 int *objs_per_node = NULL, displ;
432
433 objs_per_node = (int *)calloc(num_nodes, sizeof(int));
434 reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
435 objs_per_node[0] = num_objs_in_node;
436 for(i = 1; i < num_nodes ; i++)
437 if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(objs_per_node + i, 1, MPI_INT,
438 nodes_roots[i], -112, comm_old, &reqs[i-1])))) {
439 free(obj_to_rank_in_comm);
440 free(objs_per_node);
441 goto release_and_return;
442 }
443
444 if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
445 reqs, MPI_STATUSES_IGNORE))) {
446 free(objs_per_node);
447 goto release_and_return;
448 }
449
450 for(i = 0; i < num_nodes; i++)
451 num_objs_total += objs_per_node[i];
452 obj_mapping = (int *)calloc(num_objs_total,sizeof(int));
453
454 memcpy(obj_mapping, obj_to_rank_in_comm, objs_per_node[0]*sizeof(int));
455 displ = objs_per_node[0];
456 for(i = 1; i < num_nodes ; i++) {
457 if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displ, objs_per_node[i], MPI_INT,
458 nodes_roots[i], -113, comm_old, &reqs[i-1])))) {
459 free(obj_to_rank_in_comm);
460 free(objs_per_node);
461 free(obj_mapping);
462 goto release_and_return;
463 }
464 displ += objs_per_node[i];
465 }
466 if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
467 reqs, MPI_STATUSES_IGNORE))) {
468 free(obj_to_rank_in_comm);
469 free(objs_per_node);
470 free(obj_mapping);
471 goto release_and_return;
472 }
473 free(objs_per_node);
474 } else {
475 /* if num_nodes == 1, then it's easy to get the obj mapping */
476 num_objs_total = num_objs_in_node;
477 obj_mapping = (int *)calloc(num_objs_total, sizeof(int));
478 memcpy(obj_mapping, obj_to_rank_in_comm, num_objs_total*sizeof(int));
479 }
480 #ifdef __DEBUG__
481 dump_int_array(10, ompi_topo_base_framework.framework_output,
482 "Obj mapping : ", "", obj_mapping, num_objs_total );
483 #endif
484 } else {
485 if ( num_nodes > 1 ) {
486 if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&num_objs_in_node, 1, MPI_INT,
487 0, -112, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
488 free(obj_to_rank_in_comm);
489 goto release_and_return;
490 }
491 if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(obj_to_rank_in_comm, num_objs_in_node, MPI_INT,
492 0, -113, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
493 free(obj_to_rank_in_comm);
494 goto release_and_return;
495 }
496 }
497 }
498 free(obj_to_rank_in_comm);
499
500 assert(numlevels < TM_MAX_LEVELS);
501 if( 0 == rank ) {
502 hierarchies = (int *)malloc(num_nodes*(TM_MAX_LEVELS+1)*sizeof(int));
503 } else {
504 hierarchies = (int *)malloc((TM_MAX_LEVELS+1)*sizeof(int));
505 }
506
507 hierarchies[0] = numlevels;
508
509 for(i = 0 ; i < hierarchies[0]; i++)
510 hierarchies[i+1] = tracker[i]->arity;
511 for(; i < (TM_MAX_LEVELS+1); i++) /* fill up everything else with -1 */
512 hierarchies[i] = -1;
513
514 /* gather hierarchies iff more than 1 node! */
515 if ( num_nodes > 1 ) {
516 if( rank != 0 ) {
517 if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(hierarchies,(TM_MAX_LEVELS+1), MPI_INT, 0,
518 -114, MCA_PML_BASE_SEND_STANDARD, comm_old)))) {
519 free(hierarchies);
520 goto release_and_return;
521 }
522 } else {
523 for(i = 1; i < num_nodes ; i++)
524 if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(hierarchies+i*(TM_MAX_LEVELS+1), (TM_MAX_LEVELS+1), MPI_INT,
525 nodes_roots[i], -114, comm_old, &reqs[i-1])))) {
526 free(obj_mapping);
527 free(hierarchies);
528 goto release_and_return;
529 }
530 if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1,
531 reqs, MPI_STATUSES_IGNORE))) {
532 free(obj_mapping);
533 free(hierarchies);
534 goto release_and_return;
535 }
536 free(reqs); reqs = NULL;
537 }
538 }
539
540 if ( 0 == rank ) {
541 tm_tree_t *comm_tree = NULL;
542 tm_solution_t *sol = NULL;
543 tm_affinity_mat_t *aff_mat = NULL;
544 double **comm_pattern = NULL;
545
546 #ifdef __DEBUG__
547 dump_int_array(10, ompi_topo_base_framework.framework_output,
548 "hierarchies : ", "", hierarchies, num_nodes*(TM_MAX_LEVELS+1));
549 #endif
550 tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
551 tm_topology->nb_levels = hierarchies[0];
552
553 /* extract min depth */
554 for(i = 1 ; i < num_nodes ; i++)
555 if (hierarchies[i*(TM_MAX_LEVELS+1)] < tm_topology->nb_levels)
556 tm_topology->nb_levels = hierarchies[i*(TM_MAX_LEVELS+1)];
557
558 /* Crush levels in hierarchies too long (ie > tm_topology->nb_levels)*/
559 for(i = 0; i < num_nodes ; i++) {
560 int *base_ptr = hierarchies + i*(TM_MAX_LEVELS+1);
561 int suppl = *base_ptr - tm_topology->nb_levels;
562 for(j = 1 ; j <= suppl ; j++)
563 *(base_ptr + tm_topology->nb_levels) *= *(base_ptr + tm_topology->nb_levels + j);
564 }
565 if( num_nodes > 1) {
566 /* We aggregate all topos => +1 level!*/
567 tm_topology->nb_levels += 1;
568 tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
569 tm_topology->arity[0] = num_nodes;
570 for(i = 1; i < tm_topology->nb_levels; i++) { /* compute the minimum for each level */
571 min = hierarchies[i];
572 for(j = 1; j < num_nodes ; j++)
573 if( hierarchies[j*(TM_MAX_LEVELS+1) + i] < min)
574 min = hierarchies[j*(TM_MAX_LEVELS+1) + i];
575 tm_topology->arity[i] = min;
576 }
577 } else {
578 tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
579 for(i = 0; i < tm_topology->nb_levels; i++)
580 tm_topology->arity[i] = hierarchies[i+1];
581 }
582 free(hierarchies);
583
584 for(i = 0; i < tm_topology->nb_levels; i++) {
585 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
586 "topo_arity[%i] = %i\n", i, tm_topology->arity[i]));
587 }
588
589 /* compute the number of processing elements */
590 tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
591 tm_topology->nb_nodes[0] = 1;
592 for(i = 1 ; i < tm_topology->nb_levels; i++)
593 tm_topology->nb_nodes[i] = tm_topology->nb_nodes[i-1] * tm_topology->arity[i-1];
594
595 /* Build process id tab */
596 tm_topology->node_id = (int **)calloc(tm_topology->nb_levels, sizeof(int*));
597 tm_topology->node_rank = (int **)malloc(sizeof(int *) * tm_topology->nb_levels);
598 for(i = 0; i < tm_topology->nb_levels; i++) {
599 tm_topology->node_id[i] = (int *)calloc(tm_topology->nb_nodes[i], sizeof(int));
600 tm_topology->node_rank[i] = (int * )calloc(tm_topology->nb_nodes[i], sizeof(int));
601 /*note : we make the hypothesis that logical indexes in hwloc range from
602 0 to N, are contiguous and crescent. */
603
604 for( j = 0 ; j < (int)tm_topology->nb_nodes[i] ; j++ ) {
605 tm_topology->node_id[i][j] = j;
606 tm_topology->node_rank[i][j] = j;
607
608 /* Should use object->logical_index */
609 /* obj = hwloc_get_obj_by_depth(topo,i,j%num_objs_in_node);
610 id = obj->logical_index + (num_objs_in_node)*(j/num_obj_in_node)*/
611 /*
612 int id = core_numbering[j%nb_core_per_nodes] + (nb_core_per_nodes)*(j/nb_core_per_nodes);
613 topology->node_id[i][j] = id;
614 topology->node_rank[i][id] = j;
615 */
616 }
617 }
618 /* unused for now*/
619 tm_topology->cost = (double*)calloc(tm_topology->nb_levels,sizeof(double));
620
621 tm_topology->nb_proc_units = num_objs_total;
622
623 tm_topology->nb_constraints = 0;
624 for(i = 0; i < tm_topology->nb_proc_units ; i++)
625 if (obj_mapping[i] != -1)
626 tm_topology->nb_constraints++;
627 tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints,sizeof(int));
628 for(idx = 0, i = 0; i < tm_topology->nb_proc_units ; i++)
629 if (obj_mapping[i] != -1)
630 tm_topology->constraints[idx++] = obj_mapping[i];
631
632 tm_topology->oversub_fact = 1;
633
634 #ifdef __DEBUG__
635 assert(num_objs_total == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
636
637 for(i = 0; i < tm_topology->nb_levels ; i++) {
638 opal_output_verbose(10, ompi_topo_base_framework.framework_output,
639 "tm topo node_id for level [%i] : ",i);
640 dump_int_array(10, ompi_topo_base_framework.framework_output,
641 "", "", obj_mapping, tm_topology->nb_nodes[i]);
642 }
643 tm_display_topology(tm_topology);
644 #endif
645
646 comm_pattern = (double **)malloc(size*sizeof(double *));
647 for(i = 0 ; i < size ; i++)
648 comm_pattern[i] = local_pattern + i * size;
649 /* matrix needs to be symmetric */
650 for( i = 0; i < size ; i++ )
651 for( j = i; j < size ; j++ ) {
652 comm_pattern[i][j] = (comm_pattern[i][j] + comm_pattern[j][i]) / 2;
653 comm_pattern[j][i] = comm_pattern[i][j];
654 }
655 #ifdef __DEBUG__
656 opal_output_verbose(10, ompi_topo_base_framework.framework_output,
657 "==== COMM PATTERN ====\n");
658 for( i = 0 ; i < size ; i++) {
659 dump_double_array(10, ompi_topo_base_framework.framework_output,
660 "", "", comm_pattern[i], size);
661 }
662 #endif
663 tm_optimize_topology(&tm_topology);
664 aff_mat = tm_build_affinity_mat(comm_pattern,size);
665 comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
666 sol = tm_compute_mapping(tm_topology, comm_tree);
667
668 k = (int *)calloc(sol->k_length, sizeof(int));
669 for(idx = 0 ; idx < (int)sol->k_length ; idx++)
670 k[idx] = sol->k[idx][0];
671
672 #ifdef __DEBUG__
673 opal_output_verbose(10, ompi_topo_base_framework.framework_output,
674 "====> nb levels : %i\n",tm_topology->nb_levels);
675 dump_int_array(10, ompi_topo_base_framework.framework_output,
676 "Rank permutation sigma/k : ", "", k, num_objs_total);
677 assert(size == (int)sol->sigma_length);
678 dump_int_array(10, ompi_topo_base_framework.framework_output,
679 "Matching : ", "",sol->sigma, sol->sigma_length);
680 #endif
681 free(obj_mapping);
682 free(comm_pattern);
683 free(aff_mat->sum_row);
684 free(aff_mat);
685 tm_free_solution(sol);
686 tm_free_tree(comm_tree);
687 tm_free_topology(tm_topology);
688 }
689 }
690
691 /* Todo : Bcast + group creation */
692 /* scatter the ranks */
693 if (OMPI_SUCCESS != (err = comm_old->c_coll->coll_scatter(k, 1, MPI_INT,
694 &newrank, 1, MPI_INT,
695 0, comm_old,
696 comm_old->c_coll->coll_scatter_module))) {
697 if (NULL != k) free(k);
698 goto release_and_return;
699 }
700
701 if ( 0 == rank )
702 free(k);
703
704 /* this needs to be optimized but will do for now */
705 if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank, newcomm, false))) {
706 goto release_and_return;
707 }
708 /* end of TODO */
709
710 /* Attach the dist_graph to the newly created communicator */
711 (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
712 (*newcomm)->c_topo = topo_module;
713 (*newcomm)->c_topo->reorder = reorder;
714
715 } else { /* partially distributed reordering */
716 int *grank_to_lrank = NULL, *lrank_to_grank = NULL, *marked = NULL;
717 int node_position = 0, offset = 0, pos = 0;
718 ompi_communicator_t *localcomm = NULL;
719
720 if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, colors[rank], rank,
721 &localcomm, false))) {
722 goto release_and_return;
723 }
724
725 lrank_to_grank = (int *)calloc(num_procs_in_node, sizeof(int));
726 if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_allgather(&rank, 1, MPI_INT,
727 lrank_to_grank, 1, MPI_INT,
728 localcomm, localcomm->c_coll->coll_allgather_module))) {
729 free(lrank_to_grank);
730 ompi_comm_free(&localcomm);
731 goto release_and_return;
732 }
733
734 grank_to_lrank = (int *)malloc(size * sizeof(int));
735 for(i = 0 ; i < size ; grank_to_lrank[i++] = -1);
736 for(i = 0 ; i < num_procs_in_node ; i++)
737 grank_to_lrank[lrank_to_grank[i]] = i;
738
739 /* Discover the local patterns */
740 if (rank == lindex_to_grank[0]) {
741 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
742 "========== Partially Distributed Reordering ========= \n"));
743 local_pattern = (double *)calloc(num_procs_in_node * num_procs_in_node, sizeof(double));
744 } else {
745 local_pattern = (double *)calloc(num_procs_in_node, sizeof(double));
746 }
747 /* Extract the local communication pattern */
748 if( true == topo->weighted ) {
749 for(i = 0; i < topo->indegree; i++)
750 if (grank_to_lrank[topo->in[i]] != -1)
751 local_pattern[grank_to_lrank[topo->in[i]]] += topo->inw[i];
752 for(i = 0; i < topo->outdegree; i++)
753 if (grank_to_lrank[topo->out[i]] != -1)
754 local_pattern[grank_to_lrank[topo->out[i]]] += topo->outw[i];
755 }
756 if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_gather((rank == lindex_to_grank[0] ? MPI_IN_PLACE : local_pattern),
757 num_procs_in_node, MPI_DOUBLE,
758 local_pattern, num_procs_in_node, MPI_DOUBLE,
759 0, localcomm, localcomm->c_coll->coll_gather_module))) {
760 free(lrank_to_grank);
761 ompi_comm_free(&localcomm);
762 free(grank_to_lrank);
763 goto release_and_return;
764 }
765
766 /* The root has now the entire information, so let's crunch it */
767 if (rank == lindex_to_grank[0]) {
768 tm_topology_t *tm_topology = NULL;
769 tm_tree_t *comm_tree = NULL;
770 tm_solution_t *sol = NULL;
771 tm_affinity_mat_t *aff_mat = NULL;
772 double **comm_pattern = NULL;
773
774 comm_pattern = (double **)malloc(num_procs_in_node*sizeof(double *));
775 for( i = 0; i < num_procs_in_node; i++ ) {
776 comm_pattern[i] = local_pattern + i * num_procs_in_node;
777 }
778 /* Matrix needs to be symmetric. Beware: as comm_patterns
779 * refers to local_pattern we indirectly alter the content
780 * of local_pattern */
781 for( i = 0; i < num_procs_in_node ; i++ )
782 for( j = i; j < num_procs_in_node ; j++ ) {
783 comm_pattern[i][j] = (comm_pattern[i][j] + comm_pattern[j][i]) / 2;
784 comm_pattern[j][i] = comm_pattern[i][j];
785 }
786
787 #ifdef __DEBUG__
788 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
789 "========== COMM PATTERN ============= \n"));
790 for(i = 0 ; i < num_procs_in_node ; i++){
791 opal_output_verbose(10, ompi_topo_base_framework.framework_output," %i : ",i);
792 dump_double_array(10, ompi_topo_base_framework.framework_output,
793 "", "", comm_pattern[i], num_procs_in_node);
794 }
795 opal_output_verbose(10, ompi_topo_base_framework.framework_output,
796 "======================= \n");
797 #endif
798
799 tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t));
800 tm_topology->nb_levels = numlevels;
801 tm_topology->arity = (int *)calloc(tm_topology->nb_levels, sizeof(int));
802 tm_topology->nb_nodes = (size_t *)calloc(tm_topology->nb_levels, sizeof(size_t));
803 tm_topology->node_id = (int **)malloc(tm_topology->nb_levels*sizeof(int *));
804 tm_topology->node_rank = (int **)malloc(tm_topology->nb_levels*sizeof(int *));
805
806 for(i = 0 ; i < tm_topology->nb_levels ; i++){
807 int nb_objs = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, tracker[i]->depth);
808 tm_topology->nb_nodes[i] = nb_objs;
809 tm_topology->arity[i] = tracker[i]->arity;
810 tm_topology->node_id[i] = (int *)calloc(tm_topology->nb_nodes[i], sizeof(int));
811 tm_topology->node_rank[i] = (int * )calloc(tm_topology->nb_nodes[i], sizeof(int));
812 for(j = 0; j < (int)tm_topology->nb_nodes[i] ; j++){
813 tm_topology->node_id[i][j] = j;
814 tm_topology->node_rank[i][j] = j;
815 }
816 }
817
818 /* unused for now*/
819 tm_topology->cost = (double*)calloc(tm_topology->nb_levels,sizeof(double));
820
821 tm_topology->nb_proc_units = num_objs_in_node;
822 //tm_topology->nb_proc_units = num_procs_in_node;
823 tm_topology->nb_constraints = 0;
824 for(i = 0; i < num_procs_in_node ; i++)
825 if (localrank_to_objnum[i] != -1)
826 tm_topology->nb_constraints++;
827
828 tm_topology->constraints = (int *)calloc(tm_topology->nb_constraints,sizeof(int));
829 for(idx = 0,i = 0; i < num_procs_in_node ; i++)
830 if (localrank_to_objnum[i] != -1)
831 tm_topology->constraints[idx++] = localrank_to_objnum[i];
832
833 tm_topology->oversub_fact = 1;
834
835 #ifdef __DEBUG__
836 assert(num_objs_in_node == (int)tm_topology->nb_nodes[tm_topology->nb_levels-1]);
837 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
838 "Levels in topo : %i | num procs in node : %i\n",
839 tm_topology->nb_levels,num_procs_in_node));
840 for(i = 0; i < tm_topology->nb_levels ; i++) {
841 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
842 "Nb objs for level %i : %lu | arity %i\n ",
843 i, tm_topology->nb_nodes[i],tm_topology->arity[i]));
844 dump_int_array(10, ompi_topo_base_framework.framework_output,
845 "", "Obj id ", tm_topology->node_id[i], tm_topology->nb_nodes[i]);
846 }
847 tm_display_topology(tm_topology);
848 #endif
849 tm_optimize_topology(&tm_topology);
850 aff_mat = tm_build_affinity_mat(comm_pattern,num_procs_in_node);
851 comm_tree = tm_build_tree_from_topology(tm_topology,aff_mat, NULL, NULL);
852 sol = tm_compute_mapping(tm_topology, comm_tree);
853
854 assert((int)sol->k_length == num_objs_in_node);
855
856 k = (int *)calloc(sol->k_length, sizeof(int));
857 for(idx = 0 ; idx < (int)sol->k_length ; idx++)
858 k[idx] = sol->k[idx][0];
859
860 #ifdef __DEBUG__
861 OPAL_OUTPUT_VERBOSE((10, ompi_topo_base_framework.framework_output,
862 "====> nb levels : %i\n",tm_topology->nb_levels));
863 dump_int_array(10, ompi_topo_base_framework.framework_output,
864 "Rank permutation sigma/k : ", "", k, num_procs_in_node);
865 assert(num_procs_in_node == (int)sol->sigma_length);
866 dump_int_array(10, ompi_topo_base_framework.framework_output,
867 "Matching : ", "", sol->sigma, sol->sigma_length);
868 #endif
869
870 free(aff_mat->sum_row);
871 free(aff_mat);
872 free(comm_pattern);
873 tm_free_solution(sol);
874 tm_free_tree(comm_tree);
875 tm_free_topology(tm_topology);
876 }
877
878 /* Todo : Bcast + group creation */
879 /* scatter the ranks */
880 if (OMPI_SUCCESS != (err = localcomm->c_coll->coll_scatter(k, 1, MPI_INT,
881 &newrank, 1, MPI_INT,
882 0, localcomm,
883 localcomm->c_coll->coll_scatter_module))) {
884 if (NULL != k) free(k);
885 ompi_comm_free(&localcomm);
886 free(lrank_to_grank);
887 free(grank_to_lrank);
888 goto release_and_return;
889 }
890
891 /* compute the offset of newrank before the split */
892 /* use the colors array, not the vpids */
893 marked = (int *)malloc((num_nodes-1)*sizeof(int));
894 for(idx = 0 ; idx < num_nodes - 1 ; idx++)
895 marked[idx] = -1;
896
897 while( (node_position != rank) && (colors[node_position] != colors[rank])) {
898 /* Have we already counted the current color ? */
899 for(idx = 0; idx < pos; idx++)
900 if( marked[idx] == colors[node_position] )
901 goto next_iter; /* yes, let's skip the rest */
902 /* How many elements of this color are here ? none before the current position */
903 for(; idx < size; idx++)
904 if(colors[idx] == colors[node_position])
905 offset++;
906 marked[pos++] = colors[node_position];
907 next_iter:
908 node_position++;
909 }
910 newrank += offset;
911 free(marked);
912
913 if (rank == lindex_to_grank[0])
914 free(k);
915
916 /* this needs to be optimized but will do for now */
917 if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank, newcomm, false))) {
918 ompi_comm_free(&localcomm);
919 free(lrank_to_grank);
920 free(grank_to_lrank);
921 goto release_and_return;
922 }
923 /* end of TODO */
924
925 /* Attach the dist_graph to the newly created communicator */
926 (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
927 (*newcomm)->c_topo = topo_module;
928 (*newcomm)->c_topo->reorder = reorder;
929
930 free(grank_to_lrank);
931 free(lrank_to_grank);
932 } /* distributed reordering end */
933
934 release_and_return:
935 if (NULL != reqs ) free(reqs);
936 if (NULL != tracker) free(tracker);
937 if (NULL != local_pattern) free(local_pattern);
938 free(colors);
939 if (NULL != lindex_to_grank) free(lindex_to_grank);
940 if (NULL != nodes_roots) free(nodes_roots); /* only on root */
941 if (NULL != localrank_to_objnum) free(localrank_to_objnum);
942 if( NULL != set) hwloc_bitmap_free(set);
943 /* As the reordering is optional, if we encountered an error during the reordering,
944 * we can safely return with just a duplicate of the original communicator associated
945 * with the topology. */
946 if( OMPI_SUCCESS != err ) goto fallback;
947 return OMPI_SUCCESS;
948 }
949