1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 #include <stdlib.h>
6 #include <string.h>
7 #include <stdio.h>
8 #include <unistd.h>
9
10 #include <mpi.h>
11
12 #if defined(__bgp__)
13 #include <spi/kernel_interface.h>
14 #include <common/bgp_personality.h>
15 #include <common/bgp_personality_inlines.h>
16 #elif defined(__bgq__)
17 # include <mpix.h>
18 #elif defined(__CRAYXT) || defined(__CRAYXE)
19 # include <pmi.h>
20 #endif
21
22 #include "comex.h"
23 #include "comex_impl.h"
24 #include "groups.h"
25
26
27 /* world group state */
28 comex_group_world_t g_state = {
29 MPI_COMM_NULL,
30 MPI_GROUP_NULL,
31 -1,
32 -1,
33 NULL,
34 MPI_COMM_NULL,
35 -1,
36 -1
37 };
38 /* the HEAD of the group linked list */
39 comex_igroup_t *group_list = NULL;
40
41 #define RANK_OR_PID (g_state.rank >= 0 ? g_state.rank : getpid())
42
43 /* static functions implemented in this file */
44 static void _create_group_and_igroup(comex_group_t *id, comex_igroup_t **igroup);
45 static void _igroup_free(comex_igroup_t *igroup);
46 static long xgethostid();
47
48
49 /**
50 * Return the comex igroup instance given the group id.
51 *
52 * The group linked list is searched sequentially until the given group
53 * is found. It is an error if this function is called before
54 * comex_group_init(). An error occurs if the given group is not found.
55 */
comex_get_igroup_from_group(comex_group_t id)56 comex_igroup_t* comex_get_igroup_from_group(comex_group_t id)
57 {
58 comex_igroup_t *current_group_list_item = group_list;
59
60 #if DEBUG
61 printf("[%d] comex_get_igroup_from_group(%d)\n", RANK_OR_PID, id);
62 #endif
63
64 COMEX_ASSERT(group_list != NULL);
65 while (current_group_list_item != NULL) {
66 if (current_group_list_item->id == id) {
67 return current_group_list_item;
68 }
69 current_group_list_item = current_group_list_item->next;
70 }
71 comex_error("comex group lookup failed", -1);
72
73 return NULL;
74 }
75
76
77 /**
78 * Creates and associates a comex group with a comex igroup.
79 *
80 * This does *not* initialize the members of the comex igroup.
81 */
_create_group_and_igroup(comex_group_t * id,comex_igroup_t ** igroup)82 static void _create_group_and_igroup(
83 comex_group_t *id, comex_igroup_t **igroup)
84 {
85 comex_igroup_t *new_group_list_item = NULL;
86 comex_igroup_t *last_group_list_item = NULL;
87
88 #if DEBUG
89 printf("[%d] _create_group_and_igroup(...)\n", RANK_OR_PID);
90 #endif
91
92 /* create, init, and insert the new node for the linked list */
93 new_group_list_item = malloc(sizeof(comex_igroup_t));
94 new_group_list_item->next = NULL;
95 new_group_list_item->id = -1;
96 new_group_list_item->comm = MPI_COMM_NULL;
97 new_group_list_item->group = MPI_GROUP_NULL;
98 new_group_list_item->size = -1;
99 new_group_list_item->rank = -1;
100
101 /* find the last group in the group linked list and insert */
102 if (group_list) {
103 last_group_list_item = group_list;
104 while (last_group_list_item->next != NULL) {
105 last_group_list_item = last_group_list_item->next;
106 }
107 last_group_list_item->next = new_group_list_item;
108 new_group_list_item->id = last_group_list_item->id + 1;
109 }
110 else {
111 group_list = new_group_list_item;
112 new_group_list_item->id = COMEX_GROUP_WORLD;
113 }
114
115 /* return the group id and comex igroup */
116 *igroup = new_group_list_item;
117 *id = new_group_list_item->id;
118 }
119
120
/**
 * Store the rank recorded in the igroup of the given group into *rank.
 *
 * The igroup is resolved with comex_get_igroup_from_group().
 *
 * @return COMEX_SUCCESS
 */
int comex_group_rank(comex_group_t group, int *rank)
{
    *rank = comex_get_igroup_from_group(group)->rank;

#if DEBUG
    printf("[%d] comex_group_rank(group=%d, *rank=%d)\n",
            RANK_OR_PID, group, *rank);
#endif

    return COMEX_SUCCESS;
}
133
134
/**
 * Store the size recorded in the igroup of the given group into *size.
 *
 * The igroup is resolved with comex_get_igroup_from_group().
 *
 * @return COMEX_SUCCESS
 */
int comex_group_size(comex_group_t group, int *size)
{
    *size = comex_get_igroup_from_group(group)->size;

#if DEBUG
    printf("[%d] comex_group_size(group=%d, *size=%d)\n",
            RANK_OR_PID, group, *size);
#endif

    return COMEX_SUCCESS;
}
147
148
/**
 * Store the MPI communicator recorded in the igroup of the given group
 * into *comm.
 *
 * The handle itself is copied, not duplicated; the igroup keeps owning it.
 *
 * @return COMEX_SUCCESS
 */
int comex_group_comm(comex_group_t group, MPI_Comm *comm)
{
    *comm = comex_get_igroup_from_group(group)->comm;

#if DEBUG
    printf("[%d] comex_group_comm(group=%d, comm)\n",
            RANK_OR_PID, group);
#endif

    return COMEX_SUCCESS;
}
161
162
/**
 * Translate a rank within a group into the corresponding world rank.
 *
 * For COMEX_GROUP_WORLD the rank is returned unchanged. For any other group
 * the rank is translated with MPI_Group_translate_ranks() from the group's
 * MPI group into the MPI group of the head of the group list.
 *
 * @param[in]  group       id of the group group_rank is relative to
 * @param[in]  group_rank  rank within that group
 * @param[out] world_rank  receives the translated rank
 * @return COMEX_SUCCESS
 */
int comex_group_translate_world(
        comex_group_t group, int group_rank, int *world_rank)
{
#if DEBUG
    printf("[%d] comex_group_translate_world("
            "group=%d, group_rank=%d, world_rank)\n",
            RANK_OR_PID, group, group_rank);
#endif

    if (COMEX_GROUP_WORLD == group) {
        *world_rank = group_rank;
    }
    else {
        int status;
        comex_igroup_t *igroup = comex_get_igroup_from_group(group);

        COMEX_ASSERT(group_list); /* first group is world worker group */
        status = MPI_Group_translate_ranks(igroup->group, 1, &group_rank,
                group_list->group, world_rank);
        /* the status used to be stored and then ignored */
        COMEX_ASSERT(MPI_SUCCESS == status);
    }

    return COMEX_SUCCESS;
}
186
187
188 /**
189 * Destroys the given comex igroup.
190 */
_igroup_free(comex_igroup_t * igroup)191 static void _igroup_free(comex_igroup_t *igroup)
192 {
193 int status;
194
195 #if DEBUG
196 printf("[%d] _igroup_free\n",
197 RANK_OR_PID);
198 #endif
199
200 COMEX_ASSERT(igroup);
201
202 if (igroup->group != MPI_GROUP_NULL) {
203 status = MPI_Group_free(&igroup->group);
204 if (status != MPI_SUCCESS) {
205 comex_error("MPI_Group_free: Failed ", status);
206 }
207 }
208 #if DEBUG
209 printf("[%d] free'd group\n", RANK_OR_PID);
210 #endif
211
212 if (igroup->comm != MPI_COMM_NULL) {
213 status = MPI_Comm_free(&igroup->comm);
214 if (status != MPI_SUCCESS) {
215 comex_error("MPI_Comm_free: Failed ", status);
216 }
217 }
218 #if DEBUG
219 printf("[%d] free'd comm\n", RANK_OR_PID);
220 #endif
221
222 free(igroup);
223 }
224
225
/**
 * Remove the group with the given id from the group list and destroy it.
 *
 * The list is searched from the head; it is asserted that the id is found.
 * The node is unlinked and then handed to _igroup_free().
 *
 * @param[in] id  id of the group to free
 * @return COMEX_SUCCESS
 */
int comex_group_free(comex_group_t id)
{
    comex_igroup_t *current_group_list_item = group_list;
    comex_igroup_t *previous_group_list_item = NULL;

#if DEBUG
    printf("[%d] comex_group_free(id=%d)\n", RANK_OR_PID, id);
#endif

    /* find the group to free */
    while (current_group_list_item != NULL) {
        if (current_group_list_item->id == id) {
            break;
        }
        previous_group_list_item = current_group_list_item;
        current_group_list_item = current_group_list_item->next;
    }
    /* make sure we found a group */
    COMEX_ASSERT(current_group_list_item != NULL);
    /* remove the group from the linked list */
    if (previous_group_list_item != NULL) {
        previous_group_list_item->next = current_group_list_item->next;
    }
    else {
        /* the node is the head: advance the head as well, otherwise
         * group_list would keep pointing at the memory freed below */
        group_list = current_group_list_item->next;
    }
    /* free the igroup */
    _igroup_free(current_group_list_item);

    return COMEX_SUCCESS;
}
254
255
/**
 * Create a child group from n members of a parent group.
 *
 * A new igroup node is appended to the group list and its id is stored in
 * *id_child. The child's MPI group is first built with MPI_Group_incl() from
 * the parent's MPI group. A caller that is not listed in pid_list (its rank
 * in that group is MPI_UNDEFINED) returns at that point; its node keeps the
 * included group but has no communicator, and size and rank stay -1.
 *
 * Members then build the child communicator among themselves: each starts
 * from a duplicate of MPI_COMM_SELF and, level by level (lvl = 1, 2, 4, ...),
 * the block of members it belongs to is joined with the neighbouring block
 * via MPI_Intercomm_create() followed by MPI_Intercomm_merge(), with the
 * parent communicator acting as the peer communicator.
 *
 * @param[in]  n          number of entries in pid_list
 * @param[in]  pid_list   ranks, relative to the parent, of the members
 * @param[in]  id_parent  id of the parent group
 * @param[out] id_child   receives the id of the new group
 * @return COMEX_SUCCESS
 */
int comex_group_create(
        int n, int *pid_list, comex_group_t id_parent, comex_group_t *id_child)
{
    int status = 0;
    int grp_me = 0;
    comex_igroup_t *igroup_child = NULL;
    MPI_Group *group_child = NULL;
    MPI_Comm *comm_child = NULL;
    comex_igroup_t *igroup_parent = NULL;
    MPI_Group *group_parent = NULL;
    MPI_Comm *comm_parent = NULL;

#if DEBUG
    /* %p requires a void pointer */
    printf("[%d] comex_group_create("
            "n=%d, pid_list=%p, id_parent=%d, id_child)\n",
            RANK_OR_PID, n, (void*)pid_list, id_parent);
    {
        int p;
        /* loop from 0 so that nothing is read from an empty pid_list */
        printf("[%d] pid_list={", RANK_OR_PID);
        for (p=0; p<n; ++p) {
            printf("%s%d", (p > 0 ? "," : ""), pid_list[p]);
        }
        printf("}\n");
    }
#endif

    /* create the node in the linked list of groups and */
    /* get the child's MPI_Group and MPI_Comm, to be populated shortly */
    _create_group_and_igroup(id_child, &igroup_child);
    group_child = &(igroup_child->group);
    comm_child = &(igroup_child->comm);

    /* get the parent's MPI_Group and MPI_Comm */
    igroup_parent = comex_get_igroup_from_group(id_parent);
    group_parent = &(igroup_parent->group);
    comm_parent = &(igroup_parent->comm);

    status = MPI_Group_incl(*group_parent, n, pid_list, group_child);
    COMEX_ASSERT(MPI_SUCCESS == status);

#if DEBUG
    printf("[%d] comex_group_create before crazy logic\n", RANK_OR_PID);
#endif
    {
        MPI_Comm comm, comm1, comm2;
        int lvl=1, local_ldr_pos;
        /* rank in the included group == this process's position in pid_list */
        status = MPI_Group_rank(*group_child, &grp_me);
        COMEX_ASSERT(MPI_SUCCESS == status);
        if (grp_me == MPI_UNDEFINED) {
            /* FIXME: keeping the group around for now */
#if DEBUG
            printf("[%d] comex_group_create aborting -- not in group\n", RANK_OR_PID);
#endif
            return COMEX_SUCCESS;
        }
        /* SK: sanity check for the following bitwise operations */
        COMEX_ASSERT(grp_me>=0);
        /* FIXME: can be optimized away */
        status = MPI_Comm_dup(MPI_COMM_SELF, &comm);
        COMEX_ASSERT(MPI_SUCCESS == status);
        local_ldr_pos = grp_me;
        while(n>lvl) {
            int tag=0;
            /* position of the leader of the block to merge with at this level */
            int remote_ldr_pos = local_ldr_pos^lvl;
            if (remote_ldr_pos < n) {
                int remote_leader = pid_list[remote_ldr_pos];
                MPI_Comm peer_comm = *comm_parent;
                /* the block with the lower position is ordered first */
                int high = (local_ldr_pos<remote_ldr_pos)?0:1;
                status = MPI_Intercomm_create(
                        comm, 0, peer_comm, remote_leader, tag, &comm1);
                COMEX_ASSERT(MPI_SUCCESS == status);
                status = MPI_Comm_free(&comm);
                COMEX_ASSERT(MPI_SUCCESS == status);
                status = MPI_Intercomm_merge(comm1, high, &comm2);
                COMEX_ASSERT(MPI_SUCCESS == status);
                status = MPI_Comm_free(&comm1);
                COMEX_ASSERT(MPI_SUCCESS == status);
                comm = comm2;
            }
            /* clear the bit of this level: leader position of the merged block */
            local_ldr_pos &= ((~0)^lvl);
            lvl<<=1;
        }
        *comm_child = comm;
        /* cleanup temporary group (from MPI_Group_incl above) */
        status = MPI_Group_free(group_child);
        COMEX_ASSERT(MPI_SUCCESS == status);
        /* get the actual group associated with comm */
        status = MPI_Comm_group(*comm_child, group_child);
        COMEX_ASSERT(MPI_SUCCESS == status);
        /* rank and size of new comm */
        status = MPI_Comm_size(igroup_child->comm, &(igroup_child->size));
        COMEX_ASSERT(MPI_SUCCESS == status);
        status = MPI_Comm_rank(igroup_child->comm, &(igroup_child->rank));
        COMEX_ASSERT(MPI_SUCCESS == status);
    }
#if DEBUG
    printf("[%d] comex_group_create after crazy logic\n", RANK_OR_PID);
#endif

    return COMEX_SUCCESS;
}
357
358
/**
 * qsort() comparison function for arrays of long.
 *
 * Returns -1, 0 or 1 when *p1 is less than, equal to or greater than *p2.
 * The values are compared rather than subtracted: a long difference can
 * overflow, and narrowing it to int can change its sign or turn a non-zero
 * difference into 0.
 */
static int cmplong(const void *p1, const void *p2)
{
    const long lhs = *(const long*)p1;
    const long rhs = *(const long*)p2;

    return (lhs > rhs) - (lhs < rhs);
}
363
364
365 /**
366 * Initialize group linked list. Prepopulate with world group.
367 */
comex_group_init()368 void comex_group_init()
369 {
370 int status = 0;
371 int i = 0;
372 comex_group_t group = 0;
373 comex_igroup_t *igroup = NULL;
374 long *sorted = NULL;
375 int count = 0;
376
377 /* populate g_state */
378
379 /* dup MPI_COMM_WORLD and get group, rank, and size */
380 status = MPI_Comm_dup(MPI_COMM_WORLD, &(g_state.comm));
381 COMEX_ASSERT(MPI_SUCCESS == status);
382 status = MPI_Comm_group(g_state.comm, &(g_state.group));
383 COMEX_ASSERT(MPI_SUCCESS == status);
384 status = MPI_Comm_rank(g_state.comm, &(g_state.rank));
385 COMEX_ASSERT(MPI_SUCCESS == status);
386 status = MPI_Comm_size(g_state.comm, &(g_state.size));
387 COMEX_ASSERT(MPI_SUCCESS == status);
388
389 #if DEBUG_TO_FILE
390 {
391 char pathname[80];
392 sprintf(pathname, "trace.%d.log", g_state.rank);
393 comex_trace_file = fopen(pathname, "w");
394 COMEX_ASSERT(NULL != comex_trace_file);
395
396 printf("[%d] comex_group_init()\n", RANK_OR_PID);
397 }
398 #endif
399
400 g_state.hostid = (long*)malloc(sizeof(long)*g_state.size);
401 g_state.hostid[g_state.rank] = xgethostid();
402 status = MPI_Allgather(MPI_IN_PLACE, 1, MPI_LONG,
403 g_state.hostid, 1, MPI_LONG, g_state.comm);
404 COMEX_ASSERT(MPI_SUCCESS == status);
405
406 COMEX_ASSERT(group_list == NULL);
407
408 /* create the head of the group linked list */
409 _create_group_and_igroup(&group, &igroup);
410 /* create a comm of only the workers (every rank is a worker) */
411 status = MPI_Comm_dup(MPI_COMM_WORLD, &(igroup->comm));
412 COMEX_ASSERT(MPI_SUCCESS == status);
413 status = MPI_Comm_group(igroup->comm, &(igroup->group));
414 COMEX_ASSERT(MPI_SUCCESS == status);
415 status = MPI_Comm_rank(igroup->comm, &(igroup->rank));
416 COMEX_ASSERT(MPI_SUCCESS == status);
417 status = MPI_Comm_size(igroup->comm, &(igroup->size));
418 COMEX_ASSERT(MPI_SUCCESS == status);
419
420 /* create node comm */
421 /* MPI_Comm_split requires a non-negative color,
422 * so sort and sanitize */
423 sorted = (long*)malloc(sizeof(long) * g_state.size);
424 (void)memcpy(sorted, g_state.hostid, sizeof(long)*g_state.size);
425 qsort(sorted, g_state.size, sizeof(long), cmplong);
426 for (i=0; i<g_state.size-1; ++i) {
427 if (sorted[i] == g_state.hostid[g_state.rank]) {
428 break;
429 }
430 if (sorted[i] != sorted[i+1]) {
431 count += 1;
432 }
433 }
434 free(sorted);
435 status = MPI_Comm_split(MPI_COMM_WORLD, count,
436 g_state.rank, &(g_state.node_comm));
437 COMEX_ASSERT(MPI_SUCCESS == status);
438 /* node rank */
439 status = MPI_Comm_rank(g_state.node_comm, &(g_state.node_rank));
440 COMEX_ASSERT(MPI_SUCCESS == status);
441 /* node size */
442 status = MPI_Comm_size(g_state.node_comm, &(g_state.node_size));
443 COMEX_ASSERT(MPI_SUCCESS == status);
444 }
445
446
comex_group_finalize()447 void comex_group_finalize()
448 {
449 int status;
450 comex_igroup_t *current_group_list_item = group_list;
451 comex_igroup_t *previous_group_list_item = NULL;
452
453 #if DEBUG
454 printf("[%d] comex_group_finalize()\n", RANK_OR_PID);
455 #endif
456
457 while (current_group_list_item != NULL) {
458 previous_group_list_item = current_group_list_item;
459 current_group_list_item = current_group_list_item->next;
460 _igroup_free(previous_group_list_item);
461 }
462
463 free(g_state.hostid);
464 status = MPI_Comm_free(&(g_state.node_comm));
465 COMEX_ASSERT(MPI_SUCCESS == status);
466 status = MPI_Group_free(&(g_state.group));
467 COMEX_ASSERT(MPI_SUCCESS == status);
468 status = MPI_Comm_free(&(g_state.comm));
469 COMEX_ASSERT(MPI_SUCCESS == status);
470 }
471
472
/**
 * Return a number identifying the node this process runs on.
 *
 * The value is obtained per platform:
 *  - __bgp__: the digits parsed from the personality location string,
 *  - __bgq__: the hardware coordinates linearized over the hardware sizes,
 *  - __CRAYXT / __CRAYXE: the node id reported by PMI for g_state.rank,
 *  - otherwise: gethostid().
 */
static long xgethostid(void)
{
#if defined(__bgp__)
#warning BGP
    long nodeid;
    int matched,midplane,nodecard,computecard;
    char rack_row,rack_col;
    char location[128];
    char location_clean[128];
    (void) memset(location, '\0', 128);
    (void) memset(location_clean, '\0', 128);
    _BGP_Personality_t personality;
    Kernel_GetPersonality(&personality, sizeof(personality));
    BGP_Personality_getLocationString(&personality, location);
    matched = sscanf(location, "R%c%c-M%1d-N%2d-J%2d",
            &rack_row, &rack_col, &midplane, &nodecard, &computecard);
    COMEX_ASSERT(matched == 5);
    /* rack row and column are characters; their character codes are used */
    snprintf(location_clean, sizeof location_clean, "%2d%02d%1d%02d%02d",
            (int)rack_row, (int)rack_col, midplane, nodecard, computecard);
    nodeid = atol(location_clean);
#elif defined(__bgq__)
#warning BGQ
    int nodeid;
    MPIX_Hardware_t hw;
    MPIX_Hardware(&hw);

    nodeid = hw.Coords[0] * hw.Size[1] * hw.Size[2] * hw.Size[3] * hw.Size[4]
        + hw.Coords[1] * hw.Size[2] * hw.Size[3] * hw.Size[4]
        + hw.Coords[2] * hw.Size[3] * hw.Size[4]
        + hw.Coords[3] * hw.Size[4]
        + hw.Coords[4];
#elif defined(__CRAYXT) || defined(__CRAYXE)
#warning CRAY
    int nodeid;
#  if defined(__CRAYXT)
    PMI_Portals_get_nid(g_state.rank, &nodeid);
#  elif defined(__CRAYXE)
    PMI_Get_nid(g_state.rank, &nodeid);
#  endif
#else
    long nodeid = gethostid();
#endif

    return nodeid;
}
518