1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 #include <stdlib.h>
6 #include <string.h>
7 #include <stdio.h>
8 #include <unistd.h>
9 
10 #include <mpi.h>
11 
12 #if defined(__bgp__)
13 #include <spi/kernel_interface.h>
14 #include <common/bgp_personality.h>
15 #include <common/bgp_personality_inlines.h>
16 #elif defined(__bgq__)
17 #  include <mpix.h>
18 #elif defined(__CRAYXT) || defined(__CRAYXE)
19 #  include <pmi.h>
20 #endif
21 
22 #include "comex.h"
23 #include "comex_impl.h"
24 #include "groups.h"
25 
26 
27 /* world group state */
28 comex_group_world_t g_state = {
29     MPI_COMM_NULL,
30     MPI_GROUP_NULL,
31     -1,
32     -1,
33     NULL,
34     MPI_COMM_NULL,
35     -1,
36     -1
37 };
38 /* the HEAD of the group linked list */
39 comex_igroup_t *group_list = NULL;
40 
41 #define RANK_OR_PID (g_state.rank >= 0 ? g_state.rank : getpid())
42 
43 /* static functions implemented in this file */
44 static void _create_group_and_igroup(comex_group_t *id, comex_igroup_t **igroup);
45 static void _igroup_free(comex_igroup_t *igroup);
46 static long xgethostid();
47 
48 
49 /**
50  * Return the comex igroup instance given the group id.
51  *
52  * The group linked list is searched sequentially until the given group
53  * is found. It is an error if this function is called before
54  * comex_group_init(). An error occurs if the given group is not found.
55  */
comex_get_igroup_from_group(comex_group_t id)56 comex_igroup_t* comex_get_igroup_from_group(comex_group_t id)
57 {
58     comex_igroup_t *current_group_list_item = group_list;
59 
60 #if DEBUG
61     printf("[%d] comex_get_igroup_from_group(%d)\n", RANK_OR_PID, id);
62 #endif
63 
64     COMEX_ASSERT(group_list != NULL);
65     while (current_group_list_item != NULL) {
66         if (current_group_list_item->id == id) {
67             return current_group_list_item;
68         }
69         current_group_list_item = current_group_list_item->next;
70     }
71     comex_error("comex group lookup failed", -1);
72 
73     return NULL;
74 }
75 
76 
77 /**
78  * Creates and associates a comex group with a comex igroup.
79  *
80  * This does *not* initialize the members of the comex igroup.
81  */
_create_group_and_igroup(comex_group_t * id,comex_igroup_t ** igroup)82 static void _create_group_and_igroup(
83         comex_group_t *id, comex_igroup_t **igroup)
84 {
85     comex_igroup_t *new_group_list_item = NULL;
86     comex_igroup_t *last_group_list_item = NULL;
87 
88 #if DEBUG
89     printf("[%d] _create_group_and_igroup(...)\n", RANK_OR_PID);
90 #endif
91 
92     /* create, init, and insert the new node for the linked list */
93     new_group_list_item = malloc(sizeof(comex_igroup_t));
94     new_group_list_item->next = NULL;
95     new_group_list_item->id = -1;
96     new_group_list_item->comm = MPI_COMM_NULL;
97     new_group_list_item->group = MPI_GROUP_NULL;
98     new_group_list_item->size = -1;
99     new_group_list_item->rank = -1;
100 
101     /* find the last group in the group linked list and insert */
102     if (group_list) {
103         last_group_list_item = group_list;
104         while (last_group_list_item->next != NULL) {
105             last_group_list_item = last_group_list_item->next;
106         }
107         last_group_list_item->next = new_group_list_item;
108         new_group_list_item->id = last_group_list_item->id + 1;
109     }
110     else {
111         group_list = new_group_list_item;
112         new_group_list_item->id = COMEX_GROUP_WORLD;
113     }
114 
115     /* return the group id and comex igroup */
116     *igroup = new_group_list_item;
117     *id = new_group_list_item->id;
118 }
119 
120 
/**
 * Fetch the rank stored for the given group.
 *
 * The value is the rank member of the group's list node: the rank in the
 * group's communicator, or -1 if that member was never filled in.
 *
 * @param group  id of the group to query
 * @param rank   receives the stored rank
 * @return COMEX_SUCCESS
 */
int comex_group_rank(comex_group_t group, int *rank)
{
    *rank = comex_get_igroup_from_group(group)->rank;

#if DEBUG
    printf("[%d] comex_group_rank(group=%d, *rank=%d)\n",
            RANK_OR_PID, group, *rank);
#endif

    return COMEX_SUCCESS;
}
133 
134 
/**
 * Fetch the size stored for the given group.
 *
 * The value is the size member of the group's list node: the size of the
 * group's communicator, or -1 if that member was never filled in.
 *
 * @param group  id of the group to query
 * @param size   receives the stored size
 * @return COMEX_SUCCESS
 */
int comex_group_size(comex_group_t group, int *size)
{
    *size = comex_get_igroup_from_group(group)->size;

#if DEBUG
    printf("[%d] comex_group_size(group=%d, *size=%d)\n",
            RANK_OR_PID, group, *size);
#endif

    return COMEX_SUCCESS;
}
147 
148 
/**
 * Fetch the MPI communicator stored for the given group.
 *
 * The handle is copied, not duplicated, so it stays owned by the group's
 * list node. It is MPI_COMM_NULL if the node's communicator was never
 * filled in.
 *
 * @param group  id of the group to query
 * @param comm   receives the stored communicator handle
 * @return COMEX_SUCCESS
 */
int comex_group_comm(comex_group_t group, MPI_Comm *comm)
{
    *comm = comex_get_igroup_from_group(group)->comm;

#if DEBUG
    printf("[%d] comex_group_comm(group=%d, comm)\n",
            RANK_OR_PID, group);
#endif

    return COMEX_SUCCESS;
}
161 
162 
/**
 * Translate a rank within a group into the corresponding world rank.
 *
 * For COMEX_GROUP_WORLD the rank is returned unchanged. For any other
 * group the rank is translated with MPI_Group_translate_ranks() from the
 * group's MPI group into the MPI group of the head of the group list
 * (the world group). A failing translation is reported through
 * comex_error() instead of being silently ignored.
 *
 * @param group       id of the group group_rank refers to
 * @param group_rank  rank within that group
 * @param world_rank  receives the translated rank
 * @return COMEX_SUCCESS
 */
int comex_group_translate_world(
        comex_group_t group, int group_rank, int *world_rank)
{
#if DEBUG
    printf("[%d] comex_group_translate_world("
            "group=%d, group_rank=%d, world_rank)\n",
            RANK_OR_PID, group, group_rank);
#endif

    if (COMEX_GROUP_WORLD == group) {
        *world_rank = group_rank;
    }
    else {
        int status;
        comex_igroup_t *igroup = comex_get_igroup_from_group(group);

        COMEX_ASSERT(group_list); /* first group is world worker group */
        status = MPI_Group_translate_ranks(igroup->group, 1, &group_rank,
                group_list->group, world_rank);
        if (status != MPI_SUCCESS) {
            comex_error("MPI_Group_translate_ranks: Failed ", status);
        }
    }

    return COMEX_SUCCESS;
}
186 
187 
188 /**
189  * Destroys the given comex igroup.
190  */
_igroup_free(comex_igroup_t * igroup)191 static void _igroup_free(comex_igroup_t *igroup)
192 {
193     int status;
194 
195 #if DEBUG
196     printf("[%d] _igroup_free\n",
197             RANK_OR_PID);
198 #endif
199 
200     COMEX_ASSERT(igroup);
201 
202     if (igroup->group != MPI_GROUP_NULL) {
203         status = MPI_Group_free(&igroup->group);
204         if (status != MPI_SUCCESS) {
205             comex_error("MPI_Group_free: Failed ", status);
206         }
207     }
208 #if DEBUG
209     printf("[%d] free'd group\n", RANK_OR_PID);
210 #endif
211 
212     if (igroup->comm != MPI_COMM_NULL) {
213         status = MPI_Comm_free(&igroup->comm);
214         if (status != MPI_SUCCESS) {
215             comex_error("MPI_Comm_free: Failed ", status);
216         }
217     }
218 #if DEBUG
219     printf("[%d] free'd comm\n", RANK_OR_PID);
220 #endif
221 
222     free(igroup);
223 }
224 
225 
/**
 * Remove a group from the group linked list and destroy it.
 *
 * The list is searched for the node with the given id (asserted to
 * exist). The node is unlinked and handed to _igroup_free(), which
 * releases its MPI group, its MPI communicator and the node itself.
 *
 * @param id  id of the group to free
 * @return COMEX_SUCCESS
 */
int comex_group_free(comex_group_t id)
{
    comex_igroup_t *current_group_list_item = group_list;
    comex_igroup_t *previous_group_list_item = NULL;

#if DEBUG
    printf("[%d] comex_group_free(id=%d)\n", RANK_OR_PID, id);
#endif

    /* find the group to free */
    while (current_group_list_item != NULL) {
        if (current_group_list_item->id == id) {
            break;
        }
        previous_group_list_item = current_group_list_item;
        current_group_list_item = current_group_list_item->next;
    }
    /* make sure we found a group */
    COMEX_ASSERT(current_group_list_item != NULL);
    /* remove the group from the linked list */
    if (previous_group_list_item != NULL) {
        previous_group_list_item->next = current_group_list_item->next;
    }
    else {
        /* the node is the head: advance the list head so it does not
         * keep pointing at memory that is about to be freed */
        group_list = current_group_list_item->next;
    }
    /* free the igroup */
    _igroup_free(current_group_list_item);

    return COMEX_SUCCESS;
}
254 
255 
/**
 * Create a child group out of members of a parent group.
 *
 * A new node is appended to the group list first (which fixes the child
 * id written to id_child), then the parent is looked up and
 * MPI_Group_incl() selects the listed parent ranks.
 *
 * A caller that is not among the selected ranks returns right away with
 * COMEX_SUCCESS; its node keeps the group from MPI_Group_incl(), a null
 * communicator and size/rank of -1.
 *
 * A member builds the child communicator step by step: it starts from a
 * duplicate of MPI_COMM_SELF and, for a distance that doubles each
 * round, pairs up with the partner position obtained by XOR-ing its
 * current leader position with the distance. The two sides are joined
 * with MPI_Intercomm_create() over the parent communicator and fused
 * with MPI_Intercomm_merge(); the side with the lower position is
 * ordered first. Afterwards the node's group, size and rank are taken
 * from the finished communicator.
 *
 * @param n          number of entries in pid_list
 * @param pid_list   ranks, relative to the parent group, that make up
 *                   the child group
 * @param id_parent  id of the parent group
 * @param id_child   receives the id of the new group
 * @return COMEX_SUCCESS
 */
int comex_group_create(
        int n, int *pid_list, comex_group_t id_parent, comex_group_t *id_child)
{
    comex_igroup_t *child = NULL;
    comex_igroup_t *parent = NULL;
    MPI_Comm merged = MPI_COMM_NULL;
    int my_pos = 0;
    int leader_pos = 0;
    int stride = 0;
    int rc = 0;

#if DEBUG
    printf("[%d] comex_group_create("
            "n=%d, pid_list=%p, id_parent=%d, id_child)\n",
            RANK_OR_PID, n, pid_list, id_parent);
    {
        int p;
        printf("[%d] pid_list={%d", RANK_OR_PID, pid_list[0]);
        for (p=1; p<n; ++p) {
            printf(",%d", pid_list[p]);
        }
        printf("}\n");
    }
#endif

    /* the child node is appended before the parent is looked up */
    _create_group_and_igroup(id_child, &child);
    parent = comex_get_igroup_from_group(id_parent);

    /* provisional child group: the listed ranks of the parent group */
    rc = MPI_Group_incl(parent->group, n, pid_list, &child->group);
    COMEX_ASSERT(MPI_SUCCESS == rc);

#if DEBUG
    printf("[%d] comex_group_create before crazy logic\n", RANK_OR_PID);
#endif

    rc = MPI_Group_rank(child->group, &my_pos);
    COMEX_ASSERT(MPI_SUCCESS == rc);
    if (MPI_UNDEFINED == my_pos) {
        /* FIXME: keeping the group around for now */
#if DEBUG
    printf("[%d] comex_group_create aborting -- not in group\n", RANK_OR_PID);
#endif
        return COMEX_SUCCESS;
    }
    /* SK: the bit operations below need a non-negative position */
    COMEX_ASSERT(my_pos >= 0);

    /* FIXME: can be optimized away */
    rc = MPI_Comm_dup(MPI_COMM_SELF, &merged);
    COMEX_ASSERT(MPI_SUCCESS == rc);

    leader_pos = my_pos;
    for (stride = 1; stride < n; stride <<= 1) {
        const int partner_pos = leader_pos ^ stride;

        if (partner_pos < n) {
            MPI_Comm inter = MPI_COMM_NULL;
            MPI_Comm intra = MPI_COMM_NULL;
            const int tag = 0;
            /* the side holding the larger position is ordered last */
            const int high = (leader_pos >= partner_pos);

            rc = MPI_Intercomm_create(
                    merged, 0, parent->comm, pid_list[partner_pos], tag,
                    &inter);
            COMEX_ASSERT(MPI_SUCCESS == rc);
            rc = MPI_Comm_free(&merged);
            COMEX_ASSERT(MPI_SUCCESS == rc);
            rc = MPI_Intercomm_merge(inter, high, &intra);
            COMEX_ASSERT(MPI_SUCCESS == rc);
            rc = MPI_Comm_free(&inter);
            COMEX_ASSERT(MPI_SUCCESS == rc);
            merged = intra;
        }
        /* clear this round's bit: the leader of the merged block is the
         * position with the bit unset */
        leader_pos &= ~stride;
    }
    child->comm = merged;

    /* drop the provisional group (from MPI_Group_incl above) ... */
    rc = MPI_Group_free(&child->group);
    COMEX_ASSERT(MPI_SUCCESS == rc);
    /* ... and adopt the group that really belongs to the communicator */
    rc = MPI_Comm_group(child->comm, &child->group);
    COMEX_ASSERT(MPI_SUCCESS == rc);

    /* size and rank within the new communicator */
    rc = MPI_Comm_size(child->comm, &(child->size));
    COMEX_ASSERT(MPI_SUCCESS == rc);
    rc = MPI_Comm_rank(child->comm, &(child->rank));
    COMEX_ASSERT(MPI_SUCCESS == rc);

#if DEBUG
    printf("[%d] comex_group_create after crazy logic\n", RANK_OR_PID);
#endif

    return COMEX_SUCCESS;
}
357 
358 
/**
 * qsort() comparator that orders two long values ascending.
 *
 * The result is built from comparisons rather than from a subtraction:
 * the difference of two longs can overflow, and narrowing it to int can
 * flip its sign or turn a non-zero difference into zero.
 *
 * @param p1  points to the first long
 * @param p2  points to the second long
 * @return negative, zero or positive if *p1 is less than, equal to or
 *         greater than *p2
 */
static int cmplong(const void *p1, const void *p2)
{
    const long lhs = *(const long *)p1;
    const long rhs = *(const long *)p2;

    return (lhs > rhs) - (lhs < rhs);
}
363 
364 
365 /**
366  * Initialize group linked list. Prepopulate with world group.
367  */
comex_group_init()368 void comex_group_init()
369 {
370     int status = 0;
371     int i = 0;
372     comex_group_t group = 0;
373     comex_igroup_t *igroup = NULL;
374     long *sorted = NULL;
375     int count = 0;
376 
377     /* populate g_state */
378 
379     /* dup MPI_COMM_WORLD and get group, rank, and size */
380     status = MPI_Comm_dup(MPI_COMM_WORLD, &(g_state.comm));
381     COMEX_ASSERT(MPI_SUCCESS == status);
382     status = MPI_Comm_group(g_state.comm, &(g_state.group));
383     COMEX_ASSERT(MPI_SUCCESS == status);
384     status = MPI_Comm_rank(g_state.comm, &(g_state.rank));
385     COMEX_ASSERT(MPI_SUCCESS == status);
386     status = MPI_Comm_size(g_state.comm, &(g_state.size));
387     COMEX_ASSERT(MPI_SUCCESS == status);
388 
389 #if DEBUG_TO_FILE
390     {
391         char pathname[80];
392         sprintf(pathname, "trace.%d.log", g_state.rank);
393         comex_trace_file = fopen(pathname, "w");
394         COMEX_ASSERT(NULL != comex_trace_file);
395 
396         printf("[%d] comex_group_init()\n", RANK_OR_PID);
397     }
398 #endif
399 
400     g_state.hostid = (long*)malloc(sizeof(long)*g_state.size);
401     g_state.hostid[g_state.rank] = xgethostid();
402     status = MPI_Allgather(MPI_IN_PLACE, 1, MPI_LONG,
403             g_state.hostid, 1, MPI_LONG, g_state.comm);
404     COMEX_ASSERT(MPI_SUCCESS == status);
405 
406     COMEX_ASSERT(group_list == NULL);
407 
408     /* create the head of the group linked list */
409     _create_group_and_igroup(&group, &igroup);
410     /* create a comm of only the workers (every rank is a worker) */
411     status = MPI_Comm_dup(MPI_COMM_WORLD, &(igroup->comm));
412     COMEX_ASSERT(MPI_SUCCESS == status);
413     status = MPI_Comm_group(igroup->comm, &(igroup->group));
414     COMEX_ASSERT(MPI_SUCCESS == status);
415     status = MPI_Comm_rank(igroup->comm, &(igroup->rank));
416     COMEX_ASSERT(MPI_SUCCESS == status);
417     status = MPI_Comm_size(igroup->comm, &(igroup->size));
418     COMEX_ASSERT(MPI_SUCCESS == status);
419 
420     /* create node comm */
421     /* MPI_Comm_split requires a non-negative color,
422      * so sort and sanitize */
423     sorted = (long*)malloc(sizeof(long) * g_state.size);
424     (void)memcpy(sorted, g_state.hostid, sizeof(long)*g_state.size);
425     qsort(sorted, g_state.size, sizeof(long), cmplong);
426     for (i=0; i<g_state.size-1; ++i) {
427         if (sorted[i] == g_state.hostid[g_state.rank]) {
428             break;
429         }
430         if (sorted[i] != sorted[i+1]) {
431             count += 1;
432         }
433     }
434     free(sorted);
435     status = MPI_Comm_split(MPI_COMM_WORLD, count,
436             g_state.rank, &(g_state.node_comm));
437     COMEX_ASSERT(MPI_SUCCESS == status);
438     /* node rank */
439     status = MPI_Comm_rank(g_state.node_comm, &(g_state.node_rank));
440     COMEX_ASSERT(MPI_SUCCESS == status);
441     /* node size */
442     status = MPI_Comm_size(g_state.node_comm, &(g_state.node_size));
443     COMEX_ASSERT(MPI_SUCCESS == status);
444 }
445 
446 
comex_group_finalize()447 void comex_group_finalize()
448 {
449     int status;
450     comex_igroup_t *current_group_list_item = group_list;
451     comex_igroup_t *previous_group_list_item = NULL;
452 
453 #if DEBUG
454     printf("[%d] comex_group_finalize()\n", RANK_OR_PID);
455 #endif
456 
457     while (current_group_list_item != NULL) {
458         previous_group_list_item = current_group_list_item;
459         current_group_list_item = current_group_list_item->next;
460         _igroup_free(previous_group_list_item);
461     }
462 
463     free(g_state.hostid);
464     status = MPI_Comm_free(&(g_state.node_comm));
465     COMEX_ASSERT(MPI_SUCCESS == status);
466     status = MPI_Group_free(&(g_state.group));
467     COMEX_ASSERT(MPI_SUCCESS == status);
468     status = MPI_Comm_free(&(g_state.comm));
469     COMEX_ASSERT(MPI_SUCCESS == status);
470 }
471 
472 
/**
 * Return a number identifying the host (node) this process runs on.
 *
 * The source of the number depends on the platform selected at compile
 * time:
 *  - Blue Gene/P: the digits of the personality's location string
 *    (rack row and column as character codes, midplane, node card,
 *    compute card) read as one decimal number;
 *  - Blue Gene/Q: the five hardware coordinates linearized with the
 *    hardware sizes of dimensions 1 through 4;
 *  - Cray XT / XE: the node id that PMI reports for g_state.rank;
 *  - anything else: gethostid().
 */
static long xgethostid()
{
#if defined(__bgp__)
#warning BGP
    _BGP_Personality_t personality;
    char raw_location[128];
    char digits[128];
    char rack_row, rack_col;
    int midplane, nodecard, computecard;
    int matched;

    (void) memset(raw_location, '\0', sizeof(raw_location));
    (void) memset(digits, '\0', sizeof(digits));
    Kernel_GetPersonality(&personality, sizeof(personality));
    BGP_Personality_getLocationString(&personality, raw_location);
    matched = sscanf(raw_location, "R%c%c-M%1d-N%2d-J%2d",
            &rack_row, &rack_col, &midplane, &nodecard, &computecard);
    COMEX_ASSERT(matched == 5);
    sprintf(digits, "%2d%02d%1d%02d%02d",
            (int)rack_row, (int)rack_col, midplane, nodecard, computecard);

    return atol(digits);
#elif defined(__bgq__)
#warning BGQ
    MPIX_Hardware_t hw;
    int nodeid;

    MPIX_Hardware(&hw);
    nodeid = hw.Coords[0] * hw.Size[1] * hw.Size[2] * hw.Size[3] * hw.Size[4]
        + hw.Coords[1] * hw.Size[2] * hw.Size[3] * hw.Size[4]
        + hw.Coords[2] * hw.Size[3] * hw.Size[4]
        + hw.Coords[3] * hw.Size[4]
        + hw.Coords[4];

    return nodeid;
#elif defined(__CRAYXT) || defined(__CRAYXE)
#warning CRAY
    int nodeid;

#  if defined(__CRAYXT)
    PMI_Portals_get_nid(g_state.rank, &nodeid);
#  elif defined(__CRAYXE)
    PMI_Get_nid(g_state.rank, &nodeid);
#  endif

    return nodeid;
#else
    return gethostid();
#endif
}
518