/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpiimpl.h"
#include "mpid_nem_impl.h"
#include "mpid_nem_nets.h"
#include <errno.h>
#include "mpidi_nem_statistics.h"
#include "mpit.h"
#include "mpidu_init_shm.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

categories:
    - name        : NEMESIS
      description : cvars that control behavior of the ch3:nemesis channel

cvars:
    - name        : MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ
      category    : NEMESIS
      type        : int
      default     : -1
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        This cvar controls the message size at which Nemesis
        switches from eager to rendezvous mode for shared memory.
        If this cvar is set to -1, then Nemesis will choose
        an appropriate value.

    - name        : MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ
      category    : NEMESIS
      type        : int
      default     : -2
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        This cvar controls the message size at which Nemesis
        switches from eager to rendezvous mode for ready-send
        messages.  If this cvar is set to -1, then ready messages
        will always be sent eagerly.  If this cvar is set to -2,
        then Nemesis will choose an appropriate value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

/* constants for configure time selection of local LMT implementations */
#define MPID_NEM_LOCAL_LMT_NONE 0
#define MPID_NEM_LOCAL_LMT_SHM_COPY 1
#define MPID_NEM_LOCAL_LMT_DMA 2
#define MPID_NEM_LOCAL_LMT_VMSPLICE 3
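/* MPID_NEM_LOCAL_LMT_IMPL is set by configure to one of the values above;
   the #if chains later in this file select the matching LMT progress
   function and per-VC LMT function table. */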

#ifdef MEM_REGION_IN_HEAP
MPID_nem_mem_region_t *MPID_nem_mem_region_ptr = 0;
#else /* MEM_REGION_IN_HEAP */
MPID_nem_mem_region_t MPID_nem_mem_region;
#endif /* MEM_REGION_IN_HEAP */

char MPID_nem_hostname[MAX_HOSTNAME_LEN] = "UNKNOWN";

static int get_local_procs(MPIDI_PG_t *pg, int our_pg_rank, int *num_local_p,
                           int **local_procs_p, int *local_rank_p);

char *MPID_nem_asymm_base_addr = 0;

/* used by mpid_nem_inline.h and mpid_nem_finalize.c */
unsigned long long *MPID_nem_fbox_fall_back_to_queue_count = NULL;

static int MPID_nem_init_stats(int n_local_ranks)
{
    int mpi_errno = MPI_SUCCESS;

    if (ENABLE_PVAR_NEM) {
        MPID_nem_fbox_fall_back_to_queue_count = MPL_calloc(n_local_ranks, sizeof(unsigned long long), MPL_MEM_MPIT);
        MPIR_ERR_CHKANDJUMP(MPID_nem_fbox_fall_back_to_queue_count == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
    }

    MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
        NEM,
        MPI_UNSIGNED_LONG_LONG,
        nem_fbox_fall_back_to_queue_count, /* name */
        MPID_nem_fbox_fall_back_to_queue_count, /* address */
        n_local_ranks, /* count, known at pvar registration time */
        MPI_T_VERBOSITY_USER_DETAIL,
        MPI_T_BIND_NO_OBJECT,
        MPIR_T_PVAR_FLAG_CONTINUOUS, /* flags */
        NULL, /* get_value */
        NULL, /* get_count */
        "NEMESIS", /* category */
        "Array counting how many times nemesis had to fall back to the regular queue when sending messages between pairs of local processes");

fn_exit:
    return mpi_errno;
fn_fail:
    goto fn_exit;
}

int
MPID_nem_init(int pg_rank, MPIDI_PG_t *pg_p, int has_parent ATTRIBUTE((unused)))
{
    int    mpi_errno       = MPI_SUCCESS;
    int    tmp_mpi_errno;
    int    num_procs       = pg_p->size;
    int    ret;
    int    num_local       = -1;
    int   *local_procs     = NULL;
    int    local_rank      = -1;
    int    idx;
    int    i;
    char  *publish_bc_orig = NULL;
    char  *bc_val          = NULL;
    int    val_max_remaining;
    int    grank;
    size_t len;
    void *fastboxes_p = NULL;
    void *cells_p = NULL;
    MPID_nem_queue_t *recv_queues_p = NULL;
    MPID_nem_queue_t *free_queues_p = NULL;
    char strerrbuf[MPIR_STRERROR_BUF_SIZE];

    MPIR_CHKPMEM_DECL(8);

    /* TODO add compile-time asserts (rather than run-time) and convert most of these */

    /* Make sure the nemesis packet is no larger than the generic
       packet.  This is needed because we no longer include channel
       packet types in the CH3 packet types to allow dynamic channel
       loading. */
    MPIR_Assert(sizeof(MPIDI_CH3_nem_pkt_t) <= sizeof(MPIDI_CH3_Pkt_t));

    /* The MPID_nem_cell_rel_ptr_t defined in mpid_nem_datatypes.h
       should only contain a MPL_atomic_ptr_t.  This is to check that
       absolute pointers are exactly the same size as relative
       pointers. */
    MPIR_Assert(sizeof(MPID_nem_cell_rel_ptr_t) == sizeof(MPL_atomic_ptr_t));

    /* Make sure payload is aligned on 8-byte boundary */
    MPIR_Assert(MPID_NEM_ALIGNED(&((MPID_nem_cell_t*)0)->payload[0], 8));
    /* Make sure the padding to cacheline size in MPID_nem_queue_t works */
    MPIR_Assert(MPID_NEM_CACHE_LINE_LEN > 2 * sizeof(MPID_nem_cell_rel_ptr_t));

    /* Initialize the business card */
    mpi_errno = MPIDI_CH3I_BCInit( &bc_val, &val_max_remaining );
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);
    publish_bc_orig = bc_val;

    ret = gethostname (MPID_nem_hostname, MAX_HOSTNAME_LEN);
    MPIR_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**sock_gethost", "**sock_gethost %s %d",
                          MPIR_Strerror(errno, strerrbuf, MPIR_STRERROR_BUF_SIZE), errno);

    MPID_nem_hostname[MAX_HOSTNAME_LEN-1] = '\0';

    mpi_errno = get_local_procs(pg_p, pg_rank, &num_local, &local_procs, &local_rank);
    MPIR_ERR_CHECK(mpi_errno);

#ifdef MEM_REGION_IN_HEAP
    MPIR_CHKPMEM_MALLOC (MPID_nem_mem_region_ptr, MPID_nem_mem_region_t *, sizeof(MPID_nem_mem_region_t), mpi_errno, "mem_region", MPL_MEM_SHM);
#endif /* MEM_REGION_IN_HEAP */

    MPID_nem_mem_region.rank           = pg_rank;
    MPID_nem_mem_region.num_local      = num_local;
    MPID_nem_mem_region.num_procs      = num_procs;
    MPID_nem_mem_region.local_procs    = local_procs;
    MPID_nem_mem_region.local_rank     = local_rank;
    MPIR_CHKPMEM_MALLOC (MPID_nem_mem_region.local_ranks, int *, num_procs * sizeof(int), mpi_errno, "mem_region local ranks", MPL_MEM_SHM);
    MPID_nem_mem_region.ext_procs      = num_procs - num_local;
    if (MPID_nem_mem_region.ext_procs > 0)
        MPIR_CHKPMEM_MALLOC (MPID_nem_mem_region.ext_ranks, int *, MPID_nem_mem_region.ext_procs * sizeof(int), mpi_errno, "mem_region ext ranks", MPL_MEM_SHM);
    MPID_nem_mem_region.next           = NULL;

    for (idx = 0; idx < num_procs; idx++)
    {
        MPID_nem_mem_region.local_ranks[idx] = MPID_NEM_NON_LOCAL;
    }
    for (idx = 0; idx < num_local; idx++)
    {
        grank = local_procs[idx];
        MPID_nem_mem_region.local_ranks[grank] = idx;
    }

    idx = 0;
    for (grank = 0; grank < num_procs; grank++)
    {
        if (!MPID_NEM_IS_LOCAL(grank))
        {
            MPID_nem_mem_region.ext_ranks[idx++] = grank;
        }
    }

#ifdef FORCE_ASYM
    {
        /* This is used for debugging.  Each process allocates a
           different sized piece of shared memory so that when the
           shared memory segment used for communication is allocated,
           it will probably be mapped at a different location in each
           process. */
        MPL_shm_hnd_t handle;
        int size = (local_rank * 65536) + 65536;
        char *base_addr;

        mpi_errno = MPL_shm_hnd_init(&handle);
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }

        mpi_errno = MPL_shm_seg_create_and_attach(handle, size, &base_addr, 0);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno)
        {
            MPL_shm_seg_remove(handle);
            MPL_shm_hnd_finalize(&handle);
            MPIR_ERR_POP (mpi_errno);
        }
        /* --END ERROR HANDLING-- */

        mpi_errno = MPL_shm_seg_remove(handle);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno)
        {
            MPL_shm_hnd_finalize(&handle);
            MPIR_ERR_POP (mpi_errno);
        }
        /* --END ERROR HANDLING-- */

        MPL_shm_hnd_finalize(&handle);
    }
    /*fprintf(stderr,"[%i] -- address shift ok \n",pg_rank); */
#endif  /*FORCE_ASYM */

    /* Initialize core shared memory segment */
    mpi_errno = MPIDU_Init_shm_init();
    MPIR_ERR_CHECK(mpi_errno);

    /* Request fastboxes region */
    size_t fbox_len = MPL_MAX((num_local*((num_local-1) * MPID_NEM_FBOX_LEN)),
                              MPID_NEM_ASYMM_NULL_VAL);

    /* Request data cells region */
    size_t cells_len = num_local * MPID_NEM_NUM_CELLS * MPID_NEM_CELL_LEN;

    /* Request free q region */
    size_t freeQ_len = num_local * sizeof(MPID_nem_queue_t);

    /* Request recv q region */
    size_t recvQ_len = num_local * sizeof(MPID_nem_queue_t);

    len = fbox_len + cells_len + freeQ_len + recvQ_len;
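
    /* The single shared-memory segment is carved into four consecutive
       regions:
           [ fastboxes | cells | free queues | recv queues ]
       fastboxes : num_local * (num_local-1) per-pair fastboxes
       cells     : MPID_NEM_NUM_CELLS cells for each local rank
       queues    : one free queue and one recv queue per local rank
       The region pointers below are computed from these lengths. */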

    /* Actually allocate the segment and assign regions to the pointers */
    mpi_errno = MPIDU_Init_shm_alloc(len, &MPID_nem_mem_region.shm_ptr);
    MPIR_ERR_CHECK(mpi_errno);

    /* check_alloc steps */
    if (MPIDU_Init_shm_is_symm(MPID_nem_mem_region.shm_ptr) == 1) {
        MPID_nem_asymm_base_addr = NULL;
    } else {
        MPID_nem_asymm_base_addr = MPID_nem_mem_region.shm_ptr;
#ifdef MPID_NEM_SYMMETRIC_QUEUES
        MPIR_ERR_INTERNALANDJUMP(mpi_errno, "queues are not symmetrically allocated as expected");
#endif
    }

    fastboxes_p = (void *) MPID_nem_mem_region.shm_ptr;
    cells_p = (char *) fastboxes_p + fbox_len;
    free_queues_p = (MPID_nem_queue_t *)((char *) cells_p + cells_len);
    recv_queues_p = (MPID_nem_queue_t *)((char *) free_queues_p + freeQ_len);

    /* local procs barrier */
    mpi_errno = MPIDU_Init_shm_barrier();
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);

    /* find our cell region */
    MPID_nem_mem_region.Elements = (void *) ((char *) cells_p + local_rank * MPID_NEM_NUM_CELLS * MPID_NEM_CELL_LEN);

    /* Tables of pointers to shared memory Qs */
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.FreeQ, MPID_nem_queue_ptr_t *, num_procs * sizeof(MPID_nem_queue_ptr_t), mpi_errno, "FreeQ", MPL_MEM_SHM);
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.RecvQ, MPID_nem_queue_ptr_t *, num_procs * sizeof(MPID_nem_queue_ptr_t), mpi_errno, "RecvQ", MPL_MEM_SHM);

    /* Init table entry for our Qs */
    MPID_nem_mem_region.FreeQ[pg_rank] = &free_queues_p[local_rank];
    MPID_nem_mem_region.RecvQ[pg_rank] = &recv_queues_p[local_rank];

    /* Init our queues */
    MPID_nem_queue_init(MPID_nem_mem_region.RecvQ[pg_rank]);
    MPID_nem_queue_init(MPID_nem_mem_region.FreeQ[pg_rank]);

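    /* Each local rank contributes MPID_NEM_NUM_CELLS cells to the segment;
       they start out parked on that rank's own free queue. */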
    /* Init and enqueue our free cells */
    for (idx = 0; idx < MPID_NEM_NUM_CELLS; ++idx)
    {
        MPID_nem_cell_t *cell = (void *) ((char *) MPID_nem_mem_region.Elements + idx * MPID_NEM_CELL_LEN);
        MPID_nem_cell_init(cell);
        MPID_nem_queue_enqueue(MPID_nem_mem_region.FreeQ[pg_rank], cell);
    }

    mpi_errno = MPID_nem_coll_init();
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);

    /* This must be done before initializing the netmod so that the nemesis
       communicator creation hooks get registered (and therefore called) before
       the netmod hooks, giving the netmod an opportunity to override the
       nemesis collective function table. */
    mpi_errno = MPIDI_CH3U_Comm_register_create_hook(MPIDI_CH3I_comm_create, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* network init */
    if (MPID_nem_num_netmods)
    {
        mpi_errno = MPID_nem_choose_netmod();
        MPIR_ERR_CHECK(mpi_errno);
        mpi_errno = MPID_nem_netmod_func->init(pg_p, pg_rank, &bc_val, &val_max_remaining);
        MPIR_ERR_CHECK(mpi_errno);
    }

    /* Register destroy hooks after netmod init so the netmod hooks get called
       before nemesis hooks. */
    mpi_errno = MPIDI_CH3U_Comm_register_destroy_hook(MPIDI_CH3I_comm_destroy, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* set default route for external processes through network */
    for (idx = 0; idx < MPID_nem_mem_region.ext_procs; idx++)
    {
        grank = MPID_nem_mem_region.ext_ranks[idx];
        MPID_nem_mem_region.FreeQ[grank] = NULL;
        MPID_nem_mem_region.RecvQ[grank] = NULL;
    }
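    /* External ranks get NULL queue entries here, i.e. the default route to
       them is through the netmod; a netmod may later install its own free
       queue for such ranks (see MPID_nem_vc_init). */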


    /* set route for local procs through shmem */
    for (idx = 0; idx < num_local; idx++)
    {
        grank = local_procs[idx];
        MPID_nem_mem_region.FreeQ[grank] = &free_queues_p[idx];
        MPID_nem_mem_region.RecvQ[grank] = &recv_queues_p[idx];

        MPIR_Assert(MPID_NEM_ALIGNED(MPID_nem_mem_region.FreeQ[grank], MPID_NEM_CACHE_LINE_LEN));
        MPIR_Assert(MPID_NEM_ALIGNED(MPID_nem_mem_region.RecvQ[grank], MPID_NEM_CACHE_LINE_LEN));
    }

    /* make pointers to our queues global so we don't have to dereference the array */
    MPID_nem_mem_region.my_freeQ = MPID_nem_mem_region.FreeQ[pg_rank];
    MPID_nem_mem_region.my_recvQ = MPID_nem_mem_region.RecvQ[pg_rank];


    /* local barrier */
    mpi_errno = MPIDU_Init_shm_barrier();
    MPIR_ERR_CHECK(mpi_errno);


    /* Allocate table of pointers to fastboxes */
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.mailboxes.in,  MPID_nem_fastbox_t **, num_local * sizeof(MPID_nem_fastbox_t *), mpi_errno, "fastboxes", MPL_MEM_SHM);
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.mailboxes.out, MPID_nem_fastbox_t **, num_local * sizeof(MPID_nem_fastbox_t *), mpi_errno, "fastboxes", MPL_MEM_SHM);

    MPIR_Assert(num_local > 0);

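/* MAILBOX_INDEX(sender, receiver) maps a (sender, receiver) pair of local
   ranks to a slot in the fastbox region: each sender owns a contiguous block
   of (num_local-1) boxes, ordered by receiver with the sender's own slot
   skipped.  For example, with num_local == 3: sender 0 uses boxes 0-1 (to
   receivers 1 and 2), sender 1 uses boxes 2-3 (to receivers 0 and 2), and
   sender 2 uses boxes 4-5 (to receivers 0 and 1). */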
#define MAILBOX_INDEX(sender, receiver) ( ((sender) > (receiver)) ? ((num_local-1) * (sender) + (receiver)) :		\
                                          (((sender) < (receiver)) ? ((num_local-1) * (sender) + ((receiver)-1)) : 0) )

    /* fill in tables */
    for (i = 0; i < num_local; ++i)
    {
        if (i == local_rank)
        {
            /* No fastboxes to myself */
            MPID_nem_mem_region.mailboxes.in [i] = NULL;
            MPID_nem_mem_region.mailboxes.out[i] = NULL;
        }
        else
        {
            MPID_nem_mem_region.mailboxes.in [i] = (void *) ((char *) fastboxes_p + (MAILBOX_INDEX(i, local_rank)) * MPID_NEM_FBOX_LEN);
            MPID_nem_mem_region.mailboxes.out[i] = (void *) ((char *) fastboxes_p + (MAILBOX_INDEX(local_rank, i)) * MPID_NEM_FBOX_LEN);
            MPL_atomic_relaxed_store_int(&MPID_nem_mem_region.mailboxes.in [i]->flag, 0);
            MPL_atomic_relaxed_store_int(&MPID_nem_mem_region.mailboxes.out[i]->flag, 0);
        }
    }
#undef MAILBOX_INDEX

    /* setup local LMT */
#if MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_SHM_COPY
        MPID_nem_local_lmt_progress = MPID_nem_lmt_shm_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_DMA
        MPID_nem_local_lmt_progress = MPID_nem_lmt_dma_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_VMSPLICE
        MPID_nem_local_lmt_progress = MPID_nem_lmt_vmsplice_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_NONE
        MPID_nem_local_lmt_progress = NULL;
#else
#  error Must select a valid local LMT implementation!
#endif

    /* publish business card */
    mpi_errno = MPIDI_PG_SetConnInfo(pg_rank, (const char *)publish_bc_orig);
    MPIR_ERR_CHECK(mpi_errno);
    MPL_free(publish_bc_orig);


    mpi_errno = MPIDU_Init_shm_barrier();
    MPIR_ERR_CHECK(mpi_errno);
    mpi_errno = MPID_nem_mpich_init();
    MPIR_ERR_CHECK(mpi_errno);
    mpi_errno = MPIDU_Init_shm_barrier();
    MPIR_ERR_CHECK(mpi_errno);
#ifdef ENABLE_CHECKPOINTING
    mpi_errno = MPIDI_nem_ckpt_init();
    MPIR_ERR_CHECK(mpi_errno);
#endif

#ifdef PAPI_MONITOR
    my_papi_start( pg_rank );
#endif /*PAPI_MONITOR   */

    mpi_errno = MPID_nem_init_stats(num_local);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_CHKPMEM_COMMIT();
 fn_exit:
    /* combine the result of Init_shm_finalize with any earlier error so
       neither failure code is lost */
    tmp_mpi_errno = MPIDU_Init_shm_finalize();
    MPIR_ERR_ADD(mpi_errno, tmp_mpi_errno);
    return mpi_errno;
 fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */

}

/* MPID_nem_vc_init initializes nemesis' part of the vc */
int
MPID_nem_vc_init(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_VC_INIT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_VC_INIT);

    vc_ch->pkt_handler = NULL;
    vc_ch->num_pkt_handlers = 0;

    vc_ch->send_seqno         = 0;
#ifdef ENABLE_CHECKPOINTING
    vc_ch->ckpt_msg_len       = 0;
    vc_ch->ckpt_msg_buf       = NULL;
    vc_ch->ckpt_pause_send_vc = NULL;
    vc_ch->ckpt_continue_vc   = NULL;
    vc_ch->ckpt_restart_vc    = NULL;
#endif
    vc_ch->pending_pkt_len    = 0;
    MPIR_CHKPMEM_MALLOC (vc_ch->pending_pkt, MPIDI_CH3_Pkt_t *, sizeof (MPIDI_CH3_Pkt_t), mpi_errno, "pending_pkt", MPL_MEM_BUFFER);

    /* We do different things for vcs in the COMM_WORLD pg vs. other pgs.
       COMM_WORLD vcs may use shared memory and already have queues
       allocated. */
    if (vc->lpid < MPID_nem_mem_region.num_procs)
    {
        /* This vc is in COMM_WORLD */
        vc_ch->is_local = MPID_NEM_IS_LOCAL (vc->lpid);
        vc_ch->free_queue = MPID_nem_mem_region.FreeQ[vc->lpid]; /* networks and local procs have free queues */
    }
    else
    {
        /* this vc is the result of a connect */
        vc_ch->is_local = 0;
        vc_ch->free_queue = NULL;
    }

    /* MT we acquire the LMT CS here, b/c there is at least a theoretical race
     * on some fields, such as lmt_copy_buf.  In practice it's not an issue, but
     * this will keep DRD happy. */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* override rendezvous functions */
    vc->rndvSend_fn = MPID_nem_lmt_RndvSend;
    vc->rndvRecv_fn = MPID_nem_lmt_RndvRecv;

    if (vc_ch->is_local)
    {
        MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

        vc_ch->fbox_out = MPID_nem_mem_region.mailboxes.out[MPID_nem_mem_region.local_ranks[vc->lpid]];
        vc_ch->fbox_in = MPID_nem_mem_region.mailboxes.in[MPID_nem_mem_region.local_ranks[vc->lpid]];
        vc_ch->recv_queue = MPID_nem_mem_region.RecvQ[vc->lpid];

        /* override noncontig send function */
        vc->sendNoncontig_fn = MPIDI_CH3I_SendNoncontig;

        /* local processes use the default method */
        vc_ch->iStartContigMsg = NULL;
        vc_ch->iSendContig     = NULL;
        vc_ch->iSendIov        = NULL;

#if MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_SHM_COPY
        vc_ch->lmt_initiate_lmt  = MPID_nem_lmt_shm_initiate_lmt;
        vc_ch->lmt_start_recv    = MPID_nem_lmt_shm_start_recv;
        vc_ch->lmt_start_send    = MPID_nem_lmt_shm_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_shm_handle_cookie;
        vc_ch->lmt_done_send     = MPID_nem_lmt_shm_done_send;
        vc_ch->lmt_done_recv     = MPID_nem_lmt_shm_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_shm_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_DMA
        vc_ch->lmt_initiate_lmt  = MPID_nem_lmt_dma_initiate_lmt;
        vc_ch->lmt_start_recv    = MPID_nem_lmt_dma_start_recv;
        vc_ch->lmt_start_send    = MPID_nem_lmt_dma_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_dma_handle_cookie;
        vc_ch->lmt_done_send     = MPID_nem_lmt_dma_done_send;
        vc_ch->lmt_done_recv     = MPID_nem_lmt_dma_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_dma_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_VMSPLICE
        vc_ch->lmt_initiate_lmt  = MPID_nem_lmt_vmsplice_initiate_lmt;
        vc_ch->lmt_start_recv    = MPID_nem_lmt_vmsplice_start_recv;
        vc_ch->lmt_start_send    = MPID_nem_lmt_vmsplice_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_vmsplice_handle_cookie;
        vc_ch->lmt_done_send     = MPID_nem_lmt_vmsplice_done_send;
        vc_ch->lmt_done_recv     = MPID_nem_lmt_vmsplice_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_vmsplice_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_NONE
        vc_ch->lmt_initiate_lmt  = NULL;
        vc_ch->lmt_start_recv    = NULL;
        vc_ch->lmt_start_send    = NULL;
        vc_ch->lmt_handle_cookie = NULL;
        vc_ch->lmt_done_send     = NULL;
        vc_ch->lmt_done_recv     = NULL;
        vc_ch->lmt_vc_terminated = NULL;
#else
#  error Must select a valid local LMT implementation!
#endif

        vc_ch->lmt_copy_buf        = NULL;
        mpi_errno = MPL_shm_hnd_init(&(vc_ch->lmt_copy_buf_handle));
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }
        mpi_errno = MPL_shm_hnd_init(&(vc_ch->lmt_recv_copy_buf_handle));
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }
        vc_ch->lmt_queue.head      = NULL;
        vc_ch->lmt_queue.tail      = NULL;
        vc_ch->lmt_active_lmt      = NULL;
        vc_ch->lmt_enqueued        = FALSE;
        vc_ch->lmt_rts_queue.head  = NULL;
        vc_ch->lmt_rts_queue.tail  = NULL;

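        /* The default eager threshold is the cell data area
           (MPID_NEM_MPICH_DATA_LEN) minus the CH3 packet header, i.e. the
           largest message whose header plus payload still fits in a single
           shared-memory cell. */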
        if (MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ == -1)
            vc->eager_max_msg_sz = MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
        else
            vc->eager_max_msg_sz = MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ;

        if (MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ == -2)
            vc->ready_eager_max_msg_sz = vc->eager_max_msg_sz; /* force local ready sends to use LMT */
        else
            vc->ready_eager_max_msg_sz = MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ;

        MPL_DBG_MSG(MPIDI_CH3_DBG_VC, VERBOSE, "vc using shared memory");
    }
    else
    {
        vc_ch->fbox_out   = NULL;
        vc_ch->fbox_in    = NULL;
        vc_ch->recv_queue = NULL;

        vc_ch->lmt_initiate_lmt  = NULL;
        vc_ch->lmt_start_recv    = NULL;
        vc_ch->lmt_start_send    = NULL;
        vc_ch->lmt_handle_cookie = NULL;
        vc_ch->lmt_done_send     = NULL;
        vc_ch->lmt_done_recv     = NULL;
        vc_ch->lmt_vc_terminated = NULL;

        /* FIXME: DARIUS set these to default for now */
        vc_ch->iStartContigMsg = NULL;
        vc_ch->iSendContig     = NULL;

        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_VC, VERBOSE, (MPL_DBG_FDEST, "vc using %s netmod for rank %d pg %s",
                                       MPID_nem_netmod_strings[MPID_nem_netmod_id], vc->pg_rank,
                                       ((vc->pg == MPIDI_Process.my_pg)
                                        ? "my_pg"
                                        :   ((vc->pg)
                                             ? ((char *)vc->pg->id)
                                             : "unknown"
                                            )
                                           )
                             ));

        mpi_errno = MPID_nem_netmod_func->vc_init(vc);
        MPIR_ERR_CHECK(mpi_errno);

/* FIXME: DARIUS -- enable this assert once these functions are implemented */
/*         /\* iStartContigMsg iSendContig and sendNoncontig_fn must */
/*            be set for nonlocal processes.  Default functions only */
/*            support shared-memory communication. *\/ */
/*         MPIR_Assert(vc_ch->iStartContigMsg && vc_ch->iSendContig && vc->sendNoncontig_fn); */

    }

    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* FIXME: ch3 assumes there is a field called sendq_head in the ch
       portion of the vc.  This is unused in nemesis and should be set
       to NULL */
    vc_ch->sendq_head = NULL;

    MPIR_CHKPMEM_COMMIT();
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_VC_INIT);
    return mpi_errno;
 fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}

int
MPID_nem_vc_destroy(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_VC_DESTROY);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_VC_DESTROY);

    MPL_free(vc_ch->pending_pkt);

    mpi_errno = MPID_nem_netmod_func->vc_destroy(vc);
    MPIR_ERR_CHECK(mpi_errno);

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_VC_DESTROY);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}

int
MPID_nem_get_business_card(int my_rank, char *value, int length)
{
    return MPID_nem_netmod_func->get_business_card (my_rank, &value, &length);
}

int MPID_nem_connect_to_root(const char *business_card, MPIDI_VC_t *new_vc)
{
    return MPID_nem_netmod_func->connect_to_root (business_card, new_vc);
}

/* get_local_procs() determines which processes are local and
   should use shared memory

   If an output variable pointer is NULL, it won't be set.

   Caller should NOT free any returned buffers.

   Note that this is really only a temporary solution as it only
   calculates these values for processes in MPI_COMM_WORLD, i.e., not
   for spawned or attached processes.
*/
static int get_local_procs(MPIDI_PG_t *pg, int our_pg_rank, int *num_local_p,
                           int **local_procs_p, int *local_rank_p)
{
    int mpi_errno = MPI_SUCCESS;
    int *procs;
    int i;
    int num_local = 0;
    int our_node_id;
    MPIR_CHKPMEM_DECL(1);

    MPIR_Assert(our_pg_rank < pg->size);
    our_node_id = pg->vct[our_pg_rank].node_id;

    MPIR_CHKPMEM_MALLOC(procs, int *, pg->size * sizeof(int), mpi_errno, "local process index array", MPL_MEM_ADDRESS);

    for (i = 0; i < pg->size; ++i) {
        if (our_node_id == pg->vct[i].node_id) {
            if (i == our_pg_rank && local_rank_p != NULL) {
                *local_rank_p = num_local;
            }
            procs[num_local] = i;
            ++num_local;
        }
    }

    MPIR_CHKPMEM_COMMIT();

    if (num_local_p != NULL)
        *num_local_p = num_local;
    if (local_procs_p != NULL)
        *local_procs_p = procs;
fn_exit:
    return mpi_errno;
fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}