/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpiimpl.h"
#include "mpid_nem_impl.h"
#include "mpid_nem_nets.h"
#include <errno.h>
#include "mpidi_nem_statistics.h"
#include "mpit.h"
#include "mpidu_init_shm.h"

/*
=== BEGIN_MPI_T_CVAR_INFO_BLOCK ===

categories:
    - name        : NEMESIS
      description : cvars that control behavior of the ch3:nemesis channel

cvars:
    - name        : MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ
      category    : NEMESIS
      type        : int
      default     : -1
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        This cvar controls the message size at which Nemesis
        switches from eager to rendezvous mode for shared memory.
        If this cvar is set to -1, then Nemesis will choose
        an appropriate value.

    - name        : MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ
      category    : NEMESIS
      type        : int
      default     : -2
      class       : none
      verbosity   : MPI_T_VERBOSITY_USER_BASIC
      scope       : MPI_T_SCOPE_ALL_EQ
      description : >-
        This cvar controls the message size at which Nemesis
        switches from eager to rendezvous mode for ready-send
        messages. If this cvar is set to -1, then ready messages
        will always be sent eagerly. If this cvar is set to -2,
        then Nemesis will choose an appropriate value.

=== END_MPI_T_CVAR_INFO_BLOCK ===
*/
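
/* Usage sketch (assuming the usual MPICH convention that each CVAR can be set
   through an environment variable of the same name):
       MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ=32768 mpiexec -n 2 ./a.out
   would make shared-memory messages larger than 32768 bytes take the
   rendezvous (LMT) path instead of the eager path. */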

/* constants for configure time selection of local LMT implementations */
#define MPID_NEM_LOCAL_LMT_NONE        0
#define MPID_NEM_LOCAL_LMT_SHM_COPY    1
#define MPID_NEM_LOCAL_LMT_DMA         2
#define MPID_NEM_LOCAL_LMT_VMSPLICE    3

#ifdef MEM_REGION_IN_HEAP
MPID_nem_mem_region_t *MPID_nem_mem_region_ptr = 0;
#else /* MEM_REGION_IN_HEAP */
MPID_nem_mem_region_t MPID_nem_mem_region;
#endif /* MEM_REGION_IN_HEAP */

char MPID_nem_hostname[MAX_HOSTNAME_LEN] = "UNKNOWN";

static int get_local_procs(MPIDI_PG_t *pg, int our_pg_rank, int *num_local_p,
                           int **local_procs_p, int *local_rank_p);

char *MPID_nem_asymm_base_addr = 0;

/* used by mpid_nem_inline.h and mpid_nem_finalize.c */
unsigned long long *MPID_nem_fbox_fall_back_to_queue_count = NULL;

static int MPID_nem_init_stats(int n_local_ranks)
{
    int mpi_errno = MPI_SUCCESS;

    if (ENABLE_PVAR_NEM) {
        MPID_nem_fbox_fall_back_to_queue_count = MPL_calloc(n_local_ranks, sizeof(unsigned long long), MPL_MEM_MPIT);
    }

    MPIR_T_PVAR_COUNTER_REGISTER_DYNAMIC(
        NEM,
        MPI_UNSIGNED_LONG_LONG,
        nem_fbox_fall_back_to_queue_count, /* name */
        MPID_nem_fbox_fall_back_to_queue_count, /* address */
        n_local_ranks, /* count, known at pvar registration time */
        MPI_T_VERBOSITY_USER_DETAIL,
        MPI_T_BIND_NO_OBJECT,
        MPIR_T_PVAR_FLAG_CONTINUOUS, /* flags */
        NULL, /* get_value */
        NULL, /* get_count */
        "NEMESIS", /* category */
        "Array counting how many times nemesis had to fall back to the regular queue when sending messages between pairs of local processes");

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
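
/* The counter array registered above is visible through the standard MPI_T
   performance-variable interface; a tool would typically locate it by name
   with MPI_T_pvar_get_info and read it via MPI_T_pvar_session_create /
   MPI_T_pvar_handle_alloc / MPI_T_pvar_read (assuming NEM PVARs are enabled
   in this build). */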

int
MPID_nem_init(int pg_rank, MPIDI_PG_t *pg_p, int has_parent ATTRIBUTE((unused)))
{
    int mpi_errno = MPI_SUCCESS;
    int tmp_mpi_errno;
    int num_procs = pg_p->size;
    int ret;
    int num_local = -1;
    int *local_procs = NULL;
    int local_rank = -1;
    int idx;
    int i;
    char *publish_bc_orig = NULL;
    char *bc_val = NULL;
    int val_max_remaining;
    int grank;
    size_t len;
    void *fastboxes_p = NULL;
    void *cells_p = NULL;
    MPID_nem_queue_t *recv_queues_p = NULL;
    MPID_nem_queue_t *free_queues_p = NULL;
    char strerrbuf[MPIR_STRERROR_BUF_SIZE];

    MPIR_CHKPMEM_DECL(8);

    /* TODO add compile-time asserts (rather than run-time) and convert most of these */

    /* Make sure the nemesis packet is no larger than the generic
       packet.  This is needed because we no longer include channel
       packet types in the CH3 packet types to allow dynamic channel
       loading. */
    MPIR_Assert(sizeof(MPIDI_CH3_nem_pkt_t) <= sizeof(MPIDI_CH3_Pkt_t));

    /* The MPID_nem_cell_rel_ptr_t defined in mpid_nem_datatypes.h
       should only contain a MPL_atomic_ptr_t.  This is to check that
       absolute pointers are exactly the same size as relative
       pointers. */
    MPIR_Assert(sizeof(MPID_nem_cell_rel_ptr_t) == sizeof(MPL_atomic_ptr_t));

    /* Make sure payload is aligned on 8-byte boundary */
    MPIR_Assert(MPID_NEM_ALIGNED(&((MPID_nem_cell_t*)0)->payload[0], 8));
    /* Make sure the padding to cacheline size in MPID_nem_queue_t works */
    MPIR_Assert(MPID_NEM_CACHE_LINE_LEN > 2 * sizeof(MPID_nem_cell_rel_ptr_t));

    /* Initialize the business card */
    mpi_errno = MPIDI_CH3I_BCInit( &bc_val, &val_max_remaining );
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);
    publish_bc_orig = bc_val;

    ret = gethostname (MPID_nem_hostname, MAX_HOSTNAME_LEN);
    MPIR_ERR_CHKANDJUMP2 (ret == -1, mpi_errno, MPI_ERR_OTHER, "**sock_gethost", "**sock_gethost %s %d",
                          MPIR_Strerror(errno, strerrbuf, MPIR_STRERROR_BUF_SIZE), errno);

    MPID_nem_hostname[MAX_HOSTNAME_LEN-1] = '\0';

    mpi_errno = get_local_procs(pg_p, pg_rank, &num_local, &local_procs, &local_rank);
    MPIR_ERR_CHECK(mpi_errno);

#ifdef MEM_REGION_IN_HEAP
    MPIR_CHKPMEM_MALLOC (MPID_nem_mem_region_ptr, MPID_nem_mem_region_t *, sizeof(MPID_nem_mem_region_t), mpi_errno, "mem_region", MPL_MEM_SHM);
#endif /* MEM_REGION_IN_HEAP */

    MPID_nem_mem_region.rank = pg_rank;
    MPID_nem_mem_region.num_local = num_local;
    MPID_nem_mem_region.num_procs = num_procs;
    MPID_nem_mem_region.local_procs = local_procs;
    MPID_nem_mem_region.local_rank = local_rank;
    MPIR_CHKPMEM_MALLOC (MPID_nem_mem_region.local_ranks, int *, num_procs * sizeof(int), mpi_errno, "mem_region local ranks", MPL_MEM_SHM);
    MPID_nem_mem_region.ext_procs = num_procs - num_local;
    if (MPID_nem_mem_region.ext_procs > 0)
        MPIR_CHKPMEM_MALLOC (MPID_nem_mem_region.ext_ranks, int *, MPID_nem_mem_region.ext_procs * sizeof(int), mpi_errno, "mem_region ext ranks", MPL_MEM_SHM);
    MPID_nem_mem_region.next = NULL;

    for (idx = 0; idx < num_procs; idx++)
    {
        MPID_nem_mem_region.local_ranks[idx] = MPID_NEM_NON_LOCAL;
    }
    for (idx = 0; idx < num_local; idx++)
    {
        grank = local_procs[idx];
        MPID_nem_mem_region.local_ranks[grank] = idx;
    }

    idx = 0;
    for (grank = 0; grank < num_procs; grank++)
    {
        if (!MPID_NEM_IS_LOCAL(grank))
        {
            MPID_nem_mem_region.ext_ranks[idx++] = grank;
        }
    }
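
    /* At this point local_ranks[] maps each global rank either to its index
       among the ranks on this node or to MPID_NEM_NON_LOCAL, and ext_ranks[]
       lists the global ranks that live on other nodes. */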

#ifdef FORCE_ASYM
    {
        /* This is used for debugging: each process allocates a different-sized
           piece of shared memory so that, when the shared-memory segment used
           for communication is allocated, it will probably be mapped at a
           different location in each process. */
        MPL_shm_hnd_t handle;
        int size = (local_rank * 65536) + 65536;
        char *base_addr;

        mpi_errno = MPL_shm_hnd_init(&handle);
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }

        mpi_errno = MPL_shm_seg_create_and_attach(handle, size, &base_addr, 0);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno)
        {
            MPL_shm_seg_remove(handle);
            MPL_shm_hnd_finalize(&handle);
            MPIR_ERR_POP (mpi_errno);
        }
        /* --END ERROR HANDLING-- */

        mpi_errno = MPL_shm_seg_remove(handle);
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno)
        {
            MPL_shm_hnd_finalize(&handle);
            MPIR_ERR_POP (mpi_errno);
        }
        /* --END ERROR HANDLING-- */

        MPL_shm_hnd_finalize(&handle);
    }
    /* fprintf(stderr, "[%i] -- address shift ok \n", pg_rank); */
#endif /* FORCE_ASYM */

    /* Initialize core shared memory segment */
    mpi_errno = MPIDU_Init_shm_init();
    MPIR_ERR_CHECK(mpi_errno);

    /* Request fastboxes region */
    size_t fbox_len = MPL_MAX((num_local*((num_local-1) * MPID_NEM_FBOX_LEN)),
                              MPID_NEM_ASYMM_NULL_VAL);

    /* Request data cells region */
    size_t cells_len = num_local * MPID_NEM_NUM_CELLS * MPID_NEM_CELL_LEN;

    /* Request free q region */
    size_t freeQ_len = num_local * sizeof(MPID_nem_queue_t);

    /* Request recv q region */
    size_t recvQ_len = num_local * sizeof(MPID_nem_queue_t);

    len = fbox_len + cells_len + freeQ_len + recvQ_len;

    /* Actually allocate the segment and assign regions to the pointers */
    mpi_errno = MPIDU_Init_shm_alloc(len, &MPID_nem_mem_region.shm_ptr);
    MPIR_ERR_CHECK(mpi_errno);

    /* check_alloc steps */
    if (MPIDU_Init_shm_is_symm(MPID_nem_mem_region.shm_ptr) == 1) {
        MPID_nem_asymm_base_addr = NULL;
    } else {
        MPID_nem_asymm_base_addr = MPID_nem_mem_region.shm_ptr;
#ifdef MPID_NEM_SYMMETRIC_QUEUES
        MPIR_ERR_INTERNALANDJUMP(mpi_errno, "queues are not symmetrically allocated as expected");
#endif
    }
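
    /* When the segment is not mapped at the same address in every local
       process, MPID_nem_asymm_base_addr records this process's base address;
       queue links are then kept as relative pointers (offsets from that base)
       so they remain valid in every process (see MPID_nem_cell_rel_ptr_t in
       mpid_nem_datatypes.h). */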

    fastboxes_p = (void *) MPID_nem_mem_region.shm_ptr;
    cells_p = (char *) fastboxes_p + fbox_len;
    free_queues_p = (MPID_nem_queue_t *) ((char *) cells_p + cells_len);
    recv_queues_p = (MPID_nem_queue_t *) ((char *) free_queues_p + freeQ_len);
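
    /* Layout of the single shared-memory segment carved up above:
       | fastboxes: num_local*(num_local-1) boxes | cells: num_local*MPID_NEM_NUM_CELLS | free queues | recv queues | */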

    /* local procs barrier */
    mpi_errno = MPIDU_Init_shm_barrier();
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);

    /* find our cell region */
    MPID_nem_mem_region.Elements = (void *) ((char *) cells_p + local_rank * MPID_NEM_NUM_CELLS * MPID_NEM_CELL_LEN);

    /* Tables of pointers to shared memory Qs */
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.FreeQ, MPID_nem_queue_ptr_t *, num_procs * sizeof(MPID_nem_queue_ptr_t), mpi_errno, "FreeQ", MPL_MEM_SHM);
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.RecvQ, MPID_nem_queue_ptr_t *, num_procs * sizeof(MPID_nem_queue_ptr_t), mpi_errno, "RecvQ", MPL_MEM_SHM);

    /* Init table entry for our Qs */
    MPID_nem_mem_region.FreeQ[pg_rank] = &free_queues_p[local_rank];
    MPID_nem_mem_region.RecvQ[pg_rank] = &recv_queues_p[local_rank];

    /* Init our queues */
    MPID_nem_queue_init(MPID_nem_mem_region.RecvQ[pg_rank]);
    MPID_nem_queue_init(MPID_nem_mem_region.FreeQ[pg_rank]);

    /* Init and enqueue our free cells */
    for (idx = 0; idx < MPID_NEM_NUM_CELLS; ++idx)
    {
        MPID_nem_cell_t *cell = (void *) ((char *) MPID_nem_mem_region.Elements + idx * MPID_NEM_CELL_LEN);
        MPID_nem_cell_init(cell);
        MPID_nem_queue_enqueue(MPID_nem_mem_region.FreeQ[pg_rank], cell);
    }

    mpi_errno = MPID_nem_coll_init();
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);

    /* This must be done before initializing the netmod so that the nemesis
       communicator creation hooks get registered (and therefore called) before
       the netmod hooks, giving the netmod an opportunity to override the
       nemesis collective function table. */
    mpi_errno = MPIDI_CH3U_Comm_register_create_hook(MPIDI_CH3I_comm_create, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* network init */
    if (MPID_nem_num_netmods)
    {
        mpi_errno = MPID_nem_choose_netmod();
        MPIR_ERR_CHECK(mpi_errno);
        mpi_errno = MPID_nem_netmod_func->init(pg_p, pg_rank, &bc_val, &val_max_remaining);
        MPIR_ERR_CHECK(mpi_errno);
    }

    /* Register destroy hooks after netmod init so the netmod hooks get called
       before the nemesis hooks. */
    mpi_errno = MPIDI_CH3U_Comm_register_destroy_hook(MPIDI_CH3I_comm_destroy, NULL);
    MPIR_ERR_CHECK(mpi_errno);

    /* set default route for external processes through network */
    for (idx = 0; idx < MPID_nem_mem_region.ext_procs; idx++)
    {
        grank = MPID_nem_mem_region.ext_ranks[idx];
        MPID_nem_mem_region.FreeQ[grank] = NULL;
        MPID_nem_mem_region.RecvQ[grank] = NULL;
    }

    /* set route for local procs through shmem */
    for (idx = 0; idx < num_local; idx++)
    {
        grank = local_procs[idx];
        MPID_nem_mem_region.FreeQ[grank] = &free_queues_p[idx];
        MPID_nem_mem_region.RecvQ[grank] = &recv_queues_p[idx];

        MPIR_Assert(MPID_NEM_ALIGNED(MPID_nem_mem_region.FreeQ[grank], MPID_NEM_CACHE_LINE_LEN));
        MPIR_Assert(MPID_NEM_ALIGNED(MPID_nem_mem_region.RecvQ[grank], MPID_NEM_CACHE_LINE_LEN));
    }
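
    /* Result: FreeQ[r] / RecvQ[r] point into the shared-memory queues for
       every on-node rank r and are NULL for off-node ranks, which are reached
       through the netmod instead. */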

    /* make pointers to our queues global so we don't have to dereference the array */
    MPID_nem_mem_region.my_freeQ = MPID_nem_mem_region.FreeQ[pg_rank];
    MPID_nem_mem_region.my_recvQ = MPID_nem_mem_region.RecvQ[pg_rank];

    /* local barrier */
    mpi_errno = MPIDU_Init_shm_barrier();
    MPIR_ERR_CHECK(mpi_errno);

    /* Allocate table of pointers to fastboxes */
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.mailboxes.in, MPID_nem_fastbox_t **, num_local * sizeof(MPID_nem_fastbox_t *), mpi_errno, "fastboxes", MPL_MEM_SHM);
    MPIR_CHKPMEM_MALLOC(MPID_nem_mem_region.mailboxes.out, MPID_nem_fastbox_t **, num_local * sizeof(MPID_nem_fastbox_t *), mpi_errno, "fastboxes", MPL_MEM_SHM);

    MPIR_Assert(num_local > 0);

#define MAILBOX_INDEX(sender, receiver) ( ((sender) > (receiver)) ? ((num_local-1) * (sender) + (receiver)) :          \
                                          (((sender) < (receiver)) ? ((num_local-1) * (sender) + ((receiver)-1)) : 0) )
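
    /* MAILBOX_INDEX assigns every ordered pair (sender, receiver) with
       sender != receiver a unique slot in [0, num_local*(num_local-1)).
       For example, with num_local == 3:
       (0,1)->0, (0,2)->1, (1,0)->2, (1,2)->3, (2,0)->4, (2,1)->5. */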

    /* fill in tables */
    for (i = 0; i < num_local; ++i)
    {
        if (i == local_rank)
        {
            /* No fastboxes to myself */
            MPID_nem_mem_region.mailboxes.in [i] = NULL;
            MPID_nem_mem_region.mailboxes.out[i] = NULL;
        }
        else
        {
            MPID_nem_mem_region.mailboxes.in [i] = (void *) ((char *) fastboxes_p + (MAILBOX_INDEX(i, local_rank)) * MPID_NEM_FBOX_LEN);
            MPID_nem_mem_region.mailboxes.out[i] = (void *) ((char *) fastboxes_p + (MAILBOX_INDEX(local_rank, i)) * MPID_NEM_FBOX_LEN);
            MPL_atomic_relaxed_store_int(&MPID_nem_mem_region.mailboxes.in [i]->flag, 0);
            MPL_atomic_relaxed_store_int(&MPID_nem_mem_region.mailboxes.out[i]->flag, 0);
        }
    }
#undef MAILBOX_INDEX

    /* setup local LMT */
#if MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_SHM_COPY
    MPID_nem_local_lmt_progress = MPID_nem_lmt_shm_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_DMA
    MPID_nem_local_lmt_progress = MPID_nem_lmt_dma_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_VMSPLICE
    MPID_nem_local_lmt_progress = MPID_nem_lmt_vmsplice_progress;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_NONE
    MPID_nem_local_lmt_progress = NULL;
#else
# error Must select a valid local LMT implementation!
#endif

    /* publish business card */
    mpi_errno = MPIDI_PG_SetConnInfo(pg_rank, (const char *)publish_bc_orig);
    MPIR_ERR_CHECK(mpi_errno);
    MPL_free(publish_bc_orig);

    mpi_errno = MPIDU_Init_shm_barrier();
    MPIR_ERR_CHECK(mpi_errno);
    mpi_errno = MPID_nem_mpich_init();
    MPIR_ERR_CHECK(mpi_errno);
    mpi_errno = MPIDU_Init_shm_barrier();
    MPIR_ERR_CHECK(mpi_errno);
#ifdef ENABLE_CHECKPOINTING
    mpi_errno = MPIDI_nem_ckpt_init();
    MPIR_ERR_CHECK(mpi_errno);
#endif

#ifdef PAPI_MONITOR
    my_papi_start( pg_rank );
#endif /* PAPI_MONITOR */

    mpi_errno = MPID_nem_init_stats(num_local);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_CHKPMEM_COMMIT();
  fn_exit:
    /* we do not want to lose a potentially failed errno */
    tmp_mpi_errno = MPIDU_Init_shm_finalize();
    MPIR_ERR_ADD(mpi_errno, tmp_mpi_errno);
    return mpi_errno;
  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}

/* MPID_nem_vc_init initializes the nemesis part of the vc */
int
MPID_nem_vc_init (MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_VC_INIT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_VC_INIT);

    vc_ch->pkt_handler = NULL;
    vc_ch->num_pkt_handlers = 0;

    vc_ch->send_seqno = 0;
#ifdef ENABLE_CHECKPOINTING
    vc_ch->ckpt_msg_len = 0;
    vc_ch->ckpt_msg_buf = NULL;
    vc_ch->ckpt_pause_send_vc = NULL;
    vc_ch->ckpt_continue_vc = NULL;
    vc_ch->ckpt_restart_vc = NULL;
#endif
    vc_ch->pending_pkt_len = 0;
    MPIR_CHKPMEM_MALLOC (vc_ch->pending_pkt, MPIDI_CH3_Pkt_t *, sizeof (MPIDI_CH3_Pkt_t), mpi_errno, "pending_pkt", MPL_MEM_BUFFER);

    /* We do different things for vcs in the COMM_WORLD pg vs. other pgs:
       COMM_WORLD vcs may use shared memory and already have queues allocated. */
    if (vc->lpid < MPID_nem_mem_region.num_procs)
    {
        /* This vc is in COMM_WORLD */
        vc_ch->is_local = MPID_NEM_IS_LOCAL (vc->lpid);
        vc_ch->free_queue = MPID_nem_mem_region.FreeQ[vc->lpid]; /* networks and local procs have free queues */
    }
    else
    {
        /* this vc is the result of a connect */
        vc_ch->is_local = 0;
        vc_ch->free_queue = NULL;
    }

    /* MT we acquire the LMT CS here, b/c there is at least a theoretical race
     * on some fields, such as lmt_copy_buf.  In practice it's not an issue, but
     * this will keep DRD happy. */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* override rendezvous functions */
    vc->rndvSend_fn = MPID_nem_lmt_RndvSend;
    vc->rndvRecv_fn = MPID_nem_lmt_RndvRecv;

    if (vc_ch->is_local)
    {
        MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

        vc_ch->fbox_out = MPID_nem_mem_region.mailboxes.out[MPID_nem_mem_region.local_ranks[vc->lpid]];
        vc_ch->fbox_in = MPID_nem_mem_region.mailboxes.in[MPID_nem_mem_region.local_ranks[vc->lpid]];
        vc_ch->recv_queue = MPID_nem_mem_region.RecvQ[vc->lpid];

        /* override noncontig send function */
        vc->sendNoncontig_fn = MPIDI_CH3I_SendNoncontig;

        /* local processes use the default method */
        vc_ch->iStartContigMsg = NULL;
        vc_ch->iSendContig = NULL;
        vc_ch->iSendIov = NULL;

#if MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_SHM_COPY
        vc_ch->lmt_initiate_lmt = MPID_nem_lmt_shm_initiate_lmt;
        vc_ch->lmt_start_recv = MPID_nem_lmt_shm_start_recv;
        vc_ch->lmt_start_send = MPID_nem_lmt_shm_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_shm_handle_cookie;
        vc_ch->lmt_done_send = MPID_nem_lmt_shm_done_send;
        vc_ch->lmt_done_recv = MPID_nem_lmt_shm_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_shm_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_DMA
        vc_ch->lmt_initiate_lmt = MPID_nem_lmt_dma_initiate_lmt;
        vc_ch->lmt_start_recv = MPID_nem_lmt_dma_start_recv;
        vc_ch->lmt_start_send = MPID_nem_lmt_dma_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_dma_handle_cookie;
        vc_ch->lmt_done_send = MPID_nem_lmt_dma_done_send;
        vc_ch->lmt_done_recv = MPID_nem_lmt_dma_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_dma_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_VMSPLICE
        vc_ch->lmt_initiate_lmt = MPID_nem_lmt_vmsplice_initiate_lmt;
        vc_ch->lmt_start_recv = MPID_nem_lmt_vmsplice_start_recv;
        vc_ch->lmt_start_send = MPID_nem_lmt_vmsplice_start_send;
        vc_ch->lmt_handle_cookie = MPID_nem_lmt_vmsplice_handle_cookie;
        vc_ch->lmt_done_send = MPID_nem_lmt_vmsplice_done_send;
        vc_ch->lmt_done_recv = MPID_nem_lmt_vmsplice_done_recv;
        vc_ch->lmt_vc_terminated = MPID_nem_lmt_vmsplice_vc_terminated;
#elif MPID_NEM_LOCAL_LMT_IMPL == MPID_NEM_LOCAL_LMT_NONE
        vc_ch->lmt_initiate_lmt = NULL;
        vc_ch->lmt_start_recv = NULL;
        vc_ch->lmt_start_send = NULL;
        vc_ch->lmt_handle_cookie = NULL;
        vc_ch->lmt_done_send = NULL;
        vc_ch->lmt_done_recv = NULL;
        vc_ch->lmt_vc_terminated = NULL;
#else
# error Must select a valid local LMT implementation!
#endif

        vc_ch->lmt_copy_buf = NULL;
        mpi_errno = MPL_shm_hnd_init(&(vc_ch->lmt_copy_buf_handle));
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }
        mpi_errno = MPL_shm_hnd_init(&(vc_ch->lmt_recv_copy_buf_handle));
        if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); }
        vc_ch->lmt_queue.head = NULL;
        vc_ch->lmt_queue.tail = NULL;
        vc_ch->lmt_active_lmt = NULL;
        vc_ch->lmt_enqueued = FALSE;
        vc_ch->lmt_rts_queue.head = NULL;
        vc_ch->lmt_rts_queue.tail = NULL;

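        /* Default eager threshold for shared memory: a message that fits,
           together with the CH3 packet header, in a single cell payload
           (MPID_NEM_MPICH_DATA_LEN) is sent eagerly; larger messages go
           through LMT. */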
        if (MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ == -1)
            vc->eager_max_msg_sz = MPID_NEM_MPICH_DATA_LEN - sizeof(MPIDI_CH3_Pkt_t);
        else
            vc->eager_max_msg_sz = MPIR_CVAR_NEMESIS_SHM_EAGER_MAX_SZ;

        if (MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ == -2)
            vc->ready_eager_max_msg_sz = vc->eager_max_msg_sz; /* default: ready sends follow the normal eager threshold, so large local ready sends still use LMT */
        else
            vc->ready_eager_max_msg_sz = MPIR_CVAR_NEMESIS_SHM_READY_EAGER_MAX_SZ;

        MPL_DBG_MSG(MPIDI_CH3_DBG_VC, VERBOSE, "vc using shared memory");
    }
    else
    {
        vc_ch->fbox_out = NULL;
        vc_ch->fbox_in = NULL;
        vc_ch->recv_queue = NULL;

        vc_ch->lmt_initiate_lmt = NULL;
        vc_ch->lmt_start_recv = NULL;
        vc_ch->lmt_start_send = NULL;
        vc_ch->lmt_handle_cookie = NULL;
        vc_ch->lmt_done_send = NULL;
        vc_ch->lmt_done_recv = NULL;
        vc_ch->lmt_vc_terminated = NULL;

        /* FIXME: DARIUS set these to default for now */
        vc_ch->iStartContigMsg = NULL;
        vc_ch->iSendContig = NULL;

        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_VC, VERBOSE, (MPL_DBG_FDEST, "vc using %s netmod for rank %d pg %s",
                                                    MPID_nem_netmod_strings[MPID_nem_netmod_id], vc->pg_rank,
                                                    ((vc->pg == MPIDI_Process.my_pg)
                                                     ? "my_pg"
                                                     : ((vc->pg)
                                                        ? ((char *)vc->pg->id)
                                                        : "unknown"
                                                        )
                                                     )
                                                    ));

        mpi_errno = MPID_nem_netmod_func->vc_init(vc);
        MPIR_ERR_CHECK(mpi_errno);

        /* FIXME: DARIUS -- enable this assert once these functions are implemented */
        /* /\* iStartContigMsg iSendContig and sendNoncontig_fn must */
        /*    be set for nonlocal processes. Default functions only */
        /*    support shared-memory communication. *\/ */
        /* MPIR_Assert(vc_ch->iStartContigMsg && vc_ch->iSendContig && vc->sendNoncontig_fn); */

    }

    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* FIXME: ch3 assumes there is a field called sendq_head in the ch
       portion of the vc.  This is unused in nemesis and should be set
       to NULL */
    vc_ch->sendq_head = NULL;

    MPIR_CHKPMEM_COMMIT();
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_VC_INIT);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}

int
MPID_nem_vc_destroy(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *vc_ch = &vc->ch;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_VC_DESTROY);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_VC_DESTROY);

    MPL_free(vc_ch->pending_pkt);

    mpi_errno = MPID_nem_netmod_func->vc_destroy(vc);
    MPIR_ERR_CHECK(mpi_errno);

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_VC_DESTROY);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

int
MPID_nem_get_business_card (int my_rank, char *value, int length)
{
    return MPID_nem_netmod_func->get_business_card (my_rank, &value, &length);
}

int MPID_nem_connect_to_root (const char *business_card, MPIDI_VC_t *new_vc)
{
    return MPID_nem_netmod_func->connect_to_root (business_card, new_vc);
}

/* get_local_procs() determines which processes are local and
   should use shared memory.

   If an output variable pointer is NULL, it won't be set.

   The caller should NOT free any returned buffers.

   Note that this is really only a temporary solution as it only
   calculates these values for processes in MPI_COMM_WORLD, i.e., not for
   spawned or attached processes.
*/
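/* For example (a hypothetical layout), with pg->size == 4 and ranks 0 and 2 on
   our node: num_local == 2, local_procs == {0, 2}, and rank 2 gets
   local_rank == 1. */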
static int get_local_procs(MPIDI_PG_t *pg, int our_pg_rank, int *num_local_p,
                           int **local_procs_p, int *local_rank_p)
{
    int mpi_errno = MPI_SUCCESS;
    int *procs;
    int i;
    int num_local = 0;
    int our_node_id;
    MPIR_CHKPMEM_DECL(1);

    MPIR_Assert(our_pg_rank < pg->size);
    our_node_id = pg->vct[our_pg_rank].node_id;

    MPIR_CHKPMEM_MALLOC(procs, int *, pg->size * sizeof(int), mpi_errno, "local process index array", MPL_MEM_ADDRESS);

    for (i = 0; i < pg->size; ++i) {
        if (our_node_id == pg->vct[i].node_id) {
            if (i == our_pg_rank && local_rank_p != NULL) {
                *local_rank_p = num_local;
            }
            procs[num_local] = i;
            ++num_local;
        }
    }

    MPIR_CHKPMEM_COMMIT();

    if (num_local_p != NULL)
        *num_local_p = num_local;
    if (local_procs_p != NULL)
        *local_procs_p = procs;
  fn_exit:
    return mpi_errno;
  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}