1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 /*
7  * WARNING: Functions and macros in this file are for internal use only.
8  * As such, they are only visible to the device and
9  * channel.  Do not include them in the MPID macros.
10  */
11 
12 #ifndef MPIDIMPL_H_INCLUDED
13 #define MPIDIMPL_H_INCLUDED
14 
#include "mpichconf.h"

#include <stddef.h>
#if defined(HAVE_ASSERT_H)
#include <assert.h>
#endif

#include "mpiimpl.h"

/* Add the ch3 packet definitions */
#include "mpidpkt.h"
25 
/* Default for MPIDI_IOV_DENSITY_MIN (16 KiB); a definition supplied before
   this point takes precedence. */
#ifndef MPIDI_IOV_DENSITY_MIN
#define MPIDI_IOV_DENSITY_MIN (16 * 1024)
#endif

/* Declare gethostname() ourselves only when HAVE_GETHOSTNAME and
   NEEDS_GETHOSTNAME_DECL are both set and gethostname is not a macro. */
#if defined(HAVE_GETHOSTNAME) && defined(NEEDS_GETHOSTNAME_DECL)
#ifndef gethostname
int gethostname(char *name, size_t len);
#endif
#endif

/* Default PMI version (major.minor) to use */
#define MPIDI_CH3I_DEFAULT_PMI_VERSION 1
#define MPIDI_CH3I_DEFAULT_PMI_SUBVERSION 1
38 
39 /* group of processes detected to have failed.  This is a subset of
40    comm_world group. */
41 extern MPIR_Group *MPIDI_Failed_procs_group;
42 extern int MPIDI_last_known_failed;
43 extern char *MPIDI_failed_procs_string;
44 
/* Debug logging classes used by CH3; they exist only when MPL debug
   logging is compiled in. */
#ifdef MPL_USE_DBG_LOGGING
extern MPL_dbg_class MPIDI_CH3_DBG_CONNECT;
extern MPL_dbg_class MPIDI_CH3_DBG_DISCONNECT;
extern MPL_dbg_class MPIDI_CH3_DBG_PROGRESS;
extern MPL_dbg_class MPIDI_CH3_DBG_CHANNEL;
extern MPL_dbg_class MPIDI_CH3_DBG_OTHER;
extern MPL_dbg_class MPIDI_CH3_DBG_MSG;
extern MPL_dbg_class MPIDI_CH3_DBG_VC;
extern MPL_dbg_class MPIDI_CH3_DBG_REFCOUNT;
#endif /* MPL_USE_DBG_LOGGING */
55 
/* Switch a VC to MPIDI_VC_STATE_<new_state_>, handing the transition to
   MPL_DBG_VCSTATECHANGE first.  new_state_ is the bare state suffix (for
   example ACTIVE): it is token-pasted, so it cannot be an expression. */
#define MPIDI_CHANGE_VC_STATE(vc_, new_state_)                          \
    do {                                                                \
        MPL_DBG_VCSTATECHANGE(vc_, VC_STATE_##new_state_);              \
        (vc_)->state = MPIDI_VC_STATE_##new_state_;                     \
    } while (0)
60 
61 /*S
62   MPIDI_PG_t - Process group description
63 
64   Notes:
65   Every 'MPI_COMM_WORLD' known to this process has an associated process
66   group.
67   S*/
68 typedef struct MPIDI_PG
69 {
70     /* MPIU_Object field.  MPIDI_PG_t objects are not allocated using the
71        MPIU_Object system, but we do use the associated reference counting
72        routines.  Therefore, handle must be present, but is not used
73        except by debugging routines */
74     MPIR_OBJECT_HEADER; /* adds handle and ref_count fields */
75 
76     /* Next pointer used to maintain a list of all process groups known to
77        this process */
78     struct MPIDI_PG * next;
79 
80     /* Number of processes in the process group */
81     int size;
82 
83     /* VC table.  At present this is a pointer to an array of VC structures.
84        Someday we may want make this a pointer to an array
85        of VC references.  Thus, it is important to use MPIDI_PG_Get_vc()
86        instead of directly referencing this field. */
87     struct MPIDI_VC * vct;
88 
89     /* Pointer to the process group ID.  The actual ID is defined and
90        allocated by the process group.  The pointer is kept in the
91        device space because it is necessary for the device to be able to
92        find a particular process group. */
93     void * id;
94 
95     /* Flag to mark a procress group which is finalizing. This means thay
96        the VCs for this process group are closing, (normally becuase
97        MPI_Finalize was called). This is required to avoid a reconnection
98        of the VCs when the PG is closed due to unused elements in the event
99        queue  */
100     int finalize;
101 
102     /* Replacement abstraction for connection information */
103     /* Connection information needed to access processes in this process
104        group and to share the data with other processes.  The items are
105        connData - pointer for data used to implement these functions
106                   (e.g., a pointer to an array of process group info)
107        getConnInfo( rank, buf, bufsize, self ) - function to store into
108                   buf the connection information for rank in this process
109                   group
110        connInfoToString( buf_p, size, self ) - return in buf_p a string
111                   that can be sent to another process to recreate the
112                   connection information (the info needed to support
113                   getConnInfo)
114        connInfoFromString( buf, self ) - setup the information needed
115                   to implement getConnInfo
116        freeConnInfo( self ) - free any storage or resources associated
117                   with the connection information.
118 
119        See ch3/src/mpidi_pg.c
120     */
121     void *connData;
122     int  (*getConnInfo)( int, char *, int, struct MPIDI_PG * );
123     int  (*connInfoToString)( char **, int *, struct MPIDI_PG * );
124     int  (*connInfoFromString)( const char *,  struct MPIDI_PG * );
125     int  (*freeConnInfo)( struct MPIDI_PG * );
126 
127     /* Rather than have each channel define its own fields for the
128        channel-specific data, we provide a fixed-sized scratchpad.  Currently,
129        this has a very generous size, though this may shrink later (a channel
130        can always allocate storage and hang it off of the end).  This
131        is necessary to allow dynamic loading of channels at MPI_Init time. */
132 #define MPIDI_CH3_PG_SIZE 48
133     int32_t channel_private[MPIDI_CH3_PG_SIZE];
134 #if defined(MPIDI_CH3_PG_DECL)
135     MPIDI_CH3_PG_DECL
136 #endif
137 }
138 MPIDI_PG_t;
139 
140 
141 /*S
142   MPIDI_Process_t - The information required about this process by the CH3
143   device.
144 
145   S*/
146 typedef struct MPIDI_Process
147 {
148     MPIDI_PG_t * my_pg;
149     int my_pg_rank;
150 }
151 MPIDI_Process_t;
152 
153 extern MPIDI_Process_t MPIDI_Process;
154 
155 /*----------------------
156   BEGIN DATATYPE SECTION
157   ----------------------*/
/* FIXME: We want to avoid even storing information about the builtins
   if we can */
/* Fill in the caller's variables describing count_ items of datatype_:
     dt_ptr_        - NULL for a builtin handle, otherwise the datatype
                      object obtained from MPIR_Datatype_get_ptr
     dt_contig_out_ - TRUE for a builtin, otherwise the answer from
                      MPIR_Datatype_is_contig
     data_sz_out_   - count_ times the element size, computed as intptr_t
     dt_true_lb_    - 0 for a builtin, otherwise dt_ptr_->true_lb
   All output arguments must be lvalues; datatype_ is evaluated more than
   once. */
#define MPIDI_Datatype_get_info(count_, datatype_, dt_contig_out_, data_sz_out_, dt_ptr_, dt_true_lb_) \
{                                                                       \
    if (HANDLE_IS_BUILTIN(datatype_)) {                                 \
        (dt_ptr_) = NULL;                                               \
        (dt_contig_out_) = TRUE;                                        \
        (dt_true_lb_) = 0;                                              \
        (data_sz_out_) = (intptr_t) (count_) * MPIR_Datatype_get_basic_size(datatype_); \
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER, TERSE,                     \
                        (MPL_DBG_FDEST,                                 \
                         "basic datatype: dt_contig=%d, dt_sz=%d, data_sz=%" PRIdPTR, \
                         (dt_contig_out_),                              \
                         MPIR_Datatype_get_basic_size(datatype_),       \
                         (data_sz_out_)));                              \
    } else {                                                            \
        MPIR_Datatype_get_ptr((datatype_), (dt_ptr_));                  \
        MPIR_Datatype_is_contig((datatype_), &(dt_contig_out_));        \
        (data_sz_out_) = (intptr_t) (count_) * (dt_ptr_)->size;         \
        (dt_true_lb_) = (dt_ptr_)->true_lb;                             \
        MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER, TERSE,                     \
                        (MPL_DBG_FDEST,                                 \
                         "user defined datatype: dt_contig=%d, dt_sz=" MPI_AINT_FMT_DEC_SPEC ", data_sz=%" PRIdPTR, \
                         (dt_contig_out_),                              \
                         (dt_ptr_)->size,                               \
                         (data_sz_out_)));                              \
    }                                                                   \
}
181 /*--------------------
182   END DATATYPE SECTION
183   --------------------*/
184 
185 
186 /*---------------------
187   BEGIN REQUEST SECTION
188   ---------------------*/
189 
190 /*
191  * MPIR_Requests
192  *
193  * MPI Requests are handles to MPIR_Request structures.  These are used
194  * for most communication operations to provide a uniform way in which to
195  * define pending operations.  As such, they contain many fields that are
196  * only used by some operations (logically, an MPIR_Request is a union type).
197  *
198  * There are several kinds of requests.  They are
199  *    Send, Receive, RMA, User, Persistent
200  * In addition, send and RMA requests may be "incomplete"; this means that
201  * they have not sent their initial packet, and they may store additional
202  * data about the operation that will be used when the initial packet
203  * can be sent.
204  *
205  * Also, requests that are used internally within blocking MPI routines
206  * (only Send and Receive requests) do not require references to
207  * (or increments of the reference counts) communicators or datatypes.
208  * Thus, freeing these requests also does not require testing or
209  * decrementing these fields.
210  *
211  * Finally, we want to avoid multiple tests for a failure to allocate
212  * a request.  Thus, the request allocation macros will jump to fn_fail
213  * if there is an error.  This is akin to using a "throw" in C++.
214  *
215  * For example, a posted (unmatched) receive queue entry needs only:
216  *     match info
217  *     buffer info (address, count, datatype)
218  *     if nonblocking, communicator (used for finding error handler)
219  *     if nonblocking, cancelled state
220  * Once matched, a receive queue entry also needs
221  *     actual match info
222  *     message type (eager, rndv, eager-sync)
223  *     completion state (is all data available)
224  *        If destination datatype is non-contiguous, it also needs
225  *        current unpack state.
226  * An unexpected message (in the unexpected receive queue) needs only:
227  *     match info
228  *     message type (eager, rndv, eager-sync)
229  *     if (eager, eager-sync), data
230  *     completion state (is all data available?)
231  * A send request requires only
232  *     message type (eager, rndv, eager-sync)
233  *     completion state (has all data been sent?)
234  *     canceled state
235  *     if nonblocking, communicator (used for finding error handler)
236  *     if the initial envelope is still pending (e.g., could not write yet)
237  *         match info
238  *     if the data is still pending (rndv or would not send eager)
239  *         buffer info (address, count, datatype)
240  * RMA requests require (what)?
241  * User (generalized) requests require
242  *     function pointers for operations
243  *     completion state
244  *     cancelled state
245  */
246 
/* Channels may define MPIDI_CH3_REQUEST_INIT to initialize their part of a
   request; when they do not, fall back to a macro that expands to nothing. */
#if !defined(MPIDI_CH3_REQUEST_INIT)
#   define MPIDI_CH3_REQUEST_INIT(a_)
#endif
252 
/* FIXME: Why does a send request need the match information?
   Is that for debugging information?  In case the initial envelope
   cannot be sent? Ditto for the dev.user_buf, count, and datatype
   fields when the data is sent eagerly.

   The following fields needed to be set:
   datatype_ptr
   status.MPI_ERROR

   Note that this macro requires that rank, tag, context_offset,
   comm, buf, datatype, and count all be available with those names
   (they are not arguments to the routine)

   Arguments:
     sreq_      - lvalue that receives the new send request
     mpi_errno_ - lvalue set to MPI_ERR_OTHER when no request could be
                  allocated; left untouched on success
     FAIL_      - statement executed on allocation failure (the request
                  allocation macros are expected to jump, e.g. a goto)

   On allocation failure sreq_ is NULL and neither the request nor comm is
   touched.  On success the request starts with a reference count of 2 and
   holds a new reference on comm.
*/
#define MPIDI_Request_create_sreq(sreq_, mpi_errno_, FAIL_)             \
{                                                                       \
    (sreq_) = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);             \
    if ((sreq_) == NULL) {                                              \
        (mpi_errno_) = MPI_ERR_OTHER;                                   \
        FAIL_;                                                          \
    } else {                                                            \
        MPIR_Object_set_ref((sreq_), 2);                                \
        (sreq_)->comm = comm;                                           \
        (sreq_)->dev.partner_request = NULL;                            \
        MPIR_Comm_add_ref(comm);                                        \
        (sreq_)->dev.match.parts.rank = rank;                           \
        (sreq_)->dev.match.parts.tag = tag;                             \
        (sreq_)->dev.match.parts.context_id = comm->context_id + context_offset; \
        (sreq_)->dev.user_buf = (void *) buf;                           \
        (sreq_)->dev.user_count = count;                                \
        (sreq_)->dev.datatype = datatype;                               \
        (sreq_)->dev.iov_count = 0;                                     \
    }                                                                   \
}
281 
/* This is the receive request version of MPIDI_Request_create_sreq.

   Arguments:
     rreq_      - lvalue that receives the new receive request
     mpi_errno_ - lvalue set to MPI_ERR_OTHER when no request could be
                  allocated; left untouched on success
     FAIL_      - statement executed on allocation failure (expected to
                  jump, e.g. a goto)

   On allocation failure rreq_ is NULL.  On success the request starts
   with a reference count of 2 and no partner request. */
#define MPIDI_Request_create_rreq(rreq_, mpi_errno_, FAIL_)             \
{                                                                       \
    (rreq_) = MPIR_Request_create(MPIR_REQUEST_KIND__RECV);             \
    if ((rreq_) == NULL) {                                              \
        (mpi_errno_) = MPI_ERR_OTHER;                                   \
        FAIL_;                                                          \
    } else {                                                            \
        MPIR_Object_set_ref((rreq_), 2);                                \
        (rreq_)->dev.partner_request = NULL;                            \
    }                                                                   \
}
289 
/* Message-type field of dev.state: two bits starting at bit 0 */
#define MPIDI_REQUEST_MSG_SHIFT 0
#define MPIDI_REQUEST_MSG_MASK (0x3 << MPIDI_REQUEST_MSG_SHIFT)

/* Values stored in the message-type field */
#define MPIDI_REQUEST_NO_MSG 0
#define MPIDI_REQUEST_EAGER_MSG 1
#define MPIDI_REQUEST_RNDV_MSG 2
#define MPIDI_REQUEST_SELF_MSG 3

/* Read the message-type field of a request */
#define MPIDI_Request_get_msg_type(r_) \
    (((r_)->dev.state & MPIDI_REQUEST_MSG_MASK) >> MPIDI_REQUEST_MSG_SHIFT)

/* Overwrite the message-type field; the field is cleared first and bits
   of type_ that do not fit in it are dropped */
#define MPIDI_Request_set_msg_type(r_, type_)                           \
{                                                                       \
    (r_)->dev.state &= ~MPIDI_REQUEST_MSG_MASK;                         \
    (r_)->dev.state |=                                                  \
        ((type_) << MPIDI_REQUEST_MSG_SHIFT) & MPIDI_REQUEST_MSG_MASK;  \
}
305 
/* SRBuf flag of dev.state: a single bit at position 2 */
#define MPIDI_REQUEST_SRBUF_SHIFT 2
#define MPIDI_REQUEST_SRBUF_MASK (0x1 << MPIDI_REQUEST_SRBUF_SHIFT)

/* Read the SRBuf flag (0 or 1) */
#define MPIDI_Request_get_srbuf_flag(r_) \
    (((r_)->dev.state & MPIDI_REQUEST_SRBUF_MASK) >> MPIDI_REQUEST_SRBUF_SHIFT)

/* Store the low bit of val_ in the SRBuf flag */
#define MPIDI_Request_set_srbuf_flag(r_, val_)                          \
{                                                                       \
    (r_)->dev.state &= ~MPIDI_REQUEST_SRBUF_MASK;                       \
    (r_)->dev.state |=                                                  \
        ((val_) << MPIDI_REQUEST_SRBUF_SHIFT) & MPIDI_REQUEST_SRBUF_MASK; \
}
317 
/* Synchronous-send flag of dev.state: a single bit at position 3 */
#define MPIDI_REQUEST_SYNC_SEND_SHIFT 3
#define MPIDI_REQUEST_SYNC_SEND_MASK (0x1 << MPIDI_REQUEST_SYNC_SEND_SHIFT)

/* Read the synchronous-send flag (0 or 1) */
#define MPIDI_Request_get_sync_send_flag(r_) \
    (((r_)->dev.state & MPIDI_REQUEST_SYNC_SEND_MASK) >> MPIDI_REQUEST_SYNC_SEND_SHIFT)

/* Store the low bit of val_ in the synchronous-send flag */
#define MPIDI_Request_set_sync_send_flag(r_, val_)                      \
{                                                                       \
    (r_)->dev.state &= ~MPIDI_REQUEST_SYNC_SEND_MASK;                   \
    (r_)->dev.state |=                                                  \
        ((val_) << MPIDI_REQUEST_SYNC_SEND_SHIFT) & MPIDI_REQUEST_SYNC_SEND_MASK; \
}
329 
/* Request-type field of dev.state: four bits starting at bit 4 */
#define MPIDI_REQUEST_TYPE_SHIFT 4
#define MPIDI_REQUEST_TYPE_MASK (0xF << MPIDI_REQUEST_TYPE_SHIFT)

/* Point-to-point request types */
#define MPIDI_REQUEST_TYPE_RECV 0
#define MPIDI_REQUEST_TYPE_SEND 1
#define MPIDI_REQUEST_TYPE_RSEND 2
#define MPIDI_REQUEST_TYPE_SSEND 3
/* We need a BSEND type for persistent bsends (see mpid_startall.c) */
#define MPIDI_REQUEST_TYPE_BSEND 4

/* RMA request types, all describing what the target is doing */
#define MPIDI_REQUEST_TYPE_PUT_RECV 5   /* receiving PUT data */
#define MPIDI_REQUEST_TYPE_GET_RESP 6   /* sending GET response data */
#define MPIDI_REQUEST_TYPE_ACCUM_RECV 7 /* receiving ACC data */
#define MPIDI_REQUEST_TYPE_PUT_RECV_DERIVED_DT 8        /* receiving derived DT info for PUT data */
#define MPIDI_REQUEST_TYPE_GET_RECV_DERIVED_DT 9        /* receiving derived DT info for GET data */
#define MPIDI_REQUEST_TYPE_ACCUM_RECV_DERIVED_DT 10     /* receiving derived DT info for ACC data */
#define MPIDI_REQUEST_TYPE_GET_ACCUM_RECV 11    /* receiving GACC data */
#define MPIDI_REQUEST_TYPE_GET_ACCUM_RECV_DERIVED_DT 12 /* receiving derived DT info for GACC data */
#define MPIDI_REQUEST_TYPE_GET_ACCUM_RESP 13    /* sending GACC response data */
#define MPIDI_REQUEST_TYPE_FOP_RECV 14  /* receiving FOP data */
#define MPIDI_REQUEST_TYPE_FOP_RESP 15  /* sending FOP response data */

/* Read the request-type field */
#define MPIDI_Request_get_type(r_) \
    (((r_)->dev.state & MPIDI_REQUEST_TYPE_MASK) >> MPIDI_REQUEST_TYPE_SHIFT)

/* Overwrite the request-type field; the field is cleared first and bits
   of kind_ that do not fit in it are dropped */
#define MPIDI_Request_set_type(r_, kind_)                               \
{                                                                       \
    (r_)->dev.state &= ~MPIDI_REQUEST_TYPE_MASK;                        \
    (r_)->dev.state |=                                                  \
        ((kind_) << MPIDI_REQUEST_TYPE_SHIFT) & MPIDI_REQUEST_TYPE_MASK; \
}
359 
/* NOTE: Request updates may require atomic ops (critical sections) if
   a fine-grain thread-sync model is used. */
/* Report through was_pending_ whether a cancel was already pending on the
   request, then mark the request as cancel-pending. */
#define MPIDI_Request_cancel_pending(r_, was_pending_)                  \
{                                                                       \
    *(was_pending_) = (r_)->dev.cancel_pending;                         \
    (r_)->dev.cancel_pending = TRUE;                                    \
}
367 
/* the following two macros were formerly a single confusing macro with side
   effects named MPIDI_Request_recv_pending() */

/* Copy the request's current recv_pending_count into *count_out_ */
#define MPIDI_Request_check_pending(r_, count_out_)                     \
    do { *(count_out_) = (r_)->dev.recv_pending_count; } while (0)

/* Decrement the request's recv_pending_count by one */
#define MPIDI_Request_decr_pending(r_)                                  \
    do { --(r_)->dev.recv_pending_count; } while (0)
379 
/* MPIDI_Request_fetch_and_clear_rts_sreq() - atomically fetch current
   partner RTS sreq and nullify partner request */
/* The macro itself is a plain read followed by a plain write: the partner
   request is stored in *partner_out_ and the field is then set to NULL. */
#define MPIDI_Request_fetch_and_clear_rts_sreq(r_, partner_out_)        \
    {                                                                   \
        *(partner_out_) = (r_)->dev.partner_request;                    \
        (r_)->dev.partner_request = NULL;                               \
    }
387 
/* FIXME: We've moved to allow finer-grain critical sections... */
/* Note: In the current implementation, the mpid_xsend.c routines that
   make use of MPIDI_VC_FAI_send_seqnum are all protected by the
   SINGLE_CS_ENTER/EXIT macros, so all uses of this macro are
   already within a critical section when needed.  If/when we move to
   a finer-grain model, we'll need to examine whether this requires
   a separate lock. */
#if !defined(MPID_USE_SEQUENCE_NUMBERS)
/* Sequence numbers are not in use: every helper expands to nothing */
#   define MPIDI_Request_set_seqnum(req_, seqnum_)
#   define MPIDI_VC_FAI_send_seqnum(vc_, seqnum_out_)
#   define MPIDI_Pkt_set_seqnum(pkt_, seqnum_)
#   define MPIDI_VC_Init_seqnum_send(vc_)
#else
/* Record seqnum_ in the request */
#   define MPIDI_Request_set_seqnum(req_, seqnum_)      \
    {                                                   \
        (req_)->dev.seqnum = (seqnum_);                 \
    }
/* Fetch-and-increment: hand out the VC's send sequence number, then bump it */
#   define MPIDI_VC_FAI_send_seqnum(vc_, seqnum_out_)   \
    {                                                   \
        (seqnum_out_) = (vc_)->seqnum_send++;           \
    }
/* Record seqnum_ in the packet */
#   define MPIDI_Pkt_set_seqnum(pkt_, seqnum_)          \
    {                                                   \
        (pkt_)->seqnum = (seqnum_);                     \
    }
/* Start the VC's send sequence numbers at zero */
#   define MPIDI_VC_Init_seqnum_send(vc_)               \
    {                                                   \
        (vc_)->seqnum_send = 0;                         \
    }
#endif
418 
419 
420 /*-------------------
421   END REQUEST SECTION
422   -------------------*/
423 
424 
425 /*------------------
426   BEGIN COMM SECTION
427   ------------------*/
/* Store in *vcp_ the VC found at index rank_ of the communicator's VC
   reference table.  The expansion is a single parenthesized assignment
   expression, so it can be used as a statement or as an operand. */
#define MPIDI_Comm_get_vc(comm_, rank_, vcp_) \
    (*(vcp_) = (comm_)->dev.vcrt->vcr_table[(rank_)])
429 
/* VC debug printing helpers: real functions when USE_MPIDI_DBG_PRINT_VC is
   defined, otherwise macros that expand to nothing. */
#ifndef USE_MPIDI_DBG_PRINT_VC
#define MPIDI_DBG_PrintVC(vc)
#define MPIDI_DBG_PrintVCState2(vc, new_state)
#define MPIDI_DBG_PrintVCState(vc)
#else
void MPIDI_DBG_PrintVC(MPIDI_VC_t * vc);
void MPIDI_DBG_PrintVCState2(MPIDI_VC_t * vc, MPIDI_VC_State_t new_state);
void MPIDI_DBG_PrintVCState(MPIDI_VC_t * vc);
#endif
439 
/* Like MPIDI_Comm_get_vc, and additionally moves the VC to the ACTIVE
   state when it is found INACTIVE.  VCs in any other state are returned
   unchanged. */
#define MPIDI_Comm_get_vc_set_active(comm_, rank_, vcp_)                \
    do {                                                                \
        *(vcp_) = (comm_)->dev.vcrt->vcr_table[(rank_)];                \
        if (MPIDI_VC_STATE_INACTIVE == (*(vcp_))->state) {              \
            MPIDI_DBG_PrintVCState2(*(vcp_), MPIDI_VC_STATE_ACTIVE);    \
            MPIDI_CHANGE_VC_STATE((*(vcp_)), ACTIVE);                   \
        }                                                               \
    } while (0)
448 
449 /*----------------
450   END COMM SECTION
451   ----------------*/
452 
453 
454 /*--------------------
455   BEGIN PACKET SECTION
456   --------------------*/
/* Initialize a packet header by setting its type.  With
   MPICH_DEBUG_MEMINIT the whole MPIDI_CH3_Pkt_t is first filled with the
   byte 0xfc. */
#ifdef MPICH_DEBUG_MEMINIT
#   define MPIDI_Pkt_init(pkt_, type_)                                  \
    {                                                                   \
        memset((void *) (pkt_), 0xfc, sizeof(MPIDI_CH3_Pkt_t));         \
        (pkt_)->type = (type_);                                         \
    }
#else
#   define MPIDI_Pkt_init(pkt_, type_)                                  \
    {                                                                   \
        (pkt_)->type = (type_);                                         \
    }
#endif
469 
470 /*------------------
471   END PACKET SECTION
472   ------------------*/
473 
474 
475 /*---------------------------
476   BEGIN PROCESS GROUP SECTION
477   ---------------------------*/
478 /* FIXME: Determine which of these functions should be exported to all of
479    the MPICH routines and which are internal to the device implementation */
480 typedef int (*MPIDI_PG_Compare_ids_fn_t)(void * id1, void * id2);
481 typedef int (*MPIDI_PG_Destroy_fn_t)(MPIDI_PG_t * pg);
482 
483 int MPIDI_VCRT_Create(int size, struct MPIDI_VCRT **vcrt_ptr);
484 int MPIDI_VCRT_Add_ref(struct MPIDI_VCRT *vcrt);
485 int MPIDI_VCRT_Release(struct MPIDI_VCRT *vcrt, int isDisconnect);
486 int MPIDI_VCR_Dup(MPIDI_VCR orig_vcr, MPIDI_VCR * new_vcr);
487 
488 int MPIDI_PG_Init(MPIDI_PG_Compare_ids_fn_t, MPIDI_PG_Destroy_fn_t);
489 int MPIDI_PG_Finalize(void);
490 int MPIDI_PG_Create(int vct_sz, void * pg_id, MPIDI_PG_t ** ppg);
491 int MPIDI_PG_Destroy(MPIDI_PG_t * pg);
492 int MPIDI_PG_Find(void * id, MPIDI_PG_t ** pgp);
493 int MPIDI_PG_Id_compare(void *id1, void *id2);
494 void MPIDI_PG_set_verbose(int level);
495 
496 /* Always use the MPIDI_PG_iterator type, never its expansion.  Otherwise it
497    will be difficult to make any changes later. */
498 typedef MPIDI_PG_t * MPIDI_PG_iterator;
499 /* 'iter' is similar to 'saveptr' in strtok_r */
500 int MPIDI_PG_Get_iterator(MPIDI_PG_iterator *iter);
501 int MPIDI_PG_Has_next(MPIDI_PG_iterator *iter);
502 int MPIDI_PG_Get_next(MPIDI_PG_iterator *iter, MPIDI_PG_t **pgp);
503 
504 int MPIDI_PG_Close_VCs( void );
505 
506 int MPIDI_PG_InitConnKVS( MPIDI_PG_t * );
507 int MPIDI_PG_GetConnKVSname( char ** );
508 int MPIDI_PG_InitConnString( MPIDI_PG_t * );
509 int MPIDI_PG_GetConnString( MPIDI_PG_t *, int, char *, int );
510 int MPIDI_PG_Dup_vcr( MPIDI_PG_t *, int, struct MPIDI_VC ** );
511 int MPIDI_PG_Get_size(MPIDI_PG_t * pg);
512 void MPIDI_PG_IdToNum( MPIDI_PG_t *, int * );
513 int MPIU_PG_Printall( FILE * );
514 int MPIDI_PG_CheckForSingleton( void );
515 
516 /* CH3_PG_Init allows the channel to pre-initialize the process group */
517 int MPIDI_CH3_PG_Init( MPIDI_PG_t * );
518 
/* Reference counting for process groups; both forward to the generic
   MPIR_Object reference-count routines. */
#define MPIDI_PG_add_ref(pg_)                                           \
    do { MPIR_Object_add_ref(pg_); } while (0)

#define MPIDI_PG_release_ref(pg_, inuse_)                               \
    do { MPIR_Object_release_ref(pg_, inuse_); } while (0)
527 
/* Store in *vcp_ the address of entry rank_ of the process group's VC
   table.  The expansion is a single parenthesized assignment expression,
   so it can be used as a statement or as an operand. */
#define MPIDI_PG_Get_vc(pg_, rank_, vcp_) \
    (*(vcp_) = &(pg_)->vct[(rank_)])
529 
/* Like MPIDI_PG_Get_vc, and additionally moves the VC to the ACTIVE state
   when it is found INACTIVE.  VCs in any other state are returned
   unchanged. */
#define MPIDI_PG_Get_vc_set_active(pg_, rank_, vcp_)                    \
    do {                                                                \
        *(vcp_) = &(pg_)->vct[rank_];                                   \
        if (MPIDI_VC_STATE_INACTIVE == (*(vcp_))->state) {              \
            MPIDI_DBG_PrintVCState2(*(vcp_), MPIDI_VC_STATE_ACTIVE);    \
            MPIDI_CHANGE_VC_STATE((*(vcp_)), ACTIVE);                   \
        }                                                               \
    } while (0)
538 
/* Macro form of the size query: reads the process group's size field
   directly.  From here on it takes the place of calls written as
   MPIDI_PG_Get_size(...). */
#define MPIDI_PG_Get_size(group_) ((group_)->size)
540 
/* String (de)serialization of process groups; declared only when
   MPIDI_DEV_IMPLEMENTS_KVS is defined. */
#if defined(MPIDI_DEV_IMPLEMENTS_KVS)
int MPIDI_PG_To_string(MPIDI_PG_t *pg_ptr, char **str_ptr, int *);
int MPIDI_PG_Create_from_string(const char *str, MPIDI_PG_t **pg_pptr, int *flag);
#endif
546 /*-------------------------
547   END PROCESS GROUP SECTION
548   -------------------------*/
549 
550 
551 /*--------------------------------
552   BEGIN VIRTUAL CONNECTION SECTION
553   --------------------------------*/
/*E
  MPIDI_VC_State - States for a virtual connection.

  Notes:
  A closed connection is placed into 'STATE_INACTIVE'. (is this true?)
 E*/
typedef enum MPIDI_VC_State {
    /* Comm either hasn't started or has completed. */
    MPIDI_VC_STATE_INACTIVE = 1,
    /* Comm has started and hasn't completed */
    MPIDI_VC_STATE_ACTIVE = 2,
    /* Local side has initiated close protocol */
    MPIDI_VC_STATE_LOCAL_CLOSE = 3,
    /* Remote side has initiated close protocol */
    MPIDI_VC_STATE_REMOTE_CLOSE = 4,
    /* Both have initiated close, we have acknowledged remote side */
    MPIDI_VC_STATE_CLOSE_ACKED = 5,
    /* Both have initiated close, both have acked */
    MPIDI_VC_STATE_CLOSED = 6,
    /* INACTIVE VCs are moved to this state in Finalize */
    MPIDI_VC_STATE_INACTIVE_CLOSED = 7,
    /* Abnormally terminated, there may be unsent/unreceived msgs */
    MPIDI_VC_STATE_MORIBUND = 8
} MPIDI_VC_State_t;
571 
struct MPIR_Comm;

#if defined(ENABLE_COMM_OVERRIDES)
/* Table of per-VC overrides for communication calls, intended for
   matching-capable interfaces.  Every entry receives the VC as its first
   argument. */
typedef struct MPIDI_Comm_ops {
    /* Overriding calls in case of matching-capable interfaces */
    int (*recv_posted) (struct MPIDI_VC *vc, struct MPIR_Request *req);

    /* Blocking and nonblocking sends */
    int (*send) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                 MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                 int context_offset, struct MPIR_Request **request);
    int (*rsend) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                  MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                  int context_offset, struct MPIR_Request **request);
    int (*ssend) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                  MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                  int context_offset, struct MPIR_Request **request);
    int (*isend) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                  MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                  int context_offset, struct MPIR_Request **request);
    int (*irsend) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                   MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                   int context_offset, struct MPIR_Request **request);
    int (*issend) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                   MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                   int context_offset, struct MPIR_Request **request);

    /* Persistent send setup and start */
    int (*send_init) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                      MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                      int context_offset, struct MPIR_Request **request);
    /* NOTE(review): count is int here but MPI_Aint in the sibling entries;
       confirm whether that is intentional. */
    int (*bsend_init) (struct MPIDI_VC *vc, const void *buf, int count,
                       MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                       int context_offset, struct MPIR_Request **request);
    int (*rsend_init) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                       MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                       int context_offset, struct MPIR_Request **request);
    int (*ssend_init) (struct MPIDI_VC *vc, const void *buf, MPI_Aint count,
                       MPI_Datatype datatype, int dest, int tag, MPIR_Comm *comm,
                       int context_offset, struct MPIR_Request **request);
    int (*startall) (struct MPIDI_VC *vc, int count, struct MPIR_Request *requests[]);

    /* Cancellation */
    int (*cancel_send) (struct MPIDI_VC *vc, struct MPIR_Request *sreq);
    int (*cancel_recv) (struct MPIDI_VC *vc, struct MPIR_Request *rreq);

    /* Probing and matched receive */
    int (*probe) (struct MPIDI_VC *vc, int source, int tag, MPIR_Comm *comm,
                  int context_offset, MPI_Status *status);
    int (*iprobe) (struct MPIDI_VC *vc, int source, int tag, MPIR_Comm *comm,
                   int context_offset, int *flag, MPI_Status *status);
    int (*improbe) (struct MPIDI_VC *vc, int source, int tag, MPIR_Comm *comm,
                    int context_offset, int *flag, MPIR_Request **message,
                    MPI_Status *status);
    int (*imrecv) (struct MPIDI_VC *vc, struct MPIR_Request *req);
} MPIDI_Comm_ops_t;

/* Hooks for any-source iprobe/improbe; they take no VC argument */
extern int (*MPIDI_Anysource_iprobe_fn) (int tag, MPIR_Comm *comm,
                                         int context_offset, int *flag,
                                         MPI_Status *status);
extern int (*MPIDI_Anysource_improbe_fn) (int tag, MPIR_Comm *comm,
                                          int context_offset, int *flag,
                                          MPIR_Request **message,
                                          MPI_Status *status);
#endif
631 
632 typedef struct MPIDI_VC
633 {
634     /* XXX - need better comment */
635     /* MPIU_Object fields.  MPIDI_VC_t objects are not allocated using the
636        MPIU_Object system, but we do use the associated
637        reference counting routines.  The handle value is required
638        when debugging objects (the handle kind is used in reporting
639        on changes to the object).
640     */
641     MPIR_OBJECT_HEADER; /* adds handle and ref_count fields */
642 
643     /* state of the VC */
644     MPIDI_VC_State_t state;
645 
646     /* Process group to which this VC belongs */
647     struct MPIDI_PG * pg;
648 
649     /* Rank of the process in that process group associated with this VC */
650     int pg_rank;
651 
652     /* Local process ID */
653     int lpid;
654 
655     /* The node id of this process, used for topologically aware collectives. */
656     int node_id;
657 
658     /* port name tag */
659     int port_name_tag; /* added to handle dynamic process mgmt */
660 
661 #ifndef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
662     void *connreq_obj;  /* pointer to dynamic connection mgmt object */
663 #endif
664 
665 #if defined(MPID_USE_SEQUENCE_NUMBERS)
666     /* Sequence number of the next packet to be sent */
667     MPID_Seqnum_t seqnum_send;
668 #endif
669 
670 #if defined(MPIDI_CH3_MSGS_UNORDERED)
671     /* Sequence number of the next packet we expect to receive */
672     MPID_Seqnum_t seqnum_recv;
673 
674     /* Queue for holding packets received out of order.  NOTE: the CH3 device
675        only orders packets.  Handling of out-of-order data
676        is the responsibility of the channel. */
677     MPIDI_CH3_Pkt_send_container_t * msg_reorder_queue;
678 #endif
679 
680     /* rendezvous function pointers.  Called to send a rendevous
681        message or when one is matched */
682     int (* rndvSend_fn)( struct MPIR_Request **sreq_p, const void * buf, MPI_Aint count,
683                          MPI_Datatype datatype, int dt_contig, intptr_t data_sz,
684                          MPI_Aint dt_true_lb, int rank, int tag,
685                          struct MPIR_Comm * comm, int context_offset );
686     int (* rndvRecv_fn)( struct MPIDI_VC * vc, struct MPIR_Request *rreq );
687 
688     /* eager message threshold */
689     int eager_max_msg_sz;
690     /* eager message threshold for ready sends.  -1 means there's no limit */
691     int ready_eager_max_msg_sz;
692 
693     /* noncontiguous send function pointer.  Called to send a
694        noncontiguous message.  Caller must initialize
695        sreq->dev.segment, _first and _size.  Contiguous messages are
696        called directly from CH3 and cannot be overridden.
697        The optional hdr_iov and n_hdr_iov input parameters are used for
698        variable-length extended header, specify NULL and zero if unused.
699        n_hdr_iov should not exceed MPL_IOV_LIMIT - 2 (one for header and one
700        for packed data).*/
701     int (* sendNoncontig_fn)( struct MPIDI_VC *vc, struct MPIR_Request *sreq,
702 			      void *header, intptr_t hdr_sz, struct iovec *hdr_iov, int n_hdr_iov);
703 
704 #ifdef ENABLE_COMM_OVERRIDES
705     MPIDI_Comm_ops_t *comm_ops;
706 #endif
707 
708 # if defined(MPIDI_CH3_VC_DECL)
709     MPIDI_CH3_VC_DECL
710 # endif
711 }
712 MPIDI_VC_t;
713 
/* Events reported for a virtual connection; TERMINATED is the only one
   defined. */
typedef enum MPIDI_VC_Event {
    MPIDI_VC_EVENT_TERMINATED
} MPIDI_VC_Event_t;
719 
720 /*S
721  * MPIDI_VCRT - virtual connection reference table
722  *
723  * handle - this element is not used, but exists so that we may use the
724  * MPIU_Object routines for reference counting
725  *
726  * ref_count - number of references to this table
727  *
728  * vcr_table - array of virtual connection references
729  S*/
730 typedef struct MPIDI_VCRT
731 {
732     MPIR_OBJECT_HEADER; /* adds handle and ref_count fields */
733     int size;
734     MPIDI_VC_t * vcr_table[1];
735 }
736 MPIDI_VCRT_t;
737 
738 /* number of VCs that are in MORIBUND state */
739 extern int MPIDI_Failed_vc_count;
740 
741 /* Initialize a new VC */
742 int MPIDI_VC_Init( MPIDI_VC_t *, MPIDI_PG_t *, int );
743 
/* Reset the receive-side ordering state of a VC: the expected sequence
   number goes back to zero and the reorder queue is emptied.  Expands to
   nothing unless MPIDI_CH3_MSGS_UNORDERED is defined.

   The macro name must not be followed by ';' in the #define: that
   semicolon would become part of the replacement text and break uses such
   as "if (c) MPIDI_VC_Init_seqnum_recv(vc); else ...". */
#if defined(MPIDI_CH3_MSGS_UNORDERED)
#   define MPIDI_VC_Init_seqnum_recv(vc_)	\
    {						\
    	(vc_)->seqnum_recv = 0;			\
    	(vc_)->msg_reorder_queue = NULL;	\
    }
#else
#   define MPIDI_VC_Init_seqnum_recv(vc_)
#endif
753 
754 
/* VC reference counting: thin statement wrappers that forward to the
 * MPIR_Object reference-count macros.  For release, inuse_ is passed
 * straight through to MPIR_Object_release_ref. */
#define MPIDI_VC_add_ref(vc_)                   \
    do {                                        \
        MPIR_Object_add_ref(vc_);               \
    } while (0)

#define MPIDI_VC_release_ref(vc_, inuse_)               \
    do {                                                \
        MPIR_Object_release_ref(vc_, inuse_);           \
    } while (0)
760 
761 /*------------------------------
762   END VIRTUAL CONNECTION SECTION
763   ------------------------------*/
764 
765 
766 /*---------------------------------
767   BEGIN SEND/RECEIVE BUFFER SECTION
768   ---------------------------------*/
/* Byte offset of field_ within struct_, as an MPI_Aint.  Only defined here
 * if not already defined.
 * NOTE(review): this is the null-pointer form of offsetof(); the standard
 * offsetof() from <stddef.h> yields the same value without forming a member
 * access through a null pointer. */
#if !defined(MPIDI_CH3U_Offsetof)
#    define MPIDI_CH3U_Offsetof(struct_, field_) ((MPI_Aint) &((struct_*)0)->field_)
#endif

/* Size in bytes of the buf array in one send/receive buffer element
 * (256 * 1024 unless already defined). */
#if !defined(MPIDI_CH3U_SRBuf_size)
#    define MPIDI_CH3U_SRBuf_size (256 * 1024)
#endif
776 
/* One send/receive buffer element: a fixed-size byte buffer followed by a
 * link to the next element of a singly linked list. */
typedef struct __MPIDI_CH3U_SRBuf_element {
    /* Keep the buffer at the top to help keep the memory alignment */
    char   buf[MPIDI_CH3U_SRBuf_size];
    struct __MPIDI_CH3U_SRBuf_element * next;
} MPIDI_CH3U_SRBuf_element_t;

/* Head of the list of buffer elements available for reuse:
 * MPIDI_CH3U_SRBuf_get pops from it and MPIDI_CH3U_SRBuf_free pushes onto
 * it. */
extern MPIDI_CH3U_SRBuf_element_t * MPIDI_CH3U_SRBuf_pool;
784 
#if !defined (MPIDI_CH3U_SRBuf_get)
/* MPIDI_CH3U_SRBuf_get(req_) - take one buffer element from the pool and
 * store the address of its buf array in (req_)->dev.tmpbuf.
 *
 * If the pool is empty, a single new element is allocated with MPL_malloc.
 * If that allocation fails, (req_)->dev.tmpbuf is set to NULL and the pool
 * is left empty; the previous version dereferenced the NULL result.
 * MPIDI_CH3U_SRBuf_alloc already tests tmpbuf against NULL, so it now sees
 * the failure it was written to handle.
 *
 * The expansion is a brace block, as before; follow the call with ';'. */
#   define MPIDI_CH3U_SRBuf_get(req_)                                   \
    {                                                                   \
        MPIDI_CH3U_SRBuf_element_t * srbuf_elem_;                       \
        if (!MPIDI_CH3U_SRBuf_pool) {                                   \
            MPIDI_CH3U_SRBuf_pool =                                     \
                MPL_malloc(sizeof(MPIDI_CH3U_SRBuf_element_t), MPL_MEM_BUFFER); \
            if (MPIDI_CH3U_SRBuf_pool) {                                \
                MPIDI_CH3U_SRBuf_pool->next = NULL;                     \
            }                                                           \
        }                                                               \
        srbuf_elem_ = MPIDI_CH3U_SRBuf_pool;                            \
        if (srbuf_elem_) {                                              \
            MPIDI_CH3U_SRBuf_pool = srbuf_elem_->next;                  \
            srbuf_elem_->next = NULL;                                   \
            (req_)->dev.tmpbuf = srbuf_elem_->buf;                      \
        } else {                                                        \
            (req_)->dev.tmpbuf = NULL;                                  \
        }                                                               \
    }
#endif
800 
#if !defined (MPIDI_CH3U_SRBuf_free)
/* MPIDI_CH3U_SRBuf_free(req_) - hand the request's send/receive buffer back
 * to the pool.
 *
 * Asserts that the request's srbuf flag is set, clears it, recovers the
 * enclosing buffer element from (req_)->dev.tmpbuf by subtracting the offset
 * of the buf member, and pushes that element onto the front of
 * MPIDI_CH3U_SRBuf_pool.  (req_)->dev.tmpbuf itself is left unchanged. */
#   define MPIDI_CH3U_SRBuf_free(req_)                                  \
    {                                                                   \
        MPIDI_CH3U_SRBuf_element_t * elem_;                             \
        MPI_Aint buf_addr_;                                             \
        MPI_Aint buf_off_;                                              \
        MPIR_Assert(MPIDI_Request_get_srbuf_flag(req_));                \
        MPIDI_Request_set_srbuf_flag((req_), FALSE);                    \
        buf_addr_ = (MPI_Aint) ((req_)->dev.tmpbuf);                    \
        buf_off_ = (MPI_Aint) MPIDI_CH3U_Offsetof(MPIDI_CH3U_SRBuf_element_t, buf); \
        elem_ = (MPIDI_CH3U_SRBuf_element_t *) (buf_addr_ - buf_off_);  \
        elem_->next = MPIDI_CH3U_SRBuf_pool;                            \
        MPIDI_CH3U_SRBuf_pool = elem_;                                  \
    }
#endif
813 
#if !defined(MPIDI_CH3U_SRBuf_alloc)
/* MPIDI_CH3U_SRBuf_alloc(req_, size_) - attach a send/receive buffer to a
 * request.
 *
 * Obtains a buffer through MPIDI_CH3U_SRBuf_get.  If (req_)->dev.tmpbuf is
 * then non-NULL, tmpbuf_sz is set to MPIDI_CH3U_SRBuf_size and the request's
 * srbuf flag is set to TRUE; otherwise tmpbuf_sz is set to 0.  size_ is not
 * used by this definition. */
#   define MPIDI_CH3U_SRBuf_alloc(req_, size_)                          \
    {                                                                   \
        MPIDI_CH3U_SRBuf_get(req_);                                     \
        if ((req_)->dev.tmpbuf == NULL) {                               \
            (req_)->dev.tmpbuf_sz = 0;                                  \
        } else {                                                        \
            (req_)->dev.tmpbuf_sz = MPIDI_CH3U_SRBuf_size;              \
            MPIDI_Request_set_srbuf_flag((req_), TRUE);                 \
        }                                                               \
    }
#endif
829 /*-------------------------------
830   END SEND/RECEIVE BUFFER SECTION
831   -------------------------------*/
832 
/* Define the ACC stream size as the SRBuf size, unless it has already been
   defined. */
#if !defined(MPIDI_CH3U_Acc_stream_size)
#define MPIDI_CH3U_Acc_stream_size MPIDI_CH3U_SRBuf_size
#endif
837 
838 /*----------------------------
839   BEGIN DEBUGGING TOOL SECTION
840   ----------------------------*/
841 
/* If there is no support for dynamic processes, there will be no
   channel-specific connection state */
#ifdef MPL_USE_DBG_LOGGING

/* Without dynamic-process support the channel state string is the fixed
   literal "none"; otherwise it is obtained from the functions declared
   below. */
#ifdef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
#define MPIDI_CH3_VC_GetStateString( _c ) "none"
#else
const char *MPIDI_CH3_VC_GetStateString(struct MPIDI_VC *);
const char *MPIDI_CH3_VC_SockGetStateString(struct MPIDI_VC *);
#endif

/* These two routines are in mpidi_pg.c and are used to print the
   connection string (which is attached to a process group) */
int MPIDI_PrintConnStr( const char *file, int line,
			const char *label, const char *str );
int MPIDI_PrintConnStrToFile( FILE *fd, const char *file, int line,
			      const char *label, const char *str );

/* Defined and used in sock channel. */
const char * MPIDI_Conn_GetStateString(int state);
#endif /* MPL_USE_DBG_LOGGING */
863 
/* These macros simplify and unify the debugging of changes in the
   connection state

   MPL_DBG_VCSTATECHANGE(vc,newstate) - use when changing the state
   of a VC

   MPL_DBG_VCCHSTATECHANGE(vc,newstate) - use when changing the state
   of the channel-specific part of the vc (e.g., vc->ch.state)

   MPL_DBG_CONNSTATECHANGE(vc,conn,newstate) - use when changing the
   state of a conn.  vc may be null

   MPL_DBG_CONNSTATECHANGE_MSG(vc,conn,newstate,msg) - use when changing the
   state of a conn.  vc may be null.  Like CONNSTATECHANGE, but allows
   an additional message

   MPL_DBG_VCUSE(vc,msg) - print a note that the vc is being used for the
   purpose described by msg

   MPL_DBG_PKT(conn,pkt,msg) - print out a short description of a
   packet being sent/received on the designated connection, prefixed with
   msg.

*/
/* Debug-log helpers for VC and connection state changes.  Each one forwards
 * a format string and its arguments to MPL_DBG_MSG_FMT; _newstate is
 * stringified, so pass the bare state name.
 *
 * Every macro argument is parenthesized where it is used.  In particular the
 * "vc may be null" test in the CONNSTATECHANGE macros is written as
 * (_vc) ? ... : ..., so an argument that is itself a conditional expression
 * is tested as a whole instead of being re-associated with the trailing
 * ?: operator. */
#define MPL_DBG_VCSTATECHANGE(_vc,_newstate) do { \
     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST, \
     "vc=%p: Setting state (vc) from %s to %s, vcchstate is %s", \
                 (_vc), MPIDI_VC_GetStateString((_vc)->state), \
                 #_newstate, MPIDI_CH3_VC_GetStateString( (_vc) ))); \
} while (0)

#define MPL_DBG_VCCHSTATECHANGE(_vc,_newstate) \
     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST, \
     "vc=%p: Setting state (ch) from %s to %s, vc state is %s", \
           (_vc), MPIDI_CH3_VC_GetStateString((_vc)), \
           #_newstate, MPIDI_VC_GetStateString( (_vc)->state )) )

#define MPL_DBG_CONNSTATECHANGE(_vc,_conn,_newstate) \
     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST, \
     "vc=%p,conn=%p: Setting state (conn) from %s to %s, vcstate = %s", \
             (_vc), (_conn), \
             MPIDI_Conn_GetStateString((_conn)->state), #_newstate, \
             (_vc) ? MPIDI_VC_GetStateString((_vc)->state) : "<no vc>" ))

#define MPL_DBG_CONNSTATECHANGE_MSG(_vc,_conn,_newstate,_msg) \
     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST, \
     "vc=%p,conn=%p: Setting conn state from %s to %s, vcstate = %s %s", \
             (_vc), (_conn), \
             MPIDI_Conn_GetStateString((_conn)->state), #_newstate, \
             (_vc) ? MPIDI_VC_GetStateString((_vc)->state) : "<no vc>", (_msg) ))
#define MPL_DBG_VCUSE(_vc,_msg) \
     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_CONNECT,TYPICAL,(MPL_DBG_FDEST,\
      "vc=%p: Using vc for %s", (_vc), (_msg) ))
#define MPL_DBG_PKT(_conn,_pkt,_msg) \
     MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_OTHER,TYPICAL,(MPL_DBG_FDEST,\
     "conn=%p: %s %s", (_conn), (_msg), MPIDI_Pkt_GetDescString( (_pkt) ) ))
917 
/* Returns the description string that MPL_DBG_PKT prints for a packet. */
const char *MPIDI_Pkt_GetDescString( MPIDI_CH3_Pkt_t *pkt );

/* These macros help trace communication headers */
/* _kind is printed with %s, _vc with %p, _tag/_contextid/_dest with %d, and
   _size with PRIdPTR (so it should have type intptr_t). */
#define MPL_DBG_MSGPKT(_vc,_tag,_contextid,_dest,_size,_kind)	\
    MPL_DBG_MSG_FMT(MPIDI_CH3_DBG_MSG,TYPICAL,(MPL_DBG_FDEST,\
		      "%s: vc=%p, tag=%d, context=%d, dest=%d, datasz=%" PRIdPTR,\
		      _kind,_vc,_tag,_contextid,_dest,_size) )
925 
/* FIXME: Switch this to use the common debug code */
/* Error-printing routine taking a printf-style trailing argument list.
 * NOTE(review): judging from the macro version below, the first argument is
 * a function name and the second a format string -- confirm. */
void MPIDI_err_printf(char *, char *, ...);

/* FIXME: This does not belong here */
#ifdef USE_MPIDI_DBG_PRINT_VC
extern char *MPIDI_DBG_parent_str;
#endif

/* Takes the complete argument list as one parenthesized argument, i.e.
 * MPIDI_ERR_PRINTF((func, fmt, ...)). */
#define MPIDI_ERR_PRINTF(e) MPIDI_err_printf e

/* When variadic macros are available, MPIDI_err_printf becomes a macro that
 * passes "[rank] ERROR - func(): " plus the formatted message and a newline
 * to MPL_error_printf, where rank is MPIR_Process.comm_world->rank, and then
 * flushes stdout.
 * NOTE(review): __VA_ARGS__ follows a comma, so standard C requires at least
 * one argument after fmt.  The expansion is a bare brace block rather than
 * do { } while (0), so it cannot be the body of an unbraced if that has an
 * else. */
#if defined(HAVE_MACRO_VA_ARGS)
#   define MPIDI_err_printf(func, fmt, ...)				\
    {									\
        MPL_error_printf("[%d] ERROR - %s(): " fmt "\n", MPIR_Process.comm_world->rank, func, __VA_ARGS__);    \
        fflush(stdout);							\
    }
#endif
943 
/* Packet printing is a real function only when MPICH_DBG_OUTPUT is defined;
   otherwise the call expands to nothing and its argument is not evaluated. */
#ifdef MPICH_DBG_OUTPUT
    void MPIDI_DBG_Print_packet(MPIDI_CH3_Pkt_t *pkt);
#else
#   define MPIDI_DBG_Print_packet(a)
#endif

/* Given a state, return the string for this state (VC's and connections) */
const char * MPIDI_VC_GetStateString(int);
952 /*--------------------------
953   END DEBUGGING TOOL SECTION
954   --------------------------*/
955 
956 
/* Prototypes for internal device routines */
/* NOTE(review): the parameters are unnamed here; see the definition of
   MPIDI_Isend_self for their meaning. */
int MPIDI_Isend_self(const void *, MPI_Aint, MPI_Datatype, int, int, MPIR_Comm *,
		     int, int, MPIR_Request **);

/*--------------------------
  BEGIN MPI PORT SECTION
  --------------------------*/
/* These are the default functions */
/* Their parameter types match the CommConnect and CommAccept members of
   MPIDI_PortFns. */
int MPIDI_Comm_connect(const char *, MPIR_Info *, int, MPIR_Comm *, MPIR_Comm **);
int MPIDI_Comm_accept(const char *, MPIR_Info *, int, MPIR_Comm *, MPIR_Comm **);

int MPIDI_Comm_spawn_multiple(int, char **, char ***, const int *, MPIR_Info **,
			      int, MPIR_Comm *, MPIR_Comm **, int *);
970 
971 
/* This structure defines a module that handles the routines that
   work with MPI port names.  It is a table of four function pointers, each
   returning int; the table is passed to MPIDI_CH3_PortFnsInit. */
typedef struct MPIDI_Port_Ops {
    int (*OpenPort)( MPIR_Info *, char * );
    int (*ClosePort)( const char * );
    int (*CommAccept)( const char *, MPIR_Info *, int, MPIR_Comm *,
		       MPIR_Comm ** );
    int (*CommConnect)( const char *, MPIR_Info *, int, MPIR_Comm *,
			MPIR_Comm ** );
} MPIDI_PortFns;
/* Version number of the MPIDI_PortFns table. */
#define MPIDI_PORTFNS_VERSION 1
/* NOTE(review): presumably lets the channel fill in or override entries of
   the table -- confirm against the channel implementations. */
int MPIDI_CH3_PortFnsInit( MPIDI_PortFns * );
984 
#ifndef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
/* Utility routines provided in src/ch3u_port.c for working with connection
   queues */
int MPIDI_CH3I_Acceptq_enqueue(MPIDI_VC_t * vc, int port_name_tag);
int MPIDI_Port_finalize(void);

int MPIDI_CH3I_Port_init(int port_name_tag);
int MPIDI_CH3I_Port_destroy(int port_name_tag);
#else
/* Need empty symbols to avoid failure at compile time if defined
 * MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS. */
/* Each replacement ignores its arguments (they are not evaluated) and
 * evaluates to MPI_SUCCESS. */
#define MPIDI_CH3I_Acceptq_enqueue(vc, port_name_tag) (MPI_SUCCESS)
#define MPIDI_Port_finalize() (MPI_SUCCESS)

#define MPIDI_CH3I_Port_init(port_name_tag) (MPI_SUCCESS)
#define MPIDI_CH3I_Port_destroy(port_name_tag) (MPI_SUCCESS)
#endif
1002 /*--------------------------
1003   END MPI PORT SECTION
1004   --------------------------*/
1005 
/* NOTE(review): presumably the maximum length of a KVS value string,
   judging by the name -- confirm where it is used. */
#define MPIDI_MAX_KVS_VALUE_LEN    4096

/* ------------------------------------------------------------------------- */
/* mpirma.h (in src/mpi/rma?) */
/* ------------------------------------------------------------------------- */

/* Set-up and tear-down of the RMA module. */
int MPIDI_RMA_init(void);
void MPIDI_RMA_finalize(void);
1014 
/* The Win_fns table contains pointers to the channel's implementation of the
 * RMA window creation routines.  The channel must provide the init function,
 * which can optionally override any defaults already set by CH3.
 *
 * Every entry returns int.  The create, allocate, create_dynamic,
 * gather_info and shared_query entries have the same parameter types as the
 * MPIDI_CH3U_Win_* default routines declared later in this file.
 */

typedef struct {
    int (*create)(void *, MPI_Aint, int, MPIR_Info *, MPIR_Comm *, MPIR_Win **);
    int (*allocate)(MPI_Aint, int, MPIR_Info *, MPIR_Comm *, void *, MPIR_Win **);
    int (*allocate_shared)(MPI_Aint, int, MPIR_Info *, MPIR_Comm *, void *, MPIR_Win **);
    int (*allocate_shm)(MPI_Aint, int, MPIR_Info *, MPIR_Comm *, void *, MPIR_Win **);
    int (*create_dynamic)(MPIR_Info *, MPIR_Comm *, MPIR_Win **);
    int (*detect_shm)(MPIR_Win **);
    int (*gather_info)(void *, MPI_Aint, int, MPIR_Info *, MPIR_Comm *, MPIR_Win **);
    int (*shared_query)(MPIR_Win *, int, MPI_Aint *, int *, void *);
} MPIDI_CH3U_Win_fns_t;

/* The single table instance, defined elsewhere. */
extern MPIDI_CH3U_Win_fns_t MPIDI_CH3U_Win_fns;
1032 
/* Table of channel window hooks, filled in by MPIDI_CH3_Win_hooks_init.
 * NOTE(review): presumably win_init and win_free are invoked when a window
 * is set up and freed, respectively -- confirm at the call sites. */
typedef struct {
    int (*win_init)(MPI_Aint, int, int, int, MPIR_Info *, MPIR_Comm *, MPIR_Win **);
    int (*win_free)(MPIR_Win **);
} MPIDI_CH3U_Win_hooks_t;

/* The single table instance, defined elsewhere. */
extern MPIDI_CH3U_Win_hooks_t MPIDI_CH3U_Win_hooks;
1039 
typedef struct MPIDI_CH3U_Win_pkt_ordering {

    /* Ordered AM flush.
     * Indicates whether an AM flush is guaranteed to finish after all
     * previous RMA operations.  It is initialized by Nemesis and used by CH3.
     * Note that we use a single global flag for all targets, including both
     * intra-node and inter-node processes. */
    int am_flush_ordered;
} MPIDI_CH3U_Win_pkt_ordering_t;

/* The global flag set, defined elsewhere. */
extern MPIDI_CH3U_Win_pkt_ordering_t MPIDI_CH3U_Win_pkt_orderings;
1051 
/* CH3 and Channel window functions initializers */
int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);
int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns);

/* Channel window hooks initializer */
int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t *win_hooks);

/* Channel initializer for the packet-ordering flags */
int MPIDI_CH3_Win_pkt_orderings_init(MPIDI_CH3U_Win_pkt_ordering_t * win_pkt_orderings);

/* Default window creation functions provided by CH3.  Their parameter types
   match the corresponding members of MPIDI_CH3U_Win_fns_t
   (MPIDI_CH3U_Win_allocate_no_shm has the same parameter types as the
   allocate member). */
int MPIDI_CH3U_Win_create(void *, MPI_Aint, int, MPIR_Info *, MPIR_Comm *,
                         MPIR_Win **);
int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPIR_Info *info,
                           MPIR_Comm *comm, void *baseptr, MPIR_Win **win);
int MPIDI_CH3U_Win_allocate_no_shm(MPI_Aint size, int disp_unit, MPIR_Info *info,
                                   MPIR_Comm *comm_ptr, void *baseptr, MPIR_Win **win_ptr);
int MPIDI_CH3U_Win_create_dynamic(MPIR_Info *info, MPIR_Comm *comm, MPIR_Win **win);
int MPIDI_CH3U_Win_shared_query(MPIR_Win * win_ptr, int target_rank, MPI_Aint * size,
                                int *disp_unit, void *baseptr);

/* MPI RMA Utility functions */

/* Same parameter types as the gather_info member of MPIDI_CH3U_Win_fns_t. */
int MPIDI_CH3U_Win_gather_info(void *, MPI_Aint, int, MPIR_Info *, MPIR_Comm *,
                                 MPIR_Win **);
1076 
1077 
/* RMA memory allocation.  A channel that defines MPIDI_CH3I_HAS_ALLOC_MEM
 * supplies its own MPIDI_CH3I_Alloc_mem; otherwise the call falls back to
 * MPL_malloc(size, MPL_MEM_USER) and info_ptr is ignored. */
#ifdef MPIDI_CH3I_HAS_ALLOC_MEM
void* MPIDI_CH3I_Alloc_mem(size_t size, MPIR_Info *info_ptr);
#else
#define MPIDI_CH3I_Alloc_mem(size, info_ptr)    MPL_malloc(size, MPL_MEM_USER)
#endif

/* RMA memory release.  A channel that defines MPIDI_CH3I_HAS_FREE_MEM
 * supplies its own MPIDI_CH3I_Free_mem; otherwise the call falls back to
 * MPL_free(ptr).
 *
 * The fallback expands to the bare call, with no trailing semicolon (the
 * previous definition included one, which broke use in an unbraced
 * if/else).
 * NOTE(review): the channel-provided function returns int, whereas the
 * fallback yields whatever MPL_free does, so callers should not rely on a
 * return value. */
#ifdef MPIDI_CH3I_HAS_FREE_MEM
int MPIDI_CH3I_Free_mem(void *ptr);
#else
#define MPIDI_CH3I_Free_mem(ptr)    MPL_free(ptr)
#endif
1090 
/* Pvars */
void MPIDI_CH3_RMA_Init_sync_pvars(void);
void MPIDI_CH3_RMA_Init_pkthandler_pvars(void);

/* internal */
int MPIDI_CH3I_Release_lock(MPIR_Win * win_ptr);
int MPIDI_CH3I_Try_acquire_win_lock(MPIR_Win * win_ptr, int requested_lock);

int MPIDI_CH3I_Progress_finalize(void);


/* Internal RMA operation routines.
 * Called by normal RMA operations and request-based RMA operations.
 * Each takes the window (win_ptr) and a request pointer (ureq) as its last
 * two parameters.
 * NOTE(review): presumably ureq is NULL for the non-request-based
 * operations -- confirm at the call sites. */
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPIR_Win * win_ptr,
                   MPIR_Request * ureq);
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPIR_Win * win_ptr,
                   MPIR_Request * ureq);
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPIR_Win * win_ptr, MPIR_Request * ureq);
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPIR_Win * win_ptr, MPIR_Request * ureq);
1121 
/*@
  MPIDI_CH3_Progress_signal_completion - Inform the progress engine that a
  pending request has completed.

  IMPLEMENTORS:
  In a single-threaded environment, this routine can be implemented by
  incrementing a request completion counter.  In a
  multi-threaded environment, the request completion counter must be atomically
  incremented, and any threads blocking in the
  progress engine must be woken up when a request is completed.

  Notes on the implementation:

  This code is designed to support one particular model of thread-safety.
  It is common to many of the channels and was moved into this file because
  the MPIDI_CH3_Progress_signal_completion reference is used by the
  function that implements MPID_Request_complete.
@*/

/*
 * MPIDI_CH3_Progress_signal_completion() is used to notify the progress
 * engine that a completion has occurred.  The multi-threaded version will need
 * to wake up any (and all) threads blocking in MPIDI_CH3_Progress().
 */
1146 
/* A channel may define its own MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT to
   replace the completion counter.  The default enters the POBJ completion
   critical section, adds one to MPIDI_CH3I_progress_completion_count, logs
   the new value, and leaves the critical section. */
#ifndef MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT
#define MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT                               \
    do {                                                                        \
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_COMPLETION_MUTEX);          \
        MPIDI_CH3I_progress_completion_count += 1;                              \
        MPL_DBG_MSG_D(MPIDI_CH3_DBG_PROGRESS, VERBOSE,                          \
                      "just incremented MPIDI_CH3I_progress_completion_count=%d", \
                      MPIDI_CH3I_progress_completion_count);                    \
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_COMPLETION_MUTEX);           \
    } while (0)
#endif
1160 
1161 
/* The following is part of an implementation of a control of a
   resource shared among threads - it needs to be managed more
   explicitly as such as shared resource */
/* MPIDI_CH3_Progress_signal_completion() - record that a request completed.
   The non-threaded variant only bumps the completion counter; the threaded
   variant additionally runs MPIDI_CH3I_PROGRESS_WAKEUP.  Both variants are
   single do { } while (0) statements, so a call followed by ';' behaves the
   same in either build, including as the body of an unbraced if/else (the
   non-threaded variant used to be a bare brace block). */
#ifndef MPICH_IS_THREADED
#   define MPIDI_CH3_Progress_signal_completion()			\
    do {                                                                \
        MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT;                      \
    } while (0)
#else
    /* TODO these decls should probably move into each channel as appropriate */
    extern volatile int MPIDI_CH3I_progress_blocked;
    extern volatile int MPIDI_CH3I_progress_wakeup_signalled;

/* This allows the channel to hook the MPIDI_CH3_Progress_signal_completion
 * macro when it is necessary to wake up some part of the progress engine from a
 * blocking operation.  Currently ch3:sock uses it, ch3:nemesis does not. */
/* MT alternative implementations of this macro are responsible for providing any
 * synchronization (acquiring MPIDCOMM, etc) */
#ifndef MPIDI_CH3I_PROGRESS_WAKEUP
# define MPIDI_CH3I_PROGRESS_WAKEUP do {/*do nothing*/} while(0)
#endif

    void MPIDI_CH3I_Progress_wakeup(void);
    /* MT TODO profiling is needed here.  We currently protect the completion
     * counter with the COMPLETION critical section, which could be a source of
     * contention.  It should be possible to perform these updates atomically via
     * OPA instead, but the additional complexity should be justified by
     * profiling evidence.  [goodell@ 2010-06-29] */
#   define MPIDI_CH3_Progress_signal_completion()			\
    do {                                                                \
        MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT;                      \
        MPIDI_CH3I_PROGRESS_WAKEUP;                                     \
    } while (0)
#endif
1196 
/* Function that may be used to provide business card info */
int MPIDI_CH3I_BCInit( char **bc_val_p, int *val_max_sz_p);
/* Function to free the storage allocated by MPIDI_CH3I_BCInit */
int MPIDI_CH3I_BCFree( char *publish_bc );

/* Inform the process group of our connection information string (business
   card) */
int MPIDI_PG_SetConnInfo( int rank, const char *connString );

/* Fill in the node_id information for each VC in the given PG. */
int MPIDI_Populate_vc_node_ids(MPIDI_PG_t *pg, int our_pg_rank);

/* NOTE: Channel function prototypes are in mpidi_ch3_post.h since some of the
   macros require their declarations. */

/* FIXME: These should be defined only when these particular utility
   packages are used.  Best would be to keep these prototypes in the
   related util/xxx directories, and either copy them into an include
   directory used only for builds or add (yet another) include path */
/* from util/sock */
int MPIDI_VC_InitSock( MPIDI_VC_t *);
int MPIDI_CH3I_Connect_to_root_sock(const char *, MPIDI_VC_t **);


/* NOTE(review): undocumented; the name suggests it posts a socket connect
   for the given VC -- confirm in util/sock. */
int MPIDI_CH3I_VC_post_sockconnect(MPIDI_VC_t * );

/* FIXME: Where should this go? */

/* Used internally to broadcast process groups belonging to peercomm to
   all processes in comm */
int MPID_PG_BCast( MPIR_Comm *peercomm_p, MPIR_Comm *comm_p, int root );
1228 
/* Channel definitions */
/*@
  MPIDI_CH3_iStartMsg - A non-blocking request to send a CH3 packet.  A
  request object is allocated only if the send could not be completed
  immediately.

  Input Parameters:
+ vc - virtual connection to send the message over
. pkt - pointer to a MPIDI_CH3_Pkt_t structure containing the substructure to
  be sent
- pkt_sz - size of the packet substructure

  Output Parameters:
. sreq_ptr - send request or NULL if the send completed immediately

  Return value:
  An mpi error code.

  NOTE:
  The packet structure may be allocated on the stack.

  IMPLEMENTORS:
  If the send can not be completed immediately, the CH3 packet structure must
  be stored internally until the request is complete.

  If the send completes immediately, the channel implementation should return
  NULL.
@*/
int MPIDI_CH3_iStartMsg(MPIDI_VC_t * vc, void * pkt, intptr_t pkt_sz,
			MPIR_Request **sreq_ptr);
1259 
1260 
/*@
  MPIDI_CH3_iStartMsgv - A non-blocking request to send a CH3 packet and
  associated data.  A request object is allocated only if
  the send could not be completed immediately.

  Input Parameters:
+ vc - virtual connection to send the message over
. iov - a vector of structures, each containing a buffer pointer and length
- iov_n - number of elements in the vector

  Output Parameters:
. sreq_ptr - send request or NULL if the send completed immediately

  Return value:
  An mpi error code.

  NOTE:
  The first element in the vector must point to the packet structure.   The
  packet structure and the vector may be allocated on
  the stack.

  IMPLEMENTORS:
  If the send can not be completed immediately, the CH3 packet structure and
  the vector must be stored internally until the
  request is complete.

  If the send completes immediately, the channel implementation should return
  NULL.
@*/
int MPIDI_CH3_iStartMsgv(MPIDI_VC_t * vc, struct iovec * iov, int iov_n,
			 MPIR_Request **sreq_ptr);
1292 
1293 
/*@
  MPIDI_CH3_iSend - A non-blocking request to send a CH3 packet using an
  existing request object.  When the send is complete
  the channel implementation will call the OnDataAvail routine in the
  request, if any (if not, the channel implementation will mark the
  request as complete).

  Input Parameters:
+ vc - virtual connection over which to send the CH3 packet
. sreq - pointer to the send request object
. pkt - pointer to a MPIDI_CH3_Pkt_t structure containing the substructure to
  be sent
- pkt_sz - size of the packet substructure

  Return value:
  An mpi error code.

  NOTE:
  The packet structure may be allocated on the stack.

  IMPLEMENTORS:
  If the send can not be completed immediately, the packet structure must be
  stored internally until the request is complete.

  If the send completes immediately, the channel implementation still must
  invoke the OnDataAvail routine in the request, if any; otherwise, it
  must set the request as complete.
@*/
int MPIDI_CH3_iSend(MPIDI_VC_t * vc, MPIR_Request * sreq, void * pkt,
		    intptr_t pkt_sz);
1324 
1325 
/*@
  MPIDI_CH3_iSendv - A non-blocking request to send a CH3 packet and
  associated data using an existing request object.  When
  the send is complete the channel implementation will call the
  OnDataAvail routine in the request, if any.

  Input Parameters:
+ vc - virtual connection over which to send the CH3 packet and data
. sreq - pointer to the send request object
. iov - a vector of structures, each containing a buffer pointer and length
- iov_n - number of elements in the vector

  Return value:
  An mpi error code.

  NOTE:
  The first element in the vector must point to the packet structure.   The
  packet structure and the vector may be allocated on
  the stack.

  IMPLEMENTORS:
  If the send can not be completed immediately, the packet structure and the
  vector must be stored internally until the request is
  complete.

  If the send completes immediately, the channel implementation still must
  call the OnDataAvail routine in the request, if any.
@*/
int MPIDI_CH3_iSendv(MPIDI_VC_t * vc, MPIR_Request * sreq, struct iovec * iov,
		     int iov_n);
1356 
/*@
  MPIDI_CH3_Connection_terminate - terminate the underlying connection
  associated with the specified VC

  Input Parameters:
. vc - virtual connection

  Return value:
  An MPI error code
@*/
int MPIDI_CH3_Connection_terminate(MPIDI_VC_t * vc);

/* MPIDI_CH3_Connect_to_root (really connect to peer) - channel routine
   for connecting to a process through a port, used in implementing
   MPID_Comm_connect and accept.
   NOTE(review): the parameters are unnamed; presumably the port name (input)
   and the VC for the new connection (output) -- confirm. */
int MPIDI_CH3_Connect_to_root(const char *, MPIDI_VC_t **);
1373 
/*
 * Channel utility prototypes
 */
/* Receive-queue routines.
 * NOTE(review): the suffixes presumably abbreviate the operation performed
 * (F = find, D = dequeue, AE = allocate and enqueue, U = unexpected queue,
 * P = posted queue) -- confirm against the receive-queue implementation. */
int MPIDI_CH3U_Recvq_init(void);
int MPIDI_CH3U_Recvq_FU(int, int, int, MPI_Status * );
MPIR_Request * MPIDI_CH3U_Recvq_FDU(MPI_Request, MPIDI_Message_match *);
MPIR_Request * MPIDI_CH3U_Recvq_FDU_matchonly(int source, int tag, int context_id, MPIR_Comm *comm,
                                   int *foundp);
MPIR_Request * MPIDI_CH3U_Recvq_FDU_or_AEP(int source, int tag,
                                          int context_id, MPIR_Comm *comm, void *user_buf,
                                           MPI_Aint user_count, MPI_Datatype datatype, int * foundp);
int MPIDI_CH3U_Recvq_DP(MPIR_Request * rreq);
MPIR_Request * MPIDI_CH3U_Recvq_FDP_or_AEU(MPIDI_Message_match * match,
					   int * found);
int MPIDI_CH3U_Recvq_count_unexp(void);
int MPIDI_CH3U_Complete_posted_with_error(MPIDI_VC_t *vc);
int MPIDI_CH3U_Clean_recvq(MPIR_Comm *comm_ptr);


/* Request helpers for loading IOVs and unpacking temporary buffers. */
int MPIDI_CH3U_Request_load_send_iov(MPIR_Request * const sreq,
				     struct iovec * const iov, int * const iov_n);
int MPIDI_CH3U_Request_load_recv_iov(MPIR_Request * const rreq);
int MPIDI_CH3U_Request_unpack_uebuf(MPIR_Request * rreq);
int MPIDI_CH3U_Request_unpack_srbuf(MPIR_Request * rreq);

/* Copy between a send description (sbuf, scount, sdt) and a receive
 * description (rbuf, rcount, rdt).  smpi_errno, rdata_sz and rmpi_errno are
 * pointer parameters.
 * NOTE(review): presumably they return the send-side error, the number of
 * bytes received and the receive-side error -- confirm. */
void MPIDI_CH3U_Buffer_copy(const void * const sbuf, MPI_Aint scount,
			    MPI_Datatype sdt, int * smpi_errno,
			    void * const rbuf, MPI_Aint rcount, MPI_Datatype rdt,
			    intptr_t * rdata_sz, int * rmpi_errno);
int MPIDI_CH3U_Post_data_receive(int found, MPIR_Request ** rreqp);
int MPIDI_CH3U_Post_data_receive_found(MPIR_Request * rreqp);
int MPIDI_CH3U_Post_data_receive_unexpected(MPIR_Request * rreqp);
int MPIDI_CH3U_Receive_data_found(MPIR_Request *rreq, void *buf, intptr_t *buflen, int *complete);
int MPIDI_CH3U_Receive_data_unexpected(MPIR_Request * rreq, void *buf, intptr_t *buflen, int *complete);
1408 
/* Initialization routine for ch3u_comm.c */
int MPIDI_CH3I_Comm_init(void);

int MPIDI_CH3I_Comm_handle_failed_procs(MPIR_Group *new_failed_procs);
/* NOTE(review): presumably stores in *comm the communicator that uses
   context_id -- confirm in ch3u_comm.c. */
void MPIDI_CH3I_Comm_find(MPIR_Context_id_t context_id, MPIR_Comm **comm);

/* The functions below allow channels to register functions to be
   called immediately after a communicator has been created, and
   immediately before a communicator is to be destroyed.  Each hook
   function takes the communicator and a void pointer and returns int.
   NOTE(review): presumably the void pointer is the param supplied at
   registration -- confirm.
 */
int MPIDI_CH3U_Comm_register_create_hook(int (*hook_fn)(struct MPIR_Comm *, void *), void *param);
int MPIDI_CH3U_Comm_register_destroy_hook(int (*hook_fn)(struct MPIR_Comm *, void *), void *param);
1421 
/* FIXME: This is a macro! */
/* The prototype below is only declared when MPIDI_CH3_Request_add_ref has
   not already been defined as a macro. */
#ifndef MPIDI_CH3_Request_add_ref
/*@
  MPIDI_CH3_Request_add_ref - Increment the reference count associated with a
  request object

  Input Parameters:
. req - pointer to the request object
@*/
void MPIDI_CH3_Request_add_ref(MPIR_Request * req);
#endif
1433 
/*@
  MPIDI_CH3_GetParentPort - obtain the port name associated with the parent

  Output Parameters:
.  parent_port_name - the port name associated with the parent communicator

  Return value:
  An MPI error code.

  NOTE:
  'MPIDI_CH3_GetParentPort' should only be called if the initialization
  (in the current implementation, done with the static function
  'InitPGFromPMI' in 'mpid_init.c') has determined that this process
  in fact has a parent.
@*/
int MPIDI_CH3_GetParentPort(char ** parent_port_name);

/*@
   MPIDI_CH3_FreeParentPort - This routine frees the storage associated with
   a parent port (allocated with MPIDI_CH3_GetParentPort).

  @*/
void MPIDI_CH3_FreeParentPort( void );
1457 
/*E
  MPIDI_CH3_Abort - Abort this process.

  Input Parameters:
+ exit_code - exit code to be returned by the process
- error_msg - error message to print

  Return value:
  This function should not return.  (It is nevertheless declared as
  returning int.)

  Notes:
  This routine is used only if the channel defines
  'MPIDI_CH3_IMPLEMENTS_ABORT'.  This allows the channel to handle
  aborting processes, particularly when the channel does not use the standard
  PMI interface.
E*/
int MPIDI_CH3_Abort(int exit_code, char * error_msg);
1475 
/* FIXME: Move these prototypes into header files in the appropriate
   util directories  */
/* added by brad.  upcalls for MPIDI_CH3_Init that contain code which could be
   executed by two or more channels */
int MPIDI_CH3U_Init_sock(int has_parent, MPIDI_PG_t * pg_p, int pg_rank,
                         char **bc_val_p, int *val_max_sz_p);

/* added by brad.  business card related global and functions */
/* FIXME: Make these part of the channel support headers */
/* NOTE(review): presumably the maximum length of a host description string
   in a business card -- confirm where it is used. */
#define MAX_HOST_DESCRIPTION_LEN 256
int MPIDI_CH3U_Get_business_card_sock(int myRank,
				      char **bc_val_p, int *val_max_sz_p);

/* NOTE(review): presumably writes the business card for myRank into value,
   a buffer of length bytes -- confirm. */
int MPIDI_CH3_Get_business_card(int myRank, char *value, int length);
1490 
1491 /*
1492  * Channel upcall prototypes
1493  */
1494 
/*E
  MPIDI_CH3U_Handle_recv_pkt - Handle a freshly received CH3 packet.

  Input Parameters:
+ vc - virtual connection over which the packet was received
. pkt - pointer to the CH3 packet header
- data - pointer to the start address of data

  Output Parameter:
. rreqp - receive request defining data to be received; may be NULL

  NOTE:
  Multiple threads may not simultaneously call this routine with the same
  virtual connection.  This constraint eliminates the
  need to lock the VC and thus improves performance.  If simultaneous upcalls
  for a single VC are possible, then the calling
  routine must serialize the calls (perhaps by locking the VC).  Special
  consideration may need to be given to packet ordering
  if the channel has made guarantees about ordering.

  NOTE(review): the 'buflen' argument was never described in this comment.
  Presumably it carries the number of bytes available at 'data' on entry and
  is updated on return; confirm against the implementation.
E*/
int MPIDI_CH3U_Handle_recv_pkt(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, void *data,
			       intptr_t *buflen, MPIR_Request ** rreqp);
1517 
/*@
  MPIDI_CH3U_Handle_recv_req - Process a receive request for which all of the
  data has been received (and copied) into the
  buffers described by the request's IOV.

  Input Parameters:
+ vc - virtual connection over which the data was received
- rreq - pointer to the receive request object

  Output Parameter:
. complete - data transfer for the request has completed

  NOTE(review): the return value is not documented here; presumably an MPI
  error code, as for the other routines in this header - confirm.
@*/
int MPIDI_CH3U_Handle_recv_req(MPIDI_VC_t * vc, MPIR_Request * rreq,
			       int * complete);
1532 
/* MPIDI_CH3U_Handle_send_req invokes the action (method/function) when data
   becomes available.  This routine is obsolete; the completion function
   should be invoked directly instead.  The parameter list mirrors that of
   MPIDI_CH3U_Handle_recv_req, with a send request in place of the receive
   request. */
int MPIDI_CH3U_Handle_send_req(MPIDI_VC_t * vc, MPIR_Request * sreq,
			       int *complete);
1538 
/* Connection-event and close-related upcalls. */
int MPIDI_CH3U_Handle_connection(MPIDI_VC_t * vc, MPIDI_VC_Event_t event);

int MPIDI_CH3U_VC_SendClose( MPIDI_VC_t *vc, int rank );
int MPIDI_CH3U_VC_WaitForClose( void );
/* MPIDI_CH3_Channel_close is a real function only when the channel defines
   MPIDI_CH3_HAS_CHANNEL_CLOSE; otherwise the call expands to the constant
   MPI_SUCCESS. */
#ifdef MPIDI_CH3_HAS_CHANNEL_CLOSE
int MPIDI_CH3_Channel_close( void );
#else
#define MPIDI_CH3_Channel_close( )   MPI_SUCCESS
#endif
1548 
/* MPIDI_CH3U_Get_failed_group() generates a group of failed processes based
 * on the last list generated during MPIDI_CH3U_Check_for_failed_procs().
 * NOTE(review): the group is presumably handed back through 'failed_group';
 * the meaning of 'last_rank' is not visible here - confirm both against the
 * implementation. */
int MPIDI_CH3U_Get_failed_group(int last_rank, MPIR_Group **failed_group);
/* MPIDI_CH3U_Check_for_failed_procs() reads the PMI_dead_processes key
 * and marks the VCs to those processes as failed. */
int MPIDI_CH3U_Check_for_failed_procs(void);
1555 
/*@
  MPIDI_CH3_Pre_init - Allows the channel to initialize before PMI_Init is
  called, and allows the
  channel to optionally set the rank, size, and whether this process has a
  parent.

  Output Parameters:
+ setvals - boolean value that is true if this function set has_parent, rank,
  and size
. has_parent - boolean value that is true if this MPI job was spawned by
  another set of MPI processes
. rank - rank of this process in the process group
- size - number of processes in the process group

  Return value:
  An MPI error code.

  Notes:
  This function is optional, and is used only when HAVE_CH3_PRE_INIT is
  defined.  It is called by CH3 before PMI_Init.  If the function sets setvals
  to TRUE, CH3 will not use PMI to get the rank, size, etc.
@*/
int MPIDI_CH3_Pre_init (int *setvals, int *has_parent, int *rank, int *size);
1579 
/*@
  MPIDI_CH3_Init - Initialize the channel implementation.

  Input Parameters:
+ has_parent - boolean value that is true if this MPI job was spawned by
  another set of MPI processes
. pg_ptr - the new process group representing MPI_COMM_WORLD
- pg_rank - my rank in the process group

  Return value:
  An MPI error code.

  Notes:
  MPID_Init has called 'PMI_Init' and created the process group structure
  before this routine is called.
@*/
int MPIDI_CH3_Init(int has_parent, MPIDI_PG_t *pg_ptr, int pg_rank );
1597 
/*@
  MPIDI_CH3_Finalize - Shutdown the channel implementation.

  Return value:
  An MPI error class.

  NOTE(review): the sibling routines in this header document their result as
  an MPI error code rather than an error class; confirm which one applies.
@*/
int MPIDI_CH3_Finalize(void);
1605 
/*@
  MPIDI_CH3_VC_Init - Perform channel-specific initialization of a VC

  Input Parameter:
. vc - Virtual connection to initialize

  NOTE(review): the return value is not documented here; presumably an MPI
  error code - confirm.
@*/
int MPIDI_CH3_VC_Init( struct MPIDI_VC *vc );
1613 
/*@
   MPIDI_CH3_PG_Destroy - Perform any channel-specific actions when freeing
   a process group

    Input Parameter:
.   pg - Process group on which to act

   NOTE(review): the return value is not documented here; presumably an MPI
   error code - confirm.
@*/
int MPIDI_CH3_PG_Destroy( struct MPIDI_PG *pg );
1622 
/*@
    MPIDI_CH3_VC_Destroy - Perform any channel-specific actions when freeing a
    virtual connection.

    Input Parameter:
.   vc - Virtual connection on which to act
@*/
int MPIDI_CH3_VC_Destroy( struct MPIDI_VC *vc );
1630 
/*@
  MPIDI_CH3_InitCompleted - Perform any channel-specific initialization
  actions after MPID_Init but before MPI_Init (or MPI_Init_thread) returns
@*/
int MPIDI_CH3_InitCompleted( void );
1635 
1636 #ifdef MPIDI_CH3_HASIMPL_HEADER
1637 #include "mpidi_ch3_mpid.h"
1638 #endif
1639 /* Routines in support of ch3 */
1640 
#ifndef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
/* Routine to return the tag associated with a port */
int MPIDI_GetTagFromPort( const char *, int * );
#else
/* When MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS is defined no function is declared.
 * This placeholder macro keeps callers compiling: it ignores both arguments
 * (port_name_tag is left untouched) and evaluates to MPI_SUCCESS. */
#define MPIDI_GetTagFromPort(port_name, port_name_tag) (MPI_SUCCESS)
#endif
1649 
1650 /* Here are the packet handlers */
1651 int MPIDI_CH3_PktHandler_EagerSend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1652 				   intptr_t *, MPIR_Request ** );
1653 #ifdef USE_EAGER_SHORT
1654 int MPIDI_CH3_PktHandler_EagerShortSend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1655 					 intptr_t *, MPIR_Request ** );
1656 #endif
1657 int MPIDI_CH3_PktHandler_ReadySend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1658 				    intptr_t *, MPIR_Request ** );
1659 int MPIDI_CH3_PktHandler_EagerSyncSend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1660 					intptr_t *, MPIR_Request ** );
1661 int MPIDI_CH3_PktHandler_EagerSyncAck( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1662 				       intptr_t *, MPIR_Request ** );
1663 int MPIDI_CH3_PktHandler_RndvReqToSend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1664 					intptr_t *, MPIR_Request ** );
1665 int MPIDI_CH3_PktHandler_RndvClrToSend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1666 					intptr_t *, MPIR_Request ** );
1667 int MPIDI_CH3_PktHandler_RndvSend( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1668 				   intptr_t *, MPIR_Request ** );
1669 int MPIDI_CH3_PktHandler_CancelSendReq( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1670 					intptr_t *, MPIR_Request ** );
1671 int MPIDI_CH3_PktHandler_CancelSendResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1672 					 intptr_t *, MPIR_Request ** );
1673 int MPIDI_CH3_PktHandler_Put( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1674 			      intptr_t *, MPIR_Request ** );
1675 int MPIDI_CH3_PktHandler_Accumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1676 				     intptr_t *, MPIR_Request ** );
1677 int MPIDI_CH3_PktHandler_GetAccumulate( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1678                                         intptr_t *, MPIR_Request ** );
1679 int MPIDI_CH3_PktHandler_CAS( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1680                               intptr_t *, MPIR_Request ** );
1681 int MPIDI_CH3_PktHandler_CASResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1682                                   intptr_t *, MPIR_Request ** );
1683 int MPIDI_CH3_PktHandler_FOP( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1684                               intptr_t *, MPIR_Request ** );
1685 int MPIDI_CH3_PktHandler_FOPResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1686                                   intptr_t *, MPIR_Request ** );
1687 int MPIDI_CH3_PktHandler_Get_AccumResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1688                                         intptr_t *, MPIR_Request ** );
1689 int MPIDI_CH3_PktHandler_Get( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1690 			      intptr_t *, MPIR_Request ** );
1691 int MPIDI_CH3_PktHandler_GetResp( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1692 				 intptr_t *, MPIR_Request ** );
1693 int MPIDI_CH3_PktHandler_Lock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1694 			      intptr_t *, MPIR_Request ** );
1695 int MPIDI_CH3_PktHandler_LockAck( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1696 				      intptr_t *, MPIR_Request ** );
1697 int MPIDI_CH3_PktHandler_LockOpAck( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1698                                     intptr_t *, MPIR_Request ** );
1699 int MPIDI_CH3_PktHandler_Unlock( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1700                                  intptr_t *, MPIR_Request ** );
1701 int MPIDI_CH3_PktHandler_Flush( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1702                                 intptr_t *, MPIR_Request ** );
1703 int MPIDI_CH3_PktHandler_Ack( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1704                               intptr_t *, MPIR_Request ** );
1705 int MPIDI_CH3_PktHandler_DecrAtCnt( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1706                                     intptr_t *, MPIR_Request ** );
1707 int MPIDI_CH3_PktHandler_FlowCntlUpdate( MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, void *,
1708 					 intptr_t *, MPIR_Request ** );
1709 int MPIDI_CH3_PktHandler_Close( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1710 				intptr_t *, MPIR_Request ** );
1711 
#ifndef MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS
/* Packet handlers used in dynamic process connection.  They are declared
   only when MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS is not defined. */
int MPIDI_CH3_PktHandler_ConnAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, void * data,
                                 intptr_t * buflen, MPIR_Request ** rreqp);
int MPIDI_CH3_PktHandler_AcceptAck(MPIDI_VC_t * vc, MPIDI_CH3_Pkt_t * pkt, void * data,
                                   intptr_t * buflen, MPIR_Request ** rreqp);
#endif /* !MPIDI_CH3_HAS_NO_DYNAMIC_PROCESS */
1719 
1720 int MPIDI_CH3_PktHandler_EndCH3( MPIDI_VC_t *, MPIDI_CH3_Pkt_t *, void *,
1721 				 intptr_t *, MPIR_Request ** );
1722 int MPIDI_CH3_PktHandler_Revoke(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, void * data,
1723                                 intptr_t *buflen, MPIR_Request **rreqp);
1724 int MPIDI_CH3_PktHandler_Init( MPIDI_CH3_PktHandler_Fcn *[], int );
1725 
1726 int MPIDI_CH3I_RMA_Make_progress_global(int *made_progress);
1727 
#ifdef MPICH_DBG_OUTPUT
/* Packet print routines, declared only when MPICH_DBG_OUTPUT is defined.
   Each one takes a stdio stream (fp) and a pointer to a CH3 packet (pkt). */
int MPIDI_CH3_PktPrint_CancelSendReq( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_CancelSendResp( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_EagerSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_ReadySend( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_RndvReqToSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_RndvClrToSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_RndvSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_EagerSyncSend( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
int MPIDI_CH3_PktPrint_EagerSyncAck( FILE *fp, MPIDI_CH3_Pkt_t *pkt );
#endif
1739 
/* Routines to create packets (used in implementing MPI communications).
 *
 * NOTE(review): most parameters below are unnamed, so their meaning cannot
 * be read off this header; consult the definitions.  Also note that the
 * third parameter of MPIDI_CH3_EagerSyncNoncontigSend is 'int', while the
 * same position in MPIDI_CH3_RndvSend is 'MPI_Aint', and the order of the
 * 'int'/'intptr_t' pair after MPI_Datatype differs between the two -
 * confirm that each prototype matches its definition. */
int MPIDI_CH3_EagerNoncontigSend( MPIR_Request **, MPIDI_CH3_Pkt_type_t,
				  const void *, MPI_Aint,
				  MPI_Datatype, int, int, MPIR_Comm *,
				  int );
int MPIDI_CH3_EagerContigSend( MPIR_Request **, MPIDI_CH3_Pkt_type_t,
			       const void *, intptr_t, int,
			       int, MPIR_Comm *, int );
int MPIDI_CH3_EagerContigShortSend( MPIR_Request **, MPIDI_CH3_Pkt_type_t,
				    const void *, intptr_t,
				    int, int, MPIR_Comm *, int );
int MPIDI_CH3_EagerContigIsend( MPIR_Request **, MPIDI_CH3_Pkt_type_t,
				const void *, intptr_t, int,
				int, MPIR_Comm *, int );


int MPIDI_CH3_RndvSend( MPIR_Request **, const void *, MPI_Aint, MPI_Datatype,
			int, intptr_t, MPI_Aint, int, int, MPIR_Comm *, int );

int MPIDI_CH3_EagerSyncNoncontigSend( MPIR_Request **, const void *, int,
				      MPI_Datatype, intptr_t, int, MPI_Aint,
				      int, int, MPIR_Comm *, int );
int MPIDI_CH3_EagerSyncZero(MPIR_Request **, int, int, MPIR_Comm *, int );

int MPIDI_CH3_SendNoncontig_iov( struct MPIDI_VC *vc, struct MPIR_Request *sreq,
                                 void *header, intptr_t hdr_sz,
                                 struct iovec *hdr_iov, int n_hdr_iov);
1767 
/* Routines to acknowledge packets; they are called in the receive routines
   when a message is matched. */
int MPIDI_CH3_EagerSyncAck( MPIDI_VC_t *, MPIR_Request * );
int MPIDI_CH3_RecvFromSelf( MPIR_Request *, void *, MPI_Aint, MPI_Datatype );
int MPIDI_CH3_RecvRndv( MPIDI_VC_t *, MPIR_Request * );
1773 
1774 /* Handler routines to continuing after an IOV is processed (assigned to the
1775    OnDataAvail field in the device part of a request) */
1776 int MPIDI_CH3_ReqHandler_RecvComplete( MPIDI_VC_t *, MPIR_Request *, int * );
1777 int MPIDI_CH3_ReqHandler_UnpackUEBufComplete( MPIDI_VC_t *, MPIR_Request *,
1778 					      int * );
1779 int MPIDI_CH3_ReqHandler_ReloadIOV( MPIDI_VC_t *, MPIR_Request *, int * );
1780 
1781 int MPIDI_CH3_ReqHandler_UnpackSRBufReloadIOV( MPIDI_VC_t *, MPIR_Request *,
1782 					       int * );
1783 int MPIDI_CH3_ReqHandler_UnpackSRBufComplete( MPIDI_VC_t *, MPIR_Request *,
1784 					      int * );
1785 int MPIDI_CH3_ReqHandler_PutDerivedDTRecvComplete( MPIDI_VC_t *,
1786 						   MPIR_Request *, int * );
1787 int MPIDI_CH3_ReqHandler_PutRecvComplete( MPIDI_VC_t *, MPIR_Request *,
1788                                           int * );
1789 int MPIDI_CH3_ReqHandler_AccumRecvComplete( MPIDI_VC_t *, MPIR_Request *,
1790                                             int * );
1791 int MPIDI_CH3_ReqHandler_GaccumRecvComplete( MPIDI_VC_t *, MPIR_Request *,
1792                                              int * );
1793 int MPIDI_CH3_ReqHandler_FOPRecvComplete( MPIDI_VC_t *, MPIR_Request *,
1794                                           int * );
1795 int MPIDI_CH3_ReqHandler_AccumMetadataRecvComplete( MPIDI_VC_t *,
1796                                                     MPIR_Request *,
1797                                                     int * );
1798 int MPIDI_CH3_ReqHandler_GaccumMetadataRecvComplete( MPIDI_VC_t *,
1799                                                      MPIR_Request *,
1800                                                      int * );
1801 int MPIDI_CH3_ReqHandler_GetDerivedDTRecvComplete( MPIDI_VC_t *,
1802 						   MPIR_Request *, int * );
1803 int MPIDI_CH3_ReqHandler_PiggybackLockOpRecvComplete( MPIDI_VC_t *,
1804                                                       MPIR_Request *, int * );
1805 /* Send Handlers */
1806 int MPIDI_CH3_ReqHandler_SendReloadIOV( MPIDI_VC_t *vc, MPIR_Request *sreq,
1807 					int *complete );
1808 int MPIDI_CH3_ReqHandler_GetSendComplete( MPIDI_VC_t *, MPIR_Request *,
1809                                           int * );
1810 int MPIDI_CH3_ReqHandler_GaccumSendComplete( MPIDI_VC_t *, MPIR_Request *,
1811                                              int * );
1812 int MPIDI_CH3_ReqHandler_CASSendComplete( MPIDI_VC_t *, MPIR_Request *,
1813                                           int * );
1814 int MPIDI_CH3_ReqHandler_FOPSendComplete( MPIDI_VC_t *, MPIR_Request *,
1815                                           int * );
/* RMA operation request handler.  Its only argument is the request; it takes
   no virtual connection and no completion flag. */
int MPIDI_CH3_Req_handler_rma_op_complete(MPIR_Request *);
1818 
/* Store the eager threshold for a (comm, vc) pair through eager_threshold_p.
   The VC's eager_max_msg_sz is used only when the communicator's
   MPIR_COMM_HINT_EAGER_THRESH hint is -1; any other hint value is used
   as-is.  'comm' may be evaluated more than once, so pass arguments that
   have no side effects. */
#define MPIDI_CH3_GET_EAGER_THRESHOLD(eager_threshold_p, comm, vc)          \
    do {                                                                    \
        if ((comm)->hints[MPIR_COMM_HINT_EAGER_THRESH] == -1) {             \
            *(eager_threshold_p) = (vc)->eager_max_msg_sz;                  \
        } else {                                                            \
            *(eager_threshold_p) =                                          \
                (comm)->hints[MPIR_COMM_HINT_EAGER_THRESH];                 \
        }                                                                   \
    } while (0)
1826 
1827 
1828 #endif /* MPIDIMPL_H_INCLUDED */
1829