1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
2 /*
3  *  (C) 2001 by Argonne National Laboratory.
4  *      See COPYRIGHT in top-level directory.
5  *
6  * Portions of this code were written by Microsoft. Those portions are
7  * Copyright (c) 2007 Microsoft Corporation. Microsoft grants
8  * permission to use, reproduce, prepare derivative works, and to
9  * redistribute to others. The code is licensed "as is." The User
10  * bears the risk of using it. Microsoft gives no express warranties,
11  * guarantees or conditions. To the extent permitted by law, Microsoft
12  * excludes the implied warranties of merchantability, fitness for a
13  * particular purpose and non-infringement.
14  */
15 #ifndef MPIIMPL_H_INCLUDED
16 #define MPIIMPL_H_INCLUDED
17 
18 /*
19  * This file is the temporary home of most of the definitions used to
20  * implement MPICH.  We will eventually divide this file into logical
21  * pieces once we are certain of the relationships between the components.
22  */
23 
24 /* style: define:vsnprintf:1 sig:0 */
25 /* style: allow:printf:3 sig:0 */
26 
27 /* Include the mpi definitions */
28 #include "mpi.h"
29 
30 /* There are a few definitions that must be made *before* the mpichconf.h
31    file is included.  These include the definitions of the error levels and some
32    thread granularity constants */
33 #include "mpichconfconst.h"
34 
35 /* Data computed by configure.  This is included *after* mpi.h because we
36    do not want mpi.h to depend on any other files or configure flags */
37 #include "mpichconf.h"
38 
39 /* if we are defining this, we must define it before including mpl.h */
40 #if defined(MPICH_DEBUG_MEMINIT)
41 #define MPL_VG_ENABLED 1
42 #endif
43 #include "mpl.h"
44 
45 #include <stdio.h>
46 #ifdef STDC_HEADERS
47 #include <stdlib.h>
48 #include <stdarg.h>
49 #include <string.h>
50 #else
51 #ifdef HAVE_STDLIB_H
52 #include <stdlib.h>
53 #endif
54 #ifdef HAVE_STDARG_H
55 #include <stdarg.h>
56 #endif
57 #ifdef HAVE_STRING_H
58 #include <string.h>
59 #endif
60 #endif
61 
62 #ifdef HAVE_SYS_TYPES_H
63 #include <sys/types.h>
64 #endif
65 
66 /* for MAXHOSTNAMELEN under Linux and OSX */
67 #ifdef HAVE_SYS_PARAM_H
68 #include <sys/param.h>
69 #endif
70 
71 #if defined (HAVE_USLEEP)
72 #include <unistd.h>
73 #if defined (NEEDS_USLEEP_DECL)
74 int usleep(useconds_t usec);
75 #endif
76 #endif
77 
/* MAX_HOSTNAME_LEN: a value defined earlier is kept as is.  Otherwise use
   MAXHOSTNAMELEN when it is available (see the sys/param.h include above),
   and fall back to 256 when it is not. */
#ifndef MAX_HOSTNAME_LEN
#  ifdef MAXHOSTNAMELEN
#    define MAX_HOSTNAME_LEN MAXHOSTNAMELEN
#  else
#    define MAX_HOSTNAME_LEN 256
#  endif
#endif
83 
/* Default PMI version to use, given as a version number and a subversion
   number (both are 1 here). */
#define MPIU_DEFAULT_PMI_VERSION 1
#define MPIU_DEFAULT_PMI_SUBVERSION 1
87 
/* PMPI_LOCAL is a declaration prefix.  When weak symbols are in use it
   expands to `static`, which keeps the name local to a single file;
   without weak symbols it expands to nothing. */
#ifndef USE_WEAK_SYMBOLS
#  define PMPI_LOCAL
#else
#  define PMPI_LOCAL static
#endif
95 
/* Fix for universal endianness added in autoconf 2.62.
   When WORDS_UNIVERSAL_ENDIAN is defined, the byte order is decided here
   from the __BIG_ENDIAN__ / __LITTLE_ENDIAN__ macros: WORDS_LITTLEENDIAN is
   defined for little endian, nothing is defined for big endian, and the
   build stops if neither macro is present. */
#ifdef WORDS_UNIVERSAL_ENDIAN
#if defined(__BIG_ENDIAN__)
/* big endian: no definition is made in this branch */
#elif defined(__LITTLE_ENDIAN__)
#define WORDS_LITTLEENDIAN
#else
#error 'Universal endianess defined without __BIG_ENDIAN__ or __LITTLE_ENDIAN__'
#endif
#endif
105 
106 /* Include some basic (and easily shared) definitions */
107 #include "mpibase.h"
108 
/* FIXME: The code base should not define two of these */
/* MPIDI_QUOTE(A) turns A into a string literal after A has been
   macro-expanded; MPIDI_QUOTE2 is the raw stringizing step it relies on
   (see FUNCNAME/FCNAME below).  Both are skipped when MPIDI_QUOTE already
   exists. */
#if !defined(MPIDI_QUOTE)
#define MPIDI_QUOTE2(text_) #text_
#define MPIDI_QUOTE(text_)  MPIDI_QUOTE2(text_)
#endif
115 
116 /*
117    Include the implementation definitions (e.g., error reporting, thread
118    portability)
119    More detailed documentation is contained in the MPICH2 and ADI3 manuals.
120  */
121 /* FIXME: ... to do ... */
122 #include "mpitypedefs.h"
123 
124 /* This is the default implementation of MPIU_Memcpy.  We define this
125    before including mpidpre.h so that it can be used when a device or
126    channel can use it if it's overriding MPIU_Memcpy.  */
MPIU_DBG_ATTRIBUTE_NOINLINE(unused)127 MPIU_DBG_ATTRIBUTE_NOINLINE
128 ATTRIBUTE((unused))
129 static MPIU_DBG_INLINE_KEYWORD void MPIUI_Memcpy(void * dst, const void * src, size_t len)
130 {
131     memcpy(dst, src, len);
132 }
133 
134 /* Include definitions from the device which must exist before items in this
135    file (mpiimpl.h) can be defined. mpidpre.h must be included before any
136    files that allow the device to override or extend any terms; this includes
137    mpiimplthread.h and mpiutil.h */
138 /* ------------------------------------------------------------------------- */
139 #include "mpidpre.h"
140 /* ------------------------------------------------------------------------- */
141 
142 /* Overriding memcpy:
143    Devices and channels can override the default implementation of
144    MPIU_Memcpy by defining the MPIU_Memcpy macro.  The implementation
145    can call MPIUI_Memcpy for the default memcpy implementation.
146    Note that MPIU_Memcpy and MPIUI_Memcpy return void rather than a
147    pointer to the destination buffer.  This is different from C89
148    memcpy.
149 */
/* Default MPIU_Memcpy, defined only when nothing included earlier has
   provided one.  It passes the three arguments to MPIU_MEM_CHECK_MEMCPY and
   then copies with MPIUI_Memcpy.  The do/while(0) wrapper makes the
   expansion a single statement; it yields no value.  Each argument appears
   in both steps, so pass expressions without side effects. */
#ifndef MPIU_Memcpy
#define MPIU_Memcpy(dst, src, len)                \
    do {                                          \
        MPIU_MEM_CHECK_MEMCPY((dst),(src),(len)); \
        MPIUI_Memcpy((dst), (src), (len));        \
    } while (0)
#endif
157 
158 #include "mpiimplthread.h"
159 #include "mpiutil.h"
160 
161 /* ------------------------------------------------------------------------- */
162 /* mpidebug.h */
163 /* ------------------------------------------------------------------------- */
164 /* Debugging and printf control */
165 /* Use these *only* for debugging output intended for the implementors
166    and maintainers of MPICH.  Do *not* use these for any output that
167    general users may normally see.  Use either the error code creation
168    routines for error messages or MPIU_msg_printf etc. for general messages
169    (MPIU_msg_printf will go through gettext).
170 
171    FIXME: Document all of these macros
172 
173    NOTE: These macros and values are deprecated.  See
174    www.mcs.anl.gov/mpi/mpich2/developer/design/debugmsg.htm for
175    the new design (only partially implemented at this time).
176 
177    The implementation is in mpidbg.h
178 */
179 #include "mpidbg.h"
180 
#if defined(MPICH_DBG_OUTPUT)
/* MPIU_DBG_PRINTF(e): e is a complete parenthesized argument list, which is
   pasted directly after MPIU_dbg_printf.  The call is made only when
   MPIU_dbg_state is not MPIU_DBG_STATE_NONE.
   NOTE(review): the expansion is a bare { } block, so "MPIU_DBG_PRINTF(x);"
   leaves an empty statement after the block; this does not compile as the
   body of an unbraced if that is followed by else. */
#define MPIU_DBG_PRINTF(e)			\
{						\
    if (MPIU_dbg_state != MPIU_DBG_STATE_NONE)	\
    {						\
	MPIU_dbg_printf e;			\
    }						\
}
/* The first argument is a place holder to allow the selection of a subset
   of debugging events.  The second is a placeholder to allow a numeric
   level of debugging within that class.  The third is the debugging text.
   Currently the first two arguments are ignored and the third is handed to
   MPIU_DBG_PRINTF. */
#define MPIU_DBG_PRINTF_CLASS(_c,_l,_e) MPIU_DBG_PRINTF(_e)
#else
/* MPICH_DBG_OUTPUT is not defined: both macros expand to nothing. */
#define MPIU_DBG_PRINTF(e)
#define MPIU_DBG_PRINTF_CLASS(_c,_l,_e)
#endif
197 
/* The following is temporarily provided for backward compatibility: the old
   name dbg_printf is mapped onto MPIU_dbg_printf.  Any code using dbg_printf
   should be updated to use MPIU_DBG_PRINTF. */
#define dbg_printf MPIU_dbg_printf
201 
202 /* ------------------------------------------------------------------------- */
203 /* end of mpidebug.h */
204 /* ------------------------------------------------------------------------- */
205 
206 /* Routines for memory management */
207 #include "mpimem.h"
208 
/*
 * Use MPIU_SYSCALL to wrap system calls; this provides a convenient point
 * for timing the calls and keeping track of the use of system calls.
 * This macro simply invokes the system call and does not even handle
 * EINTR.
 * To use,
 *    MPIU_SYSCALL( return-value, name-of-call, args-in-parenthesis )
 * e.g., change "n = read(fd,buf,maxn);" into
 *    MPIU_SYSCALL( n,read,(fd,buf,maxn) );
 *
 * Both variants must be used as a complete statement followed by ';'.
 * The return-value argument is expanded several times in the logging
 * variant, so it should be a plain variable.
 *
 * When USE_LOG_SYSCALLS is defined, every call is reported on stdout before
 * and after it runs.  That variant
 *   - is wrapped in do { } while (0) so that it is a single statement and
 *     can be the body of an unbraced if that has an else;
 *   - prints the result through a cast to long, so that results wider than
 *     int (e.g., the ssize_t returned by read) match the format specifier;
 *   - records errno immediately after the call and stores it back at the
 *     end, so the printf/fflush calls used for logging cannot change the
 *     errno value the caller sees.  errno is cleared before the call, so it
 *     is 0 afterwards when the call did not set it.
 */
#ifdef USE_LOG_SYSCALLS
#define MPIU_SYSCALL(a_,b_,c_) do { \
    int MPIU_syscall_errno_; \
    printf( "[%d]about to call %s\n", MPIR_Process.comm_world->rank,#b_);\
    fflush(stdout); errno = 0;\
    a_ = b_ c_; \
    MPIU_syscall_errno_ = errno; \
    if ((a_)>=0 || MPIU_syscall_errno_==0) {\
        printf( "[%d]%s returned %ld\n", \
                MPIR_Process.comm_world->rank, #b_, (long)(a_) );\
    } \
    else { \
        printf( "[%d]%s returned %ld (errno = %d,%s)\n", \
                MPIR_Process.comm_world->rank, \
                #b_, (long)(a_), MPIU_syscall_errno_, \
                MPIU_Strerror(MPIU_syscall_errno_));\
    } \
    fflush(stdout); \
    errno = MPIU_syscall_errno_; \
} while (0)
#else
#define MPIU_SYSCALL(a_,b_,c_) a_ = b_ c_
#endif
237 
238 /*TDSOverview.tex
239 
240   MPI has a number of data structures, most of which are represented by
241   an opaque handle in an MPI program.  In the MPICH implementation of MPI,
242   these handles are represented
243   as integers; this makes implementation of the C/Fortran handle transfer
244   calls (part of MPI-2) easy.
245 
246   MPID objects (again with the possible exception of 'MPI_Request's)
247   are allocated by a common set of object allocation functions.
248   These are
249 .vb
250     void *MPIU_Handle_obj_create( MPIU_Object_alloc_t *objmem )
251     void MPIU_Handle_obj_destroy( MPIU_Object_alloc_t *objmem, void *object )
252 .ve
253   where 'objmem' is a pointer to a memory allocation object that knows
254   enough to allocate objects, including the
255   size of the object and the location of preallocated memory, as well
256   as the type of memory allocator.  By providing the routines to allocate and
257   free the memory, we make it easy to use the same interface to allocate both
258   local and shared memory for objects (always using the same kind for each
259   type of object).
260 
261   The names create/destroy were chosen because they are different from
262   new/delete (C++ operations) and malloc/free.
263   Any name choice will have some conflicts with other uses, of course.
264 
265   Reference Counts:
266   Many MPI objects have reference count semantics.
267   The semantics of MPI require that many objects that have been freed by the
268   user
269   (e.g., with 'MPI_Type_free' or 'MPI_Comm_free') remain valid until all
270   pending
271   references to that object (e.g., by an 'MPI_Irecv') are complete.  There
272   are several ways to implement this; MPICH uses `reference counts` in the
273   objects.  To support the 'MPI_THREAD_MULTIPLE' level of thread-safety, these
274   reference counts must be accessed and updated atomically.
275   A reference count for
276   `any` object can be incremented (atomically)
277   with 'MPIU_Object_add_ref(objptr)'
  and decremented with 'MPIU_Object_release_ref(objptr,inuse_ptr)'.
  These have been designed so that they can be implemented as inlined
280   macros rather than function calls, even in the multithreaded case, and
281   can use special processor instructions that guarantee atomicity to
282   avoid thread locks.
283   The decrement routine sets the value pointed at by 'inuse_ptr' to 0 if
284   the postdecrement value of the reference counter is zero, and to a non-zero
285   value otherwise.  If this value is zero, then the routine that decremented
286   the
287   reference count should free the object.  This may be as simple as
288   calling 'MPIU_Handle_obj_destroy' (for simple objects with no other allocated
289   storage) or may require calling a separate routine to destroy the object.
290   Because MPI uses 'MPI_xxx_free' to both decrement the reference count and
291   free the object if the reference count is zero, we avoid the use of 'free'
292   in the MPID routines.
293 
294   The 'inuse_ptr' approach is used rather than requiring the post-decrement
295   value because, for reference-count semantics, all that is necessary is
296   to know when the reference count reaches zero, and this can sometimes
  be implemented more cheaply than requiring the post-decrement value (e.g.,
298   on IA32, there is an instruction for this operation).
299 
300   Question:
301   Should we state that this is a macro so that we can use a register for
302   the output value?  That avoids a store.  Alternately, have the macro
303   return the value as if it was a function?
304 
305   Structure Definitions:
306   The structure definitions in this document define `only` that part of
307   a structure that may be used by code that is making use of the ADI.
308   Thus, some structures, such as 'MPID_Comm', have many defined fields;
309   these are used to support MPI routines such as 'MPI_Comm_size' and
310   'MPI_Comm_remote_group'.  Other structures may have few or no defined
311   members; these structures have no fields used outside of the ADI.
312   In C++ terms,  all members of these structures are 'private'.
313 
314   For the initial implementation, we expect that the structure definitions
315   will be designed for the multimethod device.  However, all items that are
316   specific to a particular device (including the multi-method device)
317   will be placed at the end of the structure;
318   the document will clearly identify the members that all implementations
319   will provide.  This simplifies much of the code in both the ADI and the
  implementation of the MPI routines because structure members can be directly
321   accessed rather than using some macro or C++ style method interface.
322 
323  T*/
324 
325 /* mpi_lang.h - Prototypes for language specific routines. Currently used to
326  * set keyval attribute callbacks
327  */
328 #include "mpi_lang.h"
329 /* Known language bindings */
330 /*E
331   MPID_Lang_t - Known language bindings for MPI
332 
333   A few operations in MPI need to know what language they were called from
334   or created by.  This type enumerates the possible languages so that
  the MPI implementation can choose the correct behavior.  Examples of this
  are the keyval attribute copy and delete functions.
337 
338   Module:
339   Attribute-DS
340   E*/
/* MPID_LANG_C is always present; the Fortran and C++ entries exist only
   when the matching binding is configured.  The leading commas keep the
   enumerator list well formed for every combination. */
enum MPID_Lang_t {
    MPID_LANG_C
#if defined(HAVE_FORTRAN_BINDING)
    , MPID_LANG_FORTRAN
    , MPID_LANG_FORTRAN90
#endif
#if defined(HAVE_CXX_BINDING)
    , MPID_LANG_CXX
#endif
};
typedef enum MPID_Lang_t MPID_Lang_t;
350 
351 /* Macros for the MPI handles (e.g., the object that encodes an
352    MPI_Datatype) */
353 #include "mpihandlemem.h"
354 
/* This routine is used to install an attribute free routine for datatypes
   at finalize-time.  It takes no arguments and returns nothing. */
void MPIR_DatatypeAttrFinalize( void );
358 
359 /* ------------------------------------------------------------------------- */
360 /* Should the following be moved into mpihandlemem.h ?*/
361 /* ------------------------------------------------------------------------- */
362 
363 /* Routines to initialize handle allocations */
364 /* These are now internal to the handlemem package
365 void *MPIU_Handle_direct_init( void *, int, int, int );
366 void *MPIU_Handle_indirect_init( void *(**)[], int *, int, int, int, int );
367 int MPIU_Handle_free( void *((*)[]), int );
368 */
/* Convert Handles to objects for MPI types that have predefined objects */
/* MPID_Getb_ptr(kind,a,bmsk,ptr)
     kind - object kind name; token pasting selects MPID_<kind>_builtin,
            MPID_<kind>_direct and MPID_<kind>_mem
     a    - the handle to convert
     bmsk - mask applied to the handle to index the builtin array
     ptr  - lvalue that receives the object pointer; set to 0 when the
            handle kind is invalid or not recognized
   The expansion is a bare { } block, and the handle expression appears both
   in the switch and in the selected case, so pass a handle expression
   without side effects. */
/* TODO examine generated assembly for this construct, it's probably suboptimal
 * on Blue Gene.  An if/else if/else might help the compiler out.  It also lets
 * us hint that one case is likely(), usually the BUILTIN case. */
#define MPID_Getb_ptr(kind,a,bmsk,ptr)                                  \
{                                                                       \
   switch (HANDLE_GET_KIND(a)) {                                        \
      case HANDLE_KIND_BUILTIN:                                         \
          ptr=MPID_##kind##_builtin+((a)&(bmsk));                       \
          break;                                                        \
      case HANDLE_KIND_DIRECT:                                          \
          ptr=MPID_##kind##_direct+HANDLE_INDEX(a);                     \
          break;                                                        \
      case HANDLE_KIND_INDIRECT:                                        \
          ptr=((MPID_##kind*)                                           \
               MPIU_Handle_get_ptr_indirect(a,&MPID_##kind##_mem));     \
          break;                                                        \
      case HANDLE_KIND_INVALID:                                         \
      default:								\
          ptr=0;							\
          break;							\
    }                                                                   \
}
392 
/* Convert handles to objects for MPI types that do _not_ have any predefined
   objects.
     kind - object kind name; token pasting selects MPID_<kind>_direct and
            MPID_<kind>_mem
     a    - the handle to convert
     ptr  - lvalue that receives the object pointer; set to 0 for builtin,
            invalid and unrecognized handle kinds
   Direct handles index the MPID_<kind>_direct array with HANDLE_INDEX(a);
   indirect handles are looked up with MPIU_Handle_get_ptr_indirect. */
#define MPID_Get_ptr(kind,a,ptr)					\
{									\
   switch (HANDLE_GET_KIND(a)) {					\
      case HANDLE_KIND_DIRECT:						\
          ptr=MPID_##kind##_direct+HANDLE_INDEX(a);			\
          break;							\
      case HANDLE_KIND_INDIRECT:					\
          ptr=((MPID_##kind*)						\
               MPIU_Handle_get_ptr_indirect(a,&MPID_##kind##_mem));	\
          break;							\
      case HANDLE_KIND_INVALID:						\
      case HANDLE_KIND_BUILTIN:						\
      default:								\
          ptr=0;							\
          break;							\
     }									\
}
412 
/* FIXME: the masks should be defined with the handle definitions instead
   of inserted here as literals */
/* Per-type handle-to-pointer converters.  Comm, Group, Errhandler and Op go
   through MPID_Getb_ptr with the literal builtin-index mask shown on each
   line; File, Info, Win, Request and Grequest_class go through MPID_Get_ptr,
   which yields 0 for builtin handles.
     a   - the handle
     ptr - lvalue that receives the object pointer (0 if not convertible) */
#define MPID_Comm_get_ptr(a,ptr)       MPID_Getb_ptr(Comm,a,0x03ffffff,ptr)
#define MPID_Group_get_ptr(a,ptr)      MPID_Getb_ptr(Group,a,0x03ffffff,ptr)
#define MPID_File_get_ptr(a,ptr)       MPID_Get_ptr(File,a,ptr)
#define MPID_Errhandler_get_ptr(a,ptr) MPID_Getb_ptr(Errhandler,a,0x3,ptr)
#define MPID_Op_get_ptr(a,ptr)         MPID_Getb_ptr(Op,a,0x000000ff,ptr)
#define MPID_Info_get_ptr(a,ptr)       MPID_Get_ptr(Info,a,ptr)
#define MPID_Win_get_ptr(a,ptr)        MPID_Get_ptr(Win,a,ptr)
#define MPID_Request_get_ptr(a,ptr)    MPID_Get_ptr(Request,a,ptr)
#define MPID_Grequest_class_get_ptr(a,ptr) MPID_Get_ptr(Grequest_class,a,ptr)
/* Keyvals have a special format. This is roughly MPID_Getb_ptr, but
   the handle index is in a smaller bit field.  In addition,
   there is no storage for the builtin keyvals.
   For the indirect case, we mask off the part of the keyval that is
   in the bits normally used for the indirect block index.

     a   - the keyval handle
     ptr - lvalue that receives the MPID_Keyval pointer; set to 0 for
           builtin, invalid and unrecognized handle kinds

   Direct handles index MPID_Keyval_direct with the low 22 bits of the
   handle (mask 0x3fffff).  Indirect handles have bits 22-25 cleared (mask
   0xfc3fffff) before the lookup with MPIU_Handle_get_ptr_indirect.
*/
#define MPID_Keyval_get_ptr(a,ptr)     \
{                                                                       \
   switch (HANDLE_GET_KIND(a)) {                                        \
      case HANDLE_KIND_BUILTIN:                                         \
          ptr=0;                                                        \
          break;                                                        \
      case HANDLE_KIND_DIRECT:                                          \
          ptr=MPID_Keyval_direct+((a)&0x3fffff);                        \
          break;                                                        \
      case HANDLE_KIND_INDIRECT:                                        \
          ptr=((MPID_Keyval*)                                           \
             MPIU_Handle_get_ptr_indirect((a)&0xfc3fffff,&MPID_Keyval_mem)); \
          break;                                                        \
      case HANDLE_KIND_INVALID:                                         \
      default:								\
          ptr=0;							\
          break;							\
    }                                                                   \
}
449 
/* Valid pointer checks */
/* This test is lame.  Should eventually include cookie test
   and in-range addresses */
/* MPID_Valid_ptr(kind,ptr,err): when ptr is null, err is assigned a new
   error code created by MPIR_Err_create_code with class MPI_ERR_OTHER and
   the "**nullptrtype" message, with the kind name as the message argument.
   When ptr is non-null, err is left untouched.
   MPID_Valid_ptr_class does the same but uses the caller's errclass.
   Both use FCNAME and __LINE__, so FCNAME must be defined at the point of
   use. */
#define MPID_Valid_ptr(kind,ptr,err) \
  {if (!(ptr)) { err = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, \
                                             "**nullptrtype", "**nullptrtype %s", #kind ); } }
#define MPID_Valid_ptr_class(kind,ptr,errclass,err) \
  {if (!(ptr)) { err = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, errclass, \
                                             "**nullptrtype", "**nullptrtype %s", #kind ); } }
459 
/* Per-type null-pointer checks.  Each one forwards to MPID_Valid_ptr_class
   with the error class that belongs to the object type; err is assigned
   only when ptr is null. */
#define MPID_Info_valid_ptr(ptr,err) MPID_Valid_ptr_class(Info,ptr,MPI_ERR_INFO,err)
/* Check not only for a null pointer but for an invalid communicator,
   such as one that has been freed.  Let's try the ref_count as the test
   for now */
/* ticket #1441: check (refcount<=0) to cover the case of 0, an "over-free" of
 * -1 or similar, and the 0xecec... case when --enable-g=mem is used */
/* In the refcount<=0 case err is set to an MPI_ERR_COMM error through
   MPIU_ERR_SET and ptr itself is overwritten with 0, so ptr must be a
   modifiable lvalue. */
#define MPID_Comm_valid_ptr(ptr,err) {                \
     MPID_Valid_ptr_class(Comm,ptr,MPI_ERR_COMM,err); \
     if ((ptr) && MPIU_Object_get_ref(ptr) <= 0) {    \
         MPIU_ERR_SET(err,MPI_ERR_COMM,"**comm");     \
         ptr = 0;                                     \
     }                                                \
}
#define MPID_Group_valid_ptr(ptr,err) MPID_Valid_ptr_class(Group,ptr,MPI_ERR_GROUP,err)
#define MPID_Win_valid_ptr(ptr,err) MPID_Valid_ptr_class(Win,ptr,MPI_ERR_WIN,err)
#define MPID_Op_valid_ptr(ptr,err) MPID_Valid_ptr_class(Op,ptr,MPI_ERR_OP,err)
#define MPID_Errhandler_valid_ptr(ptr,err) MPID_Valid_ptr_class(Errhandler,ptr,MPI_ERR_ARG,err)
#define MPID_File_valid_ptr(ptr,err) MPID_Valid_ptr_class(File,ptr,MPI_ERR_FILE,err)
#define MPID_Request_valid_ptr(ptr,err) MPID_Valid_ptr_class(Request,ptr,MPI_ERR_REQUEST,err)
#define MPID_Keyval_valid_ptr(ptr,err) MPID_Valid_ptr_class(Keyval,ptr,MPI_ERR_KEYVAL,err)
480 
/* FIXME:
   Generic pointer check.  It may be applied to any address, not only to the
   address of an MPI object.
   Only the null test is implemented: every non-null pointer is accepted and
   the alignment argument is ignored.
   With a little work the check could also cover alignment, along the
   lines of
   ((p) == 0 || ((size_t)(p) & MPID_Alignbits[alignment]) != 0)
   where MPID_Alignbits holds, for each alignment kind, a mask of the bits
   that must be zero in a properly aligned quantity.  On systems with no
   alignment rules all of these masks are zero, and that part of the test
   can be dropped.
 */
#define MPID_Pointer_is_invalid(p,alignment) (!(p))
/* FIXME: The following MPID_ALIGNED_xxx values are temporary.  They
   need to be computed by configure and included in the mpichconf.h file.
   Note that they cannot be set conservatively (i.e., as sizeof(object)),
   since the runtime system may generate objects with lesser alignment
   rules if the processor allows them.
   All three are currently 1.
 */
#define MPID_ALIGNED_PTR_INT   1
#define MPID_ALIGNED_PTR_LONG  1
#define MPID_ALIGNED_PTR_VOIDP 1
502 /* ------------------------------------------------------------------------- */
/* end of the code that should perhaps be moved into mpihandlemem.h */
504 /* ------------------------------------------------------------------------- */
505 
506 /* ------------------------------------------------------------------------- */
507 /* Info */
508 /*TInfoOverview.tex
509 
510   'MPI_Info' provides a way to create a list of '(key,value)' pairs
511   where the 'key' and 'value' are both strings.  Because many routines, both
512   in the MPI implementation and in related APIs such as the PMI process
513   management interface, require 'MPI_Info' arguments, we define a simple
514   structure for each 'MPI_Info' element.  Elements are allocated by the
515   generic object allocator; the head element is always empty (no 'key'
516   or 'value' is defined on the head element).
517 
518   For simplicity, we have not abstracted the info data structures;
519   routines that want to work with the linked list may do so directly.
520   Because the 'MPI_Info' type is a handle and not a pointer, an MPIU
521   (utility) routine is provided to handle the
522   deallocation of 'MPID_Info' elements.  See the implementation of
523   'MPI_Info_create' for how an Info type is allocated.
524 
525   Thread Safety:
526 
527   The info interface itself is not thread-robust.  In particular, the routines
528   'MPI_INFO_GET_NKEYS' and 'MPI_INFO_GET_NTHKEY' assume that no other
529   thread modifies the info key.  (If the info routines had the concept
530   of a next value, they would not be thread safe.  As it stands, a user
531   must be careful if several threads have access to the same info object.)
532   Further, 'MPI_INFO_DUP', while not
533   explicitly advising implementers to be careful of one thread modifying the
534   'MPI_Info' structure while 'MPI_INFO_DUP' is copying it, requires that the
535   operation take place in a thread-safe manner.
  There isn''t much that we can do about these cases.  There are other cases
537   that must be handled.  In particular, multiple threads are allowed to
538   update the same info value.  Thus, all of the update routines must be thread
539   safe; the simple implementation used in the MPICH implementation uses locks.
  Note that the 'MPI_Info_delete' call does not need a lock; the definition of
  thread-safety means that any order of the calls functions correctly; since
  it is invalid either to delete the same 'MPI_Info' twice or to modify an
543   'MPI_Info' that has been deleted, only one thread at a time can call
544   'MPI_Info_free' on any particular 'MPI_Info' value.
545 
546   T*/
547 /*S
548   MPID_Info - Structure of an MPID info
549 
550   Notes:
551   There is no reference count because 'MPI_Info' values, unlike other MPI
552   objects, may be changed after they are passed to a routine without
553   changing the routine''s behavior.  In other words, any routine that uses
554   an 'MPI_Info' object must make a copy or otherwise act on any info value
555   that it needs.
556 
557   A linked list is used because the typical 'MPI_Info' list will be short
558   and a simple linked list is easy to implement and to maintain.  Similarly,
559   a single structure rather than separate header and element structures are
560   defined for simplicity.  No separate thread lock is provided because
561   info routines are not performance critical; they may use the single
562   critical section lock in the 'MPIR_Process' structure when they need a
563   thread lock.
564 
565   This particular form of linked list (in particular, with this particular
566   choice of the first two members) is used because it allows us to use
567   the same routines to manage this list as are used to manage the
568   list of free objects (in the file 'src/util/mem/handlemem.c').  In
569   particular, if lock-free routines for updating a linked list are
570   provided, they can be used for managing the 'MPID_Info' structure as well.
571 
  The MPI standard requires that keys can be no less than 32 characters and
573   no more than 255 characters.  There is no mandated limit on the size
574   of values.
575 
576   Module:
577   Info-DS
578   S*/
/* One element of an info list.  Per the overview above, the head element
   carries no key or value; the pairs live in the elements reached through
   'next'. */
typedef struct MPID_Info {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    struct MPID_Info   *next;   /* next element of the linked list */
    char               *key;    /* key string of this (key,value) pair */
    char               *value;  /* value string of this (key,value) pair */
} MPID_Info;
/* Allocation object for MPID_Info; referenced as MPID_Info_mem by
   MPID_Info_get_ptr for indirect handles. */
extern MPIU_Object_alloc_t MPID_Info_mem;
/* Preallocated info objects */
extern MPID_Info MPID_Info_direct[];
588 /* ------------------------------------------------------------------------- */
589 
590 /* ------------------------------------------------------------------------- */
591 /* Error Handlers */
592 /*E
593   MPID_Errhandler_fn - MPID Structure to hold an error handler function
594 
595   Notes:
596   The MPI-1 Standard declared only the C version of this, implicitly
597   assuming that 'int' and 'MPI_Fint' were the same.
598 
599   Since Fortran does not have a C-style variable number of arguments
600   interface, the Fortran interface simply accepts two arguments.  Some
601   calling conventions for Fortran (particularly under Windows) require
602   this.
603 
604   Module:
605   ErrHand-DS
606 
607   Questions:
608   What do we want to do about C++?  Do we want a hook for a routine that can
609   be called to throw an exception in C++, particularly if we give C++ access
610   to this structure?  Does the C++ handler need to be different (not part
611   of the union)?
612 
613   E*/
/* Exactly one member is meaningful for a given error handler.  The C forms
   take a pointer to the object handle, a pointer to an int, and a variable
   argument list; the Fortran 77 form takes two MPI_Fint pointers. */
typedef union MPID_Errhandler_fn {
   void (*C_Comm_Handler_function) ( MPI_Comm *, int *, ... ); /* C handler for communicators */
   void (*F77_Handler_function) ( MPI_Fint *, MPI_Fint * );    /* Fortran handler, two arguments only */
   void (*C_Win_Handler_function) ( MPI_Win *, int *, ... );   /* C handler for windows */
   void (*C_File_Handler_function) ( MPI_File *, int *, ... ); /* C handler for files */
} MPID_Errhandler_fn;
620 
621 /*S
622   MPID_Errhandler - Description of the error handler structure
623 
624   Notes:
625   Device-specific information may indicate whether the error handler is active;
626   this can help prevent infinite recursion in error handlers caused by
627   user-error without requiring the user to be as careful.  We might want to
628   make this part of the interface so that the 'MPI_xxx_call_errhandler'
629   routines would check.
630 
631   It is useful to have a way to indicate that the errhandler is no longer
632   valid, to help catch the case where the user has freed the errhandler but
633   is still using a copy of the 'MPI_Errhandler' value.  We may want to
634   define the 'id' value for deleted errhandlers.
635 
636   Module:
637   ErrHand-DS
638   S*/
typedef struct MPID_Errhandler {
  MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
  MPID_Lang_t        language; /* presumably the language binding the handler
                                  was created from -- TODO confirm */
  MPID_Object_kind   kind;     /* presumably the kind of object (comm, win,
                                  file) the handler is for -- TODO confirm */
  MPID_Errhandler_fn errfn;    /* the handler function itself */
  /* Other, device-specific information */
#ifdef MPID_DEV_ERRHANDLER_DECL
    MPID_DEV_ERRHANDLER_DECL
#endif
} MPID_Errhandler;
/* Allocation object for MPID_Errhandler; referenced as MPID_Errhandler_mem
   by MPID_Errhandler_get_ptr for indirect handles. */
extern MPIU_Object_alloc_t MPID_Errhandler_mem;
/* Preallocated errhandler objects */
extern MPID_Errhandler MPID_Errhandler_builtin[];
extern MPID_Errhandler MPID_Errhandler_direct[];
653 
/* Reference counting for error handlers.  Builtin error handler objects are
 * never counted, whatever policy is chosen for the other predefined objects.
 * If no builtin object of any type is counted any more, these special cases
 * can probably go away and the BUILTIN checks inside the MPIU_Object_*
 * routines can take over.
 *
 * MPIR_Errhandler_add_ref( _errhand )
 *     Adds a reference unless the handle of _errhand is a builtin handle.
 * MPIR_Errhandler_release_ref( _errhand, _inuse )
 *     Drops a reference and reports through *_inuse as
 *     MPIU_Object_release_ref does.  For a builtin handle no reference is
 *     dropped and *_inuse is set to 1.
 */
#define MPIR_Errhandler_add_ref( _errhand )                               \
    do {                                                                  \
        if (HANDLE_GET_KIND((_errhand)->handle) == HANDLE_KIND_BUILTIN) { \
            break; /* builtin handlers are not counted */                 \
        }                                                                 \
        MPIU_Object_add_ref( _errhand );                                  \
    } while (0)
#define MPIR_Errhandler_release_ref( _errhand, _inuse )                   \
    do {                                                                  \
        if (HANDLE_GET_KIND((_errhand)->handle) == HANDLE_KIND_BUILTIN) { \
            *(_inuse) = 1; /* builtin handlers always stay in use */      \
        }                                                                 \
        else {                                                            \
            MPIU_Object_release_ref( (_errhand), (_inuse) );              \
        }                                                                 \
    } while (0)
674 /* ------------------------------------------------------------------------- */
675 
676 /* ------------------------------------------------------------------------- */
677 /* Keyvals and attributes */
678 /*TKyOverview.tex
679 
680   Keyvals are MPI objects that, unlike most MPI objects, are defined to be
681   integers rather than a handle (e.g., 'MPI_Comm').  However, they really
682   `are` MPI opaque objects and are handled by the MPICH implementation in
683   the same way as all other MPI opaque objects.  The only difference is that
684   there is no 'typedef int MPI_Keyval;' in 'mpi.h'.  In particular, keyvals
685   are encoded (for direct and indirect references) in the same way that
686   other MPI opaque objects are
687 
688   Each keyval has a copy and a delete function associated with it.
689   Unfortunately, these have a slightly different calling sequence for
690   each language, particularly when the size of a pointer is
691   different from the size of a Fortran integer.  The unions
692   'MPID_Copy_function' and 'MPID_Delete_function' capture the differences
693   in a single union type.
694 
695   The above comment is out of date but has never been updated as it should
696   have to match the introduction of a different interface.  Beware!
697 
698   Notes:
699 
  In the original design, retrieving an attribute from a language other
  than the one that set it was thought to be an error.  The MPI Forum
702   decided that this should be allowed, and after much discussion, the
703   behavior was defined.  Thus, we need to record what sort of
704   attribute was provided, and be able to properly return the correct
705   value in each case.  See MPI 2.2, Section 16.3.7 (Attributes) for
706   specific requirements.  One consequence of this is that the value
  that is returned may have a different length than how it was set.
708   On little-endian platforms (e.g., x86), this doesn't cause much of a
709   problem, because the address is that of the least significant byte,
710   and the lower bytes have the data that is needed in the case that
711   the desired attribute type is shorter than the stored attribute.
712   However, on a big-endian platform (e.g., IBM POWER), since the most
713   significant bytes are stored first, depending on the length of the
714   result type, the address of the result may not be the beginning of
715   the memory area.  For example, assume that an MPI_Fint is 4 bytes
716   and a void * (and a Fortran INTEGER of kind MPI_ADDRESS_KIND) is 8
717   bytes, and let the attribute store the value in an 8 byte integer in
718   a field named "value".  On a little-endian platform, the address of
719   the value is always the beginning of the field "value".  On a
720   big-endian platform, the address of the value is the beginning of
721   the field if the return type is a pointer (e.g., from C) or Fortran
722   (KIND=MPI_ADDRESS_KIND), and the address of the beginning of the
723   field + 4 if the return type is a Fortran 77 integer (and, as
724   specified above, an MPI_Fint is 4 bytes shorter than a void *).
725 
726   For the big-endian case, it is possible to manage these shifts (using
727   WORDS_LITTLEENDIAN to detect the big-endian case).  Alternatively,
728   at a small cost in space, copies in variables of the correct length
729   can be maintained.  At this writing, the code in src/mpi/attr makes
730   use of WORDS_LITTLEENDIAN to provide the appropriate code for the most
731   common cases.
732 
733   T*/
734 /*TAttrOverview.tex
735  *
736  * The MPI standard allows `attributes`, essentially an '(integer,pointer)'
737  * pair, to be attached to communicators, windows, and datatypes.
738  * The integer is a `keyval`, which is allocated by a call (at the MPI level)
739  * to 'MPI_Comm/Type/Win_create_keyval'.  The pointer is the value of
740  * the attribute.
741  * Attributes are primarily intended for use by the user, for example, to save
742  * information on a communicator, but can also be used to pass data to the
743  * MPI implementation.  For example, an attribute may be used to pass
744  * Quality of Service information to an implementation to be used with
745  * communication on a particular communicator.
746  * To provide the most general access by the ADI to all attributes, the
747  * ADI defines a collection of routines that are used by the implementation
748  * of the MPI attribute routines (such as 'MPI_Comm_get_attr').
749  * In addition, the MPI routines involving attributes will invoke the
750  * corresponding 'hook' functions (e.g., 'MPID_Dev_comm_attr_set_hook')
751  * should the device define them.
752  *
753  * Attributes on windows and datatypes are defined by MPI but not of
754  * interest (as yet) to the device.
755  *
756  * In addition, there are seven predefined attributes that the device must
757  * supply to the implementation.  This is accomplished through
758  * data values that are part of the 'MPIR_Process' data block.
759  *  The predefined keyvals on 'MPI_COMM_WORLD' are\:
760  *.vb
761  * Keyval                     Related Module
762  * MPI_APPNUM                 Dynamic
763  * MPI_HOST                   Core
764  * MPI_IO                     Core
765  * MPI_LASTUSEDCODE           Error
766  * MPI_TAG_UB                 Communication
767  * MPI_UNIVERSE_SIZE          Dynamic
768  * MPI_WTIME_IS_GLOBAL        Timer
769  *.ve
770  * The values stored in the 'MPIR_Process' block are the actual values.  For
771  * example, the value of 'MPI_TAG_UB' is the integer value of the largest tag.
772  * The
773  * value of 'MPI_WTIME_IS_GLOBAL' is a '1' for true and '0' for false.  Likely
774  * values for 'MPI_IO' and 'MPI_HOST' are 'MPI_ANY_SOURCE' and 'MPI_PROC_NULL'
775  * respectively.
776  *
777  T*/
778 
779 /* Include the attribute access routines that permit access to the
780    attribute or its pointer, needed for cross-language access to attributes */
781 #include "mpi_attr.h"
782 
/* Because Comm, Datatype, and Win handles are all ints, and because
784    attributes are otherwise identical between the three types, we
785    only store generic copy and delete functions.  This allows us to use
786    common code for the attribute set, delete, and dup functions */
787 /*E
788   MPID_Copy_function - MPID Structure to hold an attribute copy function
789 
790   Notes:
791   The appropriate element of this union is selected by using the language
792   field of the 'keyval'.
793 
794   Because 'MPI_Comm', 'MPI_Win', and 'MPI_Datatype' are all 'int's in
795   MPICH2, we use a single C copy function rather than have separate
796   ones for the Communicator, Window, and Datatype attributes.
797 
798   There are no corresponding typedefs for the Fortran functions.  The
799   F77 function corresponds to the Fortran 77 binding used in MPI-1 and the
800   F90 function corresponds to the Fortran 90 binding used in MPI-2.
801 
802   Module:
803   Attribute-DS
804 
805   E*/
/* C-language proxy for attribute copy callbacks (declaration only; the
 * definition is not part of this header section).  It is handed the user's
 * MPI_Comm_copy_attr_function along with the object handle, the keyval,
 * the extra_state pointer, the attribute's MPIR_AttrType and value, and two
 * output pointers (attrib_copy and flag).
 * NOTE(review): presumably it invokes user_function with these arguments
 * and returns its MPI error code -- confirm against the definition. */
int
MPIR_Attr_copy_c_proxy(
    MPI_Comm_copy_attr_function* user_function,
    int handle,
    int keyval,
    void* extra_state,
    MPIR_AttrType attrib_type,
    void* attrib,
    void** attrib_copy,
    int* flag
    );
817 
/* Holder for the copy callback of a keyval.  This is declared as a struct
 * (not a union), so every member below has its own storage. */
typedef struct MPID_Copy_function {
  /* Copy callback with the C calling sequence */
  int  (*C_CopyFunction)( int, int, void *, void *, void *, int * );
  /* Copy callback whose arguments are all passed as MPI_Fint pointers */
  void (*F77_CopyFunction)  ( MPI_Fint *, MPI_Fint *, MPI_Fint *, MPI_Fint *,
                              MPI_Fint *, MPI_Fint *, MPI_Fint * );
  /* Copy callback that takes MPI_Aint pointers for arguments 3 through 5 */
  void (*F90_CopyFunction)  ( MPI_Fint *, MPI_Fint *, MPI_Aint *, MPI_Aint *,
                              MPI_Aint *, MPI_Fint *, MPI_Fint * );
  /* The generic language-independent user_function and proxy will
   * replace the language-dependent copy functions above.
   * Currently the language-independent functions are used only for keyvals.
   */
  MPI_Comm_copy_attr_function *user_function;
  MPID_Attr_copy_proxy *proxy;
  /* The C++ function is the same as the C function */
} MPID_Copy_function;
832 
833 /*E
834   MPID_Delete_function - MPID Structure to hold an attribute delete function
835 
836   Notes:
837   The appropriate element of this union is selected by using the language
838   field of the 'keyval'.
839 
840   Because 'MPI_Comm', 'MPI_Win', and 'MPI_Datatype' are all 'int's in
841   MPICH2, we use a single C delete function rather than have separate
842   ones for the Communicator, Window, and Datatype attributes.
843 
844   There are no corresponding typedefs for the Fortran functions.  The
845   F77 function corresponds to the Fortran 77 binding used in MPI-1 and the
846   F90 function corresponds to the Fortran 90 binding used in MPI-2.
847 
848   Module:
849   Attribute-DS
850 
851   E*/
/* C-language proxy for attribute delete callbacks (declaration only; the
 * definition is not part of this header section).  It is handed the user's
 * MPI_Comm_delete_attr_function along with the object handle, the keyval,
 * the attribute's MPIR_AttrType and value, and the extra_state pointer.
 * NOTE(review): presumably it invokes user_function with these arguments
 * and returns its MPI error code -- confirm against the definition. */
int
MPIR_Attr_delete_c_proxy(
    MPI_Comm_delete_attr_function* user_function,
    int handle,
    int keyval,
    MPIR_AttrType attrib_type,
    void* attrib,
    void* extra_state
    );
861 
/* Holder for the delete callback of a keyval.  This is declared as a struct
 * (not a union), so every member below has its own storage. */
typedef struct MPID_Delete_function {
  /* Delete callback with the C calling sequence */
  int  (*C_DeleteFunction)  ( int, int, void *, void * );
  /* Delete callback whose arguments are all passed as MPI_Fint pointers */
  void (*F77_DeleteFunction)( MPI_Fint *, MPI_Fint *, MPI_Fint *, MPI_Fint *,
                              MPI_Fint * );
  /* Delete callback that takes MPI_Aint pointers for arguments 3 and 4 */
  void (*F90_DeleteFunction)( MPI_Fint *, MPI_Fint *, MPI_Aint *, MPI_Aint *,
                              MPI_Fint * );
  /* The generic language-independent user_function and proxy will
   * replace the language-dependent delete functions above.
   * Currently the language-independent functions are used only for keyvals.
   */
  MPI_Comm_delete_attr_function *user_function;
  MPID_Attr_delete_proxy *proxy;
} MPID_Delete_function;
875 
876 /*S
877   MPID_Keyval - Structure of an MPID keyval
878 
879   Module:
880   Attribute-DS
881 
882   S*/
/* A keyval object: the common object header plus the copy and delete
 * callbacks and the extra_state pointer associated with the keyval. */
typedef struct MPID_Keyval {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    MPID_Object_kind     kind;        /* NOTE(review): presumably the kind of
                                         MPI object this keyval was created
                                         for -- confirm against the keyval
                                         creation code */
    int                  was_freed;   /* NOTE(review): looks like a flag that
                                         records that the keyval was freed
                                         while still referenced -- confirm */
    void                 *extra_state; /* extra_state pointer stored with the
                                          keyval; the proxy prototypes in this
                                          header take a matching argument */
    MPID_Copy_function   copyfn;      /* copy callback holder */
    MPID_Delete_function delfn;       /* delete callback holder */
  /* other, device-specific information */
#ifdef MPID_DEV_KEYVAL_DECL
    MPID_DEV_KEYVAL_DECL
#endif
} MPID_Keyval;
895 
/* Add one reference to a keyval; a thin wrapper that forwards to the
 * generic MPIU_Object_add_ref. */
#define MPIR_Keyval_add_ref( _keyval ) \
    do { MPIU_Object_add_ref( _keyval ); } while(0)
900 
/* Drop one reference to a keyval; a thin wrapper that forwards both
 * arguments to the generic MPIU_Object_release_ref. */
#define MPIR_Keyval_release_ref( _keyval, _inuse ) \
    do { MPIU_Object_release_ref( _keyval, _inuse ); } while(0)
905 
906 
/* Attribute values in C/C++ are void * and in Fortran are ADDRESS_SIZED
   integers.  Normally, these are the same size, but in at least one
   case, the address-sized integer was selected as longer than void *
   to work with the datatype code used in the I/O library.  This
   is really a limitation in the current Datatype implementation.

   When USE_AINT_FOR_ATTRVAL is defined the stored attribute value is an
   MPI_Aint; otherwise it is a plain void *. */
#ifdef USE_AINT_FOR_ATTRVAL
typedef MPI_Aint MPID_AttrVal_t;
#else
typedef void * MPID_AttrVal_t;
#endif
917 
918 /* Attributes need no ref count or handle, but since we want to use the
919    common block allocator for them, we must provide those elements
920 */
921 /*S
922   MPID_Attribute - Structure of an MPID attribute
923 
924   Notes:
925   Attributes don''t have 'ref_count's because they don''t have reference
926   count semantics.  That is, there are no shallow copies or duplicates
  of an attribute.  An attribute is copied when the communicator that
928   it is attached to is duplicated.  Subsequent operations, such as
929   'MPI_Comm_attr_free', can change the attribute list for one of the
930   communicators but not the other, making it impractical to keep the
931   same list.  (We could defer making the copy until the list is changed,
932   but even then, there would be no reference count on the individual
933   attributes.)
934 
935   A pointer to the keyval, rather than the (integer) keyval itself is
936   used since there is no need within the attribute structure to make
937   it any harder to find the keyval structure.
938 
939   The attribute value is a 'void *'.  If 'sizeof(MPI_Fint)' > 'sizeof(void*)',
940   then this must be changed (no such system has been encountered yet).
941   For the Fortran 77 routines in the case where 'sizeof(MPI_Fint)' <
942   'sizeof(void*)', the high end of the 'void *' value is used.  That is,
943   we cast it to 'MPI_Fint *' and use that value.
944 
945   MPI defines three kinds of attributes (see MPI 2.1, Section 16.3, pages
946   487-488 (the standard says two, but there are really three, as discussed
947   below).  These are pointer-valued attributes and two types of integer-valued
948   attributes.
949   Pointer-valued attributes are used in C.
950   Integer-valued attributes are used in Fortran.  These are of type either
951   INTEGER or INTEGER(KIND=MPI_ADDRESS_KIND).
952 
953   The predefined attributes are a combination of INTEGER and pointers.
954 
955   Module:
956   Attribute-DS
957 
958  S*/
/* One node of a singly linked attribute list (linked through 'next').  The
 * stored value sits between two sentinel words. */
typedef struct MPID_Attribute {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    MPID_Keyval  *keyval;           /* Keyval structure for this attribute */

    struct MPID_Attribute *next;    /* Pointer to next in the list */
    MPIR_AttrType attrType;         /* Type of the attribute */
    long        pre_sentinal;       /* Used to detect user errors in accessing
                                       the value */
    MPID_AttrVal_t value;           /* Stored value. An Aint must be at least
                                       as large as an address - some builds
                                       may make an Aint larger than a void * */
    long        post_sentinal;      /* Like pre_sentinal */
    /* other, device-specific information */
#ifdef MPID_DEV_ATTR_DECL
    MPID_DEV_ATTR_DECL
#endif
} MPID_Attribute;
976 /* ------------------------------------------------------------------------- */
977 
978 /*---------------------------------------------------------------------------
979  * Groups are *not* a major data structure in MPICH-2.  They are provided
980  * only because they are required for the group operations (e.g.,
981  * MPI_Group_intersection) and for the scalable RMA synchronization
982  *---------------------------------------------------------------------------*/
/* This structure is used to implement the group operations such as
   MPI_Group_translate_ranks.  MPID_Group holds an array of these entries
   (its lrank_to_lpid member); next_lpid stores an index rather than a
   pointer. */
typedef struct MPID_Group_pmap_t {
    int          lrank;     /* Local rank in group (between 0 and size-1) */
    int          lpid;      /* local process id, from VCONN */
    int          next_lpid; /* Index of next lpid (in lpid order) */
    int          flag;      /* marker, used to implement group operations */
} MPID_Group_pmap_t;
991 
992 /* Any changes in the MPID_Group structure must be made to the
993    predefined value in MPID_Group_builtin for MPI_GROUP_EMPTY in
994    src/mpi/group/grouputil.c */
995 /*S
996  MPID_Group - Description of the Group data structure
997 
998  The processes in the group of 'MPI_COMM_WORLD' have lpid values 0 to 'size'-1,
999  where 'size' is the size of 'MPI_COMM_WORLD'.  Processes created by
1000  'MPI_Comm_spawn' or 'MPI_Comm_spawn_multiple' or added by 'MPI_Comm_attach'
1001  or
1002  'MPI_Comm_connect'
1003  are numbered greater than 'size - 1' (on the calling process). See the
1004  discussion of LocalPID values.
1005 
1006  Note that when dynamic process creation is used, the pids are `not` unique
1007  across the universe of connected MPI processes.  This is ok, as long as
1008  pids are interpreted `only` on the process that owns them.
1009 
1010  Only for MPI-1 are the lpid''s equal to the `global` pids.  The local pids
1011  can be thought of as a reference not to the remote process itself, but
1012  how the remote process can be reached from this process.  We may want to
1013  have a structure 'MPID_Lpid_t' that contains information on the remote
1014  process, such as (for TCP) the hostname, ip address (it may be different if
1015  multiple interfaces are supported; we may even want plural ip addresses for
 striping communication), and port (or ports).  For shared memory connected
1017  processes, it might have the address of a remote queue.  The lpid number
1018  is an index into a table of 'MPID_Lpid_t'''s that contain this (device- and
1019  method-specific) information.
1020 
1021  Module:
1022  Group-DS
1023 
1024  S*/
/* Member order matters: the predefined MPI_GROUP_EMPTY value in
 * src/mpi/group/grouputil.c must be kept in step with this layout. */
typedef struct MPID_Group {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    int          size;           /* Size of a group */
    int          rank;           /* rank of this process relative to this
                                    group */
    int          idx_of_first_lpid; /* NOTE(review): presumably the index in
                                       lrank_to_lpid where the next_lpid
                                       chain starts -- confirm */
    MPID_Group_pmap_t *lrank_to_lpid; /* Array mapping a local rank to local
                                         process number */
    int          is_local_dense_monotonic; /* see NOTE-G1 */

    /* We may want some additional data for the RMA synchronization calls */
  /* Other, device-specific information */
#ifdef MPID_DEV_GROUP_DECL
    MPID_DEV_GROUP_DECL
#endif
} MPID_Group;
1041 
1042 /* NOTE-G1: is_local_dense_monotonic will be true iff the group meets the
1043  * following criteria:
1044  * 1) the lpids are all in the range [0,size-1], i.e. a subset of comm world
1045  * 2) the pids are sequentially numbered in increasing order, without any gaps,
1046  *    stride, or repetitions
1047  *
1048  * This additional information allows us to handle the common case (insofar as
1049  * group ops are common) for MPI_Group_translate_ranks where group2 is
1050  * group_of(MPI_COMM_WORLD), or some simple subset.  This is an important use
1051  * case for many MPI tool libraries, such as Scalasca.
1052  */
1053 
/* Allocator descriptor for group objects (defined outside this header) */
extern MPIU_Object_alloc_t MPID_Group_mem;
/* Preallocated group objects.  There is exactly one builtin group; the
   direct array is declared without a size here. */
#define MPID_GROUP_N_BUILTIN 1
extern MPID_Group MPID_Group_builtin[MPID_GROUP_N_BUILTIN];
extern MPID_Group MPID_Group_direct[];

/* Object for empty group.  The pointer itself is const; the group it
   points to is not const-qualified. */
extern MPID_Group * const MPID_Group_empty;
1062 
/* Add one reference to a group; forwards to the generic
 * MPIU_Object_add_ref. */
#define MPIR_Group_add_ref( _group )                                    \
    do {                                                                \
        MPIU_Object_add_ref( _group );                                  \
    } while (0)
1065 
/* Drop one reference to a group; forwards both arguments to the generic
 * MPIU_Object_release_ref. */
#define MPIR_Group_release_ref( _group, _inuse )                        \
    do {                                                                \
        MPIU_Object_release_ref( _group, _inuse );                      \
    } while (0)
1068 
/* Group helper routines; declarations only, the definitions are outside
 * this header section. */
/* NOTE(review): presumably builds the lpid-ordered chain stored in
 * MPID_Group_pmap_t.next_lpid -- confirm against the definition. */
void MPIR_Group_setup_lpid_list( MPID_Group * );
/* NOTE(review): judging by the name, checks the group against the vsize
 * entries of vcr and reports a position through idx -- confirm against the
 * definition before relying on this. */
int MPIR_GroupCheckVCRSubset( MPID_Group *group_ptr, int vsize, MPID_VCR *vcr, int *idx );
1071 
1072 /* ------------------------------------------------------------------------- */
1073 
1074 /*E
1075   MPID_Comm_kind_t - Name the two types of communicators
1076   E*/
typedef enum MPID_Comm_kind_t {
    /* first of the two communicator kinds; numeric value 0 */
    MPID_INTRACOMM = 0,
    /* second of the two communicator kinds; numeric value 1 */
    MPID_INTERCOMM = 1
} MPID_Comm_kind_t;
1080 
1081 /* ideally we could add these to MPID_Comm_kind_t, but there's too much existing
1082  * code that assumes that the only valid values are INTRACOMM or INTERCOMM */
typedef enum MPID_Comm_hierarchy_kind_t {
    /* no hierarchy */
    MPID_HIERARCHY_FLAT = 0,
    /* has subcommunicators */
    MPID_HIERARCHY_PARENT = 1,
    /* is the subcomm for node roots */
    MPID_HIERARCHY_NODE_ROOTS = 2,
    /* is the subcomm for a node */
    MPID_HIERARCHY_NODE = 3,
    /* cardinality of this enum; must remain the last enumerator */
    MPID_HIERARCHY_SIZE
} MPID_Comm_hierarchy_kind_t;
1090 /* Communicators */
1091 
1092 /*S
1093   MPID_Comm - Description of the Communicator data structure
1094 
1095   Notes:
1096   Note that the size and rank duplicate data in the groups that
1097   make up this communicator.  These are used often enough that this
1098   optimization is valuable.
1099 
1100   This definition provides only a 16-bit integer for context id''s .
1101   This should be sufficient for most applications.  However, extending
1102   this to a 32-bit (or longer) integer should be easy.
1103 
1104   There are two context ids.  One is used for sending and one for
1105   receiving.  In the case of an Intracommunicator, they are the same
1106   context id.  They differ in the case of intercommunicators, where
1107   they may come from processes in different comm worlds (in the
1108   case of MPI-2 dynamic process intercomms).
1109 
1110   The virtual connection table is an explicit member of this structure.
1111   This contains the information used to contact a particular process,
1112   indexed by the rank relative to this communicator.
1113 
1114   Groups are allocated lazily.  That is, the group pointers may be
1115   null, created only when needed by a routine such as 'MPI_Comm_group'.
1116   The local process ids needed to form the group are available within
1117   the virtual connection table.
1118   For intercommunicators, we may want to always have the groups.  If not,
1119   we either need the 'local_group' or we need a virtual connection table
1120   corresponding to the 'local_group' (we may want this anyway to simplify
1121   the implementation of the intercommunicator collective routines).
1122 
1123   The pointer to the structure 'MPID_Collops' containing pointers to the
1124   collective
1125   routines allows an implementation to replace each routine on a
1126   routine-by-routine basis.  By default, this pointer is null, as are the
1127   pointers within the structure.  If either pointer is null, the implementation
1128   uses the generic provided implementation.  This choice, rather than
1129   initializing the table with pointers to all of the collective routines,
1130   is made to reduce the space used in the communicators and to eliminate the
1131   need to include the implementation of all collective routines in all MPI
1132   executables, even if the routines are not used.
1133 
1134   The macro 'MPID_HAS_HETERO' may be defined by a device to indicate that
1135   the device supports MPI programs that must communicate between processes with
1136   different data representations (e.g., different sized integers or different
1137   byte orderings).  If the device does need to define this value, it should
1138   be defined in the file 'mpidpre.h'.
1139 
1140   Please note that the local_size and remote_size fields can be confusing.  For
1141   intracommunicators both fields are always equal to the size of the
1142   communicator.  For intercommunicators local_size is equal to the size of
1143   local_group while remote_size is equal to the size of remote_group.
1144 
1145   Module:
1146   Communicator-DS
1147 
1148   Question:
1149   For fault tolerance, do we want to have a standard field for communicator
1150   health?  For example, ok, failure detected, all (live) members of failed
1151   communicator have acked.
1152   S*/
typedef struct MPID_Comm {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    MPIR_Context_id_t context_id; /* Send context id.  See notes */
    MPIR_Context_id_t recvcontext_id; /* Receive context id.  See notes */
    int           remote_size;   /* Value of MPI_Comm_(remote)_size */
    int           rank;          /* Value of MPI_Comm_rank */
    MPID_VCRT     vcrt;          /* virtual connection reference table */
    MPID_VCR *    vcr;           /* alias to the array of virtual connections
                                    in vcrt */
    MPID_VCRT     local_vcrt;    /* local virtual connection reference table */
    MPID_VCR *    local_vcr;     /* alias to the array of local virtual
                                    connections in local vcrt */
    MPID_Attribute *attributes;  /* List of attributes */
    int           local_size;    /* Value of MPI_Comm_size for local group */
    MPID_Group   *local_group,   /* Groups in communicator. */
                 *remote_group;  /* The local and remote groups are the
                                    same for intra communicators */
    MPID_Comm_kind_t comm_kind;  /* MPID_INTRACOMM or MPID_INTERCOMM */
    char          name[MPI_MAX_OBJECT_NAME];  /* Required for MPI-2 */
    MPID_Errhandler *errhandler; /* Pointer to the error handler structure */
    struct MPID_Comm    *local_comm; /* Defined only for intercomms, holds
                                        an intracomm for the local group */

    MPID_Comm_hierarchy_kind_t hierarchy_kind; /* flat, parent, node, or node_roots */
    struct MPID_Comm *node_comm; /* Comm of processes in this comm that are on
                                    the same node as this process. */
    struct MPID_Comm *node_roots_comm; /* Comm of root processes for other nodes. */
    int *intranode_table;        /* intranode_table[i] gives the rank in
                                    node_comm of rank i in this comm or -1 if i
                                    is not in this process' node_comm.
                                    It is of size 'local_size'. */
    int *internode_table;        /* internode_table[i] gives the rank in
                                    node_roots_comm of rank i in this comm.
                                    It is of size 'local_size'. */

    int           is_low_group;  /* For intercomms only, this boolean is
                                    set for all members of one of the
                                    two groups of processes and clear for
                                    the other.  It enables certain
                                    intercommunicator collective operations
                                    that wish to use half-duplex operations
                                    to implement a full-duplex operation */
    struct MPID_Comm     *comm_next;/* Provides a chain through all active
                                       communicators */
    struct MPID_Collops  *coll_fns; /* Pointer to a table of functions
                                              implementing the collective
                                              routines */
    struct MPID_TopoOps  *topo_fns; /* Pointer to a table of functions
                                       implementing the topology routines
                                    */
    int next_sched_tag;             /* used by the NBC schedule code to allocate tags */
#ifdef MPID_HAS_HETERO
    int is_hetero;                  /* present only when the device defines
                                       MPID_HAS_HETERO */
#endif
  /* Other, device-specific information */
#ifdef MPID_DEV_COMM_DECL
    MPID_DEV_COMM_DECL
#endif
} MPID_Comm;
/* Allocator descriptor for communicator objects (defined outside this
   header) */
extern MPIU_Object_alloc_t MPID_Comm_mem;

/* this function should not be called by normal code!  Within this header
   its only caller is the inline MPIR_Comm_release, which invokes it once
   the reference count reports the communicator as no longer in use. */
int MPIR_Comm_delete_internal(MPID_Comm * comm_ptr, int isDisconnect);
1216 
/* Add one reference to a communicator; forwards to the generic
 * MPIU_Object_add_ref. */
#define MPIR_Comm_add_ref(_comm)                                        \
    do {                                                                \
        MPIU_Object_add_ref((_comm));                                   \
    } while (0)
/* Drop one reference to a communicator; forwards both arguments to the
 * generic MPIU_Object_release_ref. */
#define MPIR_Comm_release_ref( _comm, _inuse )                          \
    do {                                                                \
        MPIU_Object_release_ref( _comm, _inuse );                       \
    } while (0)
1221 
1222 
1223 /* Release a reference to a communicator.  If there are no pending
1224    references, delete the communicator and recover all storage and
1225    context ids.
1226 
1227    This routine has been inlined because keeping it as a separate routine
1228    results in a >5% performance hit for the SQMR benchmark.
1229 */
1230 #undef FUNCNAME
1231 #define FUNCNAME MPIR_Comm_release
1232 #undef FCNAME
1233 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Drop one reference to comm_ptr and, if that was the last reference,
 * destroy the communicator through MPIR_Comm_delete_internal().
 *
 *   comm_ptr     - communicator whose reference is released
 *   isDisconnect - handed unchanged to MPIR_Comm_delete_internal()
 *
 * Returns MPI_SUCCESS while the communicator is still in use; otherwise
 * returns whatever MPIR_Comm_delete_internal() reports. */
static inline int MPIR_Comm_release(MPID_Comm * comm_ptr, int isDisconnect)
{
    int mpi_errno = MPI_SUCCESS;
    int in_use;

    /* in_use is filled in by the release macro */
    MPIR_Comm_release_ref(comm_ptr, &in_use);
    if (unlikely(!in_use)) {
        /* the following routine should only be called by this function and its
         * "_always" variant. */
        mpi_errno = MPIR_Comm_delete_internal(comm_ptr, isDisconnect);
        /* not ERR_POPing here to permit simpler inlining.  Our caller will
         * still report the error from the comm_delete level. */
    }

    return mpi_errno;
}
1250 #undef FUNCNAME
1251 #undef FCNAME
1252 
/* MPIR_Comm_release_always is the same as MPIR_Comm_release except it uses
   MPIR_Comm_release_ref_always instead.  Unlike MPIR_Comm_release, it is
   only declared in this header, not defined inline.
*/
int MPIR_Comm_release_always(MPID_Comm *comm_ptr, int isDisconnect);
1257 
1258 
/* Preallocated comm objects.  There are 3: comm_world, comm_self, and
   a private (non-user accessible) dup of comm world that is provided
   if needed in MPI_Finalize.  Having a separate version of comm_world
   avoids possible interference with user code.  The direct array is
   declared without a size here. */
#define MPID_COMM_N_BUILTIN 3
extern MPID_Comm MPID_Comm_builtin[MPID_COMM_N_BUILTIN];
extern MPID_Comm MPID_Comm_direct[];
/* This is the handle for the internal MPI_COMM_WORLD .  The "2" at the end
   of the handle is 3-1 (i.e., the index of the last entry in the builtin
   array, MPID_COMM_N_BUILTIN - 1) */
#define MPIR_ICOMM_WORLD  ((MPI_Comm)0x44000002)
1269 
1270 /* The following preprocessor macros provide bitfield access information for
1271  * context ID values.  They follow a uniform naming pattern:
1272  *
1273  * MPID_CONTEXT_foo_WIDTH - the width in bits of the field
1274  * MPID_CONTEXT_foo_MASK  - A valid bit mask for bit-wise AND and OR operations
1275  *                          with exactly all of the bits in the field set.
1276  * MPID_CONTEXT_foo_SHIFT - The number of bits that the field should be shifted
1277  *                          rightwards to place it in the least significant bits
1278  *                          of the ID.  There may still be higher order bits
1279  *                          from other fields, so the _MASK should be used first
1280  *                          if you want to reliably retrieve the exact value of
1281  *                          the field.
1282  */
1283 
/* Extract field_name_ from the context id id_: every bit outside the
 * field's _MASK is cleared first, then the result is shifted right by the
 * field's _SHIFT so the value ends up in the least significant bits.
 * The result is an rvalue. */
#define MPID_CONTEXT_READ_FIELD(field_name_,id_)        \
    (                                                   \
        ((id_) & MPID_CONTEXT_##field_name_##_MASK)     \
            >> MPID_CONTEXT_##field_name_##_SHIFT       \
    )
/* yields an rvalue that is the old_id_ with field_name_ set to field_val_.
 * The field's bits in old_id_ are cleared using the field's _MASK and then
 * field_val_, shifted left by the field's _SHIFT, is OR-ed in.  field_val_
 * is not masked, so a value wider than the field spills into higher bits.
 * Both arguments are parenthesized so that operator expressions such as
 * (a | b) may be passed for old_id_. */
#define MPID_CONTEXT_SET_FIELD(field_name_,old_id_,field_val_) \
    (((old_id_) & ~MPID_CONTEXT_##field_name_##_MASK) | ((field_val_) << MPID_CONTEXT_##field_name_##_SHIFT))
1290 
/* Context suffixes for separating pt2pt and collective communication.
   The suffix is a one-bit field in the least significant bit of the
   context id (width 1, shift 0).  The intra- and inter-communicator
   constants use the same two values: 0 for pt2pt and 1 for collectives. */
#define MPID_CONTEXT_SUFFIX_WIDTH (1)
#define MPID_CONTEXT_SUFFIX_SHIFT (0)
#define MPID_CONTEXT_SUFFIX_MASK ((1 << MPID_CONTEXT_SUFFIX_WIDTH) - 1)
#define MPID_CONTEXT_INTRA_PT2PT (0)
#define MPID_CONTEXT_INTRA_COLL  (1)
#define MPID_CONTEXT_INTER_PT2PT (0)
#define MPID_CONTEXT_INTER_COLL  (1)
1299 
/* Used to derive context IDs for sub-communicators from a parent communicator's
   context ID value.  This two-bit field comes directly after the one-bit
   suffix, so its values are shifted left by 1 (the shift is computed as
   suffix width + suffix shift). */
#define MPID_CONTEXT_SUBCOMM_WIDTH (2)
#define MPID_CONTEXT_SUBCOMM_SHIFT (MPID_CONTEXT_SUFFIX_WIDTH + MPID_CONTEXT_SUFFIX_SHIFT)
#define MPID_CONTEXT_SUBCOMM_MASK      (((1 << MPID_CONTEXT_SUBCOMM_WIDTH) - 1) << MPID_CONTEXT_SUBCOMM_SHIFT)
1306 
/* these values may be added/subtracted directly to/from an existing context ID
 * in order to determine the context ID of the child/parent.  Each one is a
 * SUBCOMM field value (0, 1 or 2) already shifted into the SUBCOMM field
 * position. */
#define MPID_CONTEXT_PARENT_OFFSET    (0 << MPID_CONTEXT_SUBCOMM_SHIFT)
#define MPID_CONTEXT_INTRANODE_OFFSET (1 << MPID_CONTEXT_SUBCOMM_SHIFT)
#define MPID_CONTEXT_INTERNODE_OFFSET (2 << MPID_CONTEXT_SUBCOMM_SHIFT)
1312 
/* this field (IS_LOCALCOMM) is used to derive a context ID for local
 * communicators of intercommunicators without communication.  It is a
 * one-bit field placed directly above the SUBCOMM field. */
#define MPID_CONTEXT_IS_LOCALCOMM_WIDTH (1)
#define MPID_CONTEXT_IS_LOCALCOMM_SHIFT (MPID_CONTEXT_SUBCOMM_SHIFT + MPID_CONTEXT_SUBCOMM_WIDTH)
#define MPID_CONTEXT_IS_LOCALCOMM_MASK (((1 << MPID_CONTEXT_IS_LOCALCOMM_WIDTH) - 1) << MPID_CONTEXT_IS_LOCALCOMM_SHIFT)
1318 
/* Layout of the upper part of a context ID, and the size of the bit vector
 * that tracks which context ID prefixes are in use.
 *
 * MPIR_MAX_CONTEXT_MASK is the number of ints that make up the bit vector that
 * describes the context ID prefix space.
 *
 * The following must hold:
 * (num_bits_in_vector) <= (maximum_context_id_prefix)
 *   which is the following in concrete terms:
 * MPIR_MAX_CONTEXT_MASK*MPIR_CONTEXT_INT_BITS <= 2**MPID_CONTEXT_PREFIX_WIDTH
 *   where MPID_CONTEXT_PREFIX_WIDTH is
 * MPIR_CONTEXT_ID_BITS - (MPID_CONTEXT_PREFIX_SHIFT + MPID_CONTEXT_DYNAMIC_PROC_WIDTH)
 *
 * We currently always assume MPIR_CONTEXT_INT_BITS is 32, regardless of the
 * value of sizeof(int)*CHAR_BIT.  We also make the assumption that CHAR_BIT==8.
 *
 * Example: an 11-bit prefix allows 2048/32 == 64 ints.  That is the width
 * obtained for a 16-bit context id field with the field widths defined above,
 * assuming MPID_CONTEXT_SUFFIX_WIDTH is 1 (TODO confirm).
 */

/* should probably be (sizeof(int)*CHAR_BIT) once we make the code CHAR_BIT-clean */
#define MPIR_CONTEXT_INT_BITS (32)
#define MPIR_CONTEXT_ID_BITS (sizeof(MPIR_Context_id_t)*8) /* 8 --> CHAR_BIT eventually */

/* The topmost bit of the context ID (i.e. the upper half of the context ID
 * space) is reserved for dynamic procs. */
#define MPID_CONTEXT_DYNAMIC_PROC_WIDTH (1)
#define MPID_CONTEXT_DYNAMIC_PROC_SHIFT (MPIR_CONTEXT_ID_BITS - MPID_CONTEXT_DYNAMIC_PROC_WIDTH)
#define MPID_CONTEXT_DYNAMIC_PROC_MASK (((1 << MPID_CONTEXT_DYNAMIC_PROC_WIDTH) - 1) << MPID_CONTEXT_DYNAMIC_PROC_SHIFT)

/* number of bits to shift right by in order to obtain the context ID prefix;
 * the prefix occupies everything between the IS_LOCALCOMM field and the
 * dynamic-proc field */
#define MPID_CONTEXT_PREFIX_SHIFT (MPID_CONTEXT_IS_LOCALCOMM_SHIFT + MPID_CONTEXT_IS_LOCALCOMM_WIDTH)
#define MPID_CONTEXT_PREFIX_WIDTH (MPIR_CONTEXT_ID_BITS - (MPID_CONTEXT_PREFIX_SHIFT + MPID_CONTEXT_DYNAMIC_PROC_WIDTH))
#define MPID_CONTEXT_PREFIX_MASK (((1 << MPID_CONTEXT_PREFIX_WIDTH) - 1) << MPID_CONTEXT_PREFIX_SHIFT)

/* one bit per possible prefix, packed MPIR_CONTEXT_INT_BITS to an int */
#define MPIR_MAX_CONTEXT_MASK \
    ((1 << MPID_CONTEXT_PREFIX_WIDTH) / MPIR_CONTEXT_INT_BITS)
1347 
1348 /* Utility routines.  Where possible, these are kept in the source directory
1349    with the other comm routines (src/mpi/comm, in mpicomm.h).  However,
1350    to create a new communicator after a spawn or connect-accept operation,
1351    the device may need to create a new contextid */
1352 int MPIR_Get_contextid( MPID_Comm *, MPIR_Context_id_t *context_id );
1353 int MPIR_Get_contextid_sparse(MPID_Comm *comm_ptr, MPIR_Context_id_t *context_id, int ignore_id);
1354 int MPIR_Get_contextid_sparse_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr, int tag, MPIR_Context_id_t *context_id, int ignore_id);
1355 void MPIR_Free_contextid( MPIR_Context_id_t );
1356 
1357 /* ------------------------------------------------------------------------- */
1358 
/* Requests */
/* This currently defines a single structure type for all requests.
   Eventually, we may want a union type, as used in MPICH-1 */
/* NOTE-R1: MPID_REQUEST_MPROBE signifies that this is a request created by
 * MPI_Mprobe or MPI_Improbe.  Since we use MPI_Request objects as our
 * MPI_Message objects, we use this separate kind in order to provide stronger
 * error checking.  Once a message (backed by a request) is promoted to a real
 * request by calling MPI_Mrecv/MPI_Imrecv, we actually modify the kind to be
 * MPID_REQUEST_RECV in order to keep completion logic as simple as possible. */
/*E
  MPID_Request_kind - Kinds of MPI Requests

  Notes:
  'MPID_LAST_REQUEST_KIND' is the last enumerator defined in this file.  If the
  device defines 'MPID_DEV_REQUEST_KIND_DECL', the enumerators it supplies are
  appended after 'MPID_LAST_REQUEST_KIND'.

  Module:
  Request-DS

  E*/
typedef enum MPID_Request_kind_t {
    MPID_REQUEST_UNDEFINED,
    MPID_REQUEST_SEND,
    MPID_REQUEST_RECV,
    MPID_PREQUEST_SEND,
    MPID_PREQUEST_RECV,
    MPID_UREQUEST,
    MPID_COLL_REQUEST,
    MPID_REQUEST_MPROBE, /* see NOTE-R1 */
    MPID_LAST_REQUEST_KIND
#ifdef MPID_DEV_REQUEST_KIND_DECL
    /* device-specific request kinds, appended after the generic ones */
    , MPID_DEV_REQUEST_KIND_DECL
#endif
} MPID_Request_kind_t;
1389 
1390 /* Typedefs for Fortran generalized requests */
1391 typedef void (MPIR_Grequest_f77_cancel_function)(void *, MPI_Fint*, MPI_Fint *);
1392 typedef void (MPIR_Grequest_f77_free_function)(void *, MPI_Fint *);
1393 typedef void (MPIR_Grequest_f77_query_function)(void *, MPI_Fint *, MPI_Fint *);
1394 
1395 /* vtable-ish structure holding generalized request function pointers and other
1396  * state.  Saves ~48 bytes in pt2pt requests on many platforms. */
1397 struct MPID_Grequest_fns {
1398     MPI_Grequest_cancel_function *cancel_fn;
1399     MPI_Grequest_free_function   *free_fn;
1400     MPI_Grequest_query_function  *query_fn;
1401     MPIX_Grequest_poll_function   *poll_fn;
1402     MPIX_Grequest_wait_function   *wait_fn;
1403     void             *grequest_extra_state;
1404     MPIX_Grequest_class         greq_class;
1405     MPID_Lang_t                  greq_lang;         /* language that defined
1406                                                        the generalize req */
1407 };
1408 
/* True when the completion counter that the request points at (cc_ptr)
 * reports completion.  See mpiimplthread.h for the def of MPID_cc_t and
 * related functions/macros. */
#define MPID_Request_is_complete(request_) \
    (MPID_cc_is_complete((request_)->cc_ptr))
1411 
/*S
  MPID_Request - Description of the Request data structure

  Module:
  Request-DS

  Notes:
  If it is necessary to remember the MPI datatype, this information is
  saved within the device-specific fields provided by 'MPID_DEV_REQUEST_DECL'.

  Requests come in many flavors, as stored in the 'kind' field.  It is
  expected that each kind of request will have its own structure type
  (e.g., 'MPID_Request_send_t') that extends the 'MPID_Request'.

  Completion is tested through 'cc_ptr' rather than through 'cc' directly
  (see 'MPID_Request_is_complete'), so that one operation can be described by
  a list of requests.

  The order of the 'cc' and 'status' members is deliberate (see the comment on
  'cc'), and the type is declared with an '__aligned__(32)' attribute through
  the 'ATTRIBUTE' macro.

  S*/
typedef struct MPID_Request {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    MPID_Request_kind_t kind;
    /* pointer to the completion counter */
    /* This is necessary for the case when an operation is described by a
       list of requests */
    MPID_cc_t *cc_ptr;
    /* A comm is needed to find the proper error handler */
    MPID_Comm *comm;
    /* completion counter.  Ensure cc and status are in the same cache
       line, assuming the cache line size is a multiple of 32 bytes
       and 32-bit integers */
    MPID_cc_t cc;
    /* Status is needed for wait/test/recv */
    MPI_Status status;
    /* Persistent requests have their own "real" requests.  Receive requests
       have partnering send requests when src=dest. etc. */
    struct MPID_Request *partner_request;

    /* User-defined request support via a "vtable".  Saves space in the already
     * bloated request for regular pt2pt and NBC requests. */
    struct MPID_Grequest_fns *greq_fns;

    /* Other, device-specific information */
#ifdef MPID_DEV_REQUEST_DECL
    MPID_DEV_REQUEST_DECL
#endif
} MPID_Request ATTRIBUTE((__aligned__(32)));
1455 
/* Allocator state for request objects (defined elsewhere) */
extern MPIU_Object_alloc_t MPID_Request_mem;
/* Preallocated request objects */
extern MPID_Request MPID_Request_direct[];
1459 
/* Reference counting for requests: thin statement-like wrappers around the
 * generic MPIU_Object_add_ref / MPIU_Object_release_ref macros.  The second
 * argument of the release form is forwarded unchanged to
 * MPIU_Object_release_ref. */
#define MPIR_Request_add_ref(req_)    \
    do {                              \
        MPIU_Object_add_ref(req_);    \
    } while (0)

#define MPIR_Request_release_ref(req_, inuse_)     \
    do {                                           \
        MPIU_Object_release_ref(req_, inuse_);     \
    } while (0)
1465 
/* Hooks that let us maintain a send queue (and a communicator list) when
   debugger support is selected.  As there is extra overhead for this, the
   hooks only expand to real calls when HAVE_DEBUGGER_SUPPORT is defined;
   otherwise they expand to nothing and their arguments are not evaluated.
*/
#if defined(HAVE_DEBUGGER_SUPPORT)
void MPIR_WaitForDebugger(void);
void MPIR_DebuggerSetAborting(const char *);
void MPIR_Sendq_remember(MPID_Request *, int, int, int);
void MPIR_Sendq_forget(MPID_Request *);
void MPIR_CommL_remember(MPID_Comm *);
void MPIR_CommL_forget(MPID_Comm *);

#define MPIR_SENDQ_REMEMBER(req_,x_,y_,z_) MPIR_Sendq_remember(req_,x_,y_,z_)
#define MPIR_SENDQ_FORGET(req_)            MPIR_Sendq_forget(req_)
#define MPIR_COMML_REMEMBER(comm_)         MPIR_CommL_remember(comm_)
#define MPIR_COMML_FORGET(comm_)           MPIR_CommL_forget(comm_)
#else  /* no debugger support: all hooks are no-ops */
#define MPIR_SENDQ_REMEMBER(req_,x_,y_,z_)
#define MPIR_SENDQ_FORGET(req_)
#define MPIR_COMML_REMEMBER(comm_)
#define MPIR_COMML_FORGET(comm_)
#endif /* HAVE_DEBUGGER_SUPPORT */
1488 
1489 /* must come after MPID_Comm is declared/defined */
1490 int MPIR_Get_contextid_nonblock(MPID_Comm *comm_ptr, MPID_Comm *newcommp, MPID_Request **req);
1491 int MPIR_Get_intercomm_contextid_nonblock(MPID_Comm *comm_ptr, MPID_Comm *newcommp, MPID_Request **req);
1492 
1493 /* ------------------------------------------------------------------------- */
/* Prototypes and definitions for the node ID code.  This is used to support
   hierarchical collectives in a (mostly) device-independent way.  The
   prototypes are only visible when the device defines MPID_USE_NODE_IDS. */
#ifdef MPID_USE_NODE_IDS
/* MPID_Node_id_t is a signed integer type defined by the device in mpidpre.h. */
int MPID_Get_node_id(MPID_Comm *comm, int rank, MPID_Node_id_t *id_p);
int MPID_Get_max_node_id(MPID_Comm *comm, MPID_Node_id_t *max_id_p);
#endif /* MPID_USE_NODE_IDS */
1501 
1502 /* ------------------------------------------------------------------------- */
1503 /*S
1504   MPID_Progress_state - object to hold progress state when using the blocking
1505   progress routines.
1506 
1507   Module:
1508   Misc
1509 
1510   Notes:
1511   The device must define MPID_PROGRESS_STATE_DECL.  It should  include any state
1512   that needs to be maintained between calls to MPID_Progress_{start,wait,end}.
1513   S*/
1514 typedef struct MPID_Progress_state
1515 {
1516     MPID_PROGRESS_STATE_DECL
1517 }
1518 MPID_Progress_state;
1519 /* ------------------------------------------------------------------------- */
1520 
1521 /* ------------------------------------------------------------------------- */
1522 /* end of mpirma.h (in src/mpi/rma?) */
1523 /* ------------------------------------------------------------------------- */
1524 
/* Windows */
#ifdef USE_MPID_RMA_TABLE
struct MPID_Win;

/* Table of function pointers for the RMA (window) operations.  It is embedded
   in each MPID_Win as the 'RMAFns' member when USE_MPID_RMA_TABLE is defined.
   Note that the struct tag (MPID_RMA_Ops) and the typedef name (MPID_RMAFns)
   differ. */
typedef struct MPID_RMA_Ops {
    int (*Win_free)(struct MPID_Win **);

    /* communication calls */
    int (*Put)(const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
               struct MPID_Win *);
    int (*Get)(void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
               struct MPID_Win *);
    int (*Accumulate)(const void *, int, MPI_Datatype, int, MPI_Aint, int,
                      MPI_Datatype, MPI_Op, struct MPID_Win *);

    /* fence and post/start/complete/wait synchronization */
    int (*Win_fence)(int, struct MPID_Win *);
    int (*Win_post)(MPID_Group *, int, struct MPID_Win *);
    int (*Win_start)(MPID_Group *, int, struct MPID_Win *);
    int (*Win_complete)(struct MPID_Win *);
    int (*Win_wait)(struct MPID_Win *);
    int (*Win_test)(struct MPID_Win *, int *);

    /* lock/unlock synchronization */
    int (*Win_lock)(int, int, int, struct MPID_Win *);
    int (*Win_unlock)(int, struct MPID_Win *);

    /* MPI-3 Functions */
    int (*Win_attach)(struct MPID_Win *, void *, MPI_Aint);
    int (*Win_detach)(struct MPID_Win *, const void *);
    int (*Win_shared_query)(struct MPID_Win *, int, MPI_Aint *, int *, void *);

    int (*Win_lock_all)(int, struct MPID_Win *);
    int (*Win_unlock_all)(struct MPID_Win *);

    int (*Win_flush)(int, struct MPID_Win *);
    int (*Win_flush_all)(struct MPID_Win *);
    int (*Win_flush_local)(int, struct MPID_Win *);
    int (*Win_flush_local_all)(struct MPID_Win *);
    int (*Win_sync)(struct MPID_Win *);

    int (*Get_accumulate)(const void *, int, MPI_Datatype, void *, int,
                          MPI_Datatype, int, MPI_Aint, int, MPI_Datatype, MPI_Op,
                          struct MPID_Win *);
    int (*Fetch_and_op)(const void *, void *, MPI_Datatype, int, MPI_Aint, MPI_Op,
                        struct MPID_Win *);
    int (*Compare_and_swap)(const void *, const void *, void *, MPI_Datatype, int,
                            MPI_Aint, struct MPID_Win *);

    /* request-based variants: same leading arguments as the calls above,
       plus a trailing MPID_Request ** */
    int (*Rput)(const void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
                struct MPID_Win *, MPID_Request **);
    int (*Rget)(void *, int, MPI_Datatype, int, MPI_Aint, int, MPI_Datatype,
                struct MPID_Win *, MPID_Request **);
    int (*Raccumulate)(const void *, int, MPI_Datatype, int, MPI_Aint, int,
                       MPI_Datatype, MPI_Op, struct MPID_Win *, MPID_Request **);
    int (*Rget_accumulate)(const void *, int, MPI_Datatype, void *, int,
                           MPI_Datatype, int, MPI_Aint, int, MPI_Datatype, MPI_Op,
                           struct MPID_Win *, MPID_Request **);

} MPID_RMAFns;
#define MPID_RMAFNS_VERSION 2
/* Note that the memory allocation/free routines do not take a window,
   so they must be initialized separately, and are a per-run, not per-window
   object.  If the device can manage different kinds of memory allocations,
   these routines must internally provide that flexibility. */
/*
    void *(*Alloc_mem)(size_t, MPID_Info *);
    int (*Free_mem)(void *);
*/
#endif /* USE_MPID_RMA_TABLE */
1591 
/*S
  MPID_Win - Description of the Window Object data structure.

  Module:
  Win-DS

  Notes:
  The following 3 keyvals are defined for attributes on all MPI
  Window objects\:
.vb
 MPI_WIN_SIZE
 MPI_WIN_BASE
 MPI_WIN_DISP_UNIT
.ve
  These correspond to the values in 'length', 'start_address', and
  'disp_unit'.  (In the structure below the corresponding members are named
  'size', 'base', and 'disp_unit'.)

  The communicator in the window is the same communicator that the user
  provided to 'MPI_Win_create' (not a dup).  However, each intracommunicator
  has a special context id that may be used if MPI communication is used
  by the implementation to implement the RMA operations.

  There is no separate window group; the group of the communicator should be
  used.

  Question:
  Should a 'MPID_Win' be defined after 'MPID_Segment' in case the device
  wants to
  store a queue of pending put/get operations, described with 'MPID_Segment'
  (or 'MPID_Request')s?

  S*/
typedef struct MPID_Win {
    MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
    int fence_cnt;     /* 0 = no fence has been called;
                          1 = fence has been called */
    MPID_Errhandler *errhandler;  /* Pointer to the error handler structure */
    void *base;
    MPI_Aint    size;
    int          disp_unit;      /* Displacement unit of *local* window */
    MPID_Attribute *attributes;
    MPID_Group *start_group_ptr; /* group passed in MPI_Win_start */
    int start_assert;            /* assert passed to MPI_Win_start */
    MPID_Comm *comm_ptr;         /* Pointer to comm of window (dup).
                                    NOTE(review): the structure notes above say
                                    "not a dup"; confirm which is current. */
    int         myrank;          /* Rank of this process in comm (used to
                                    detect operations on self) */
    int lockRank;                /* If within an MPI_Win_lock epoch,
                                    the rank that we locked */
#ifdef USE_THREADED_WINDOW_CODE
    /* These were causing compilation errors.  We need to figure out how to
       integrate threads into MPICH2 before including these fields. */
    /* FIXME: The test here should be within a test for threaded support */
#ifdef HAVE_PTHREAD_H
    pthread_t wait_thread_id; /* id of thread handling MPI_Win_wait */
    pthread_t passive_target_thread_id; /* thread for passive target RMA */
#elif defined(HAVE_WINTHREADS)
    HANDLE wait_thread_id;
    HANDLE passive_target_thread_id;
#endif
#endif
    /* Per-window table of RMA functions, present only when the RMA table is
       in use */
#ifdef USE_MPID_RMA_TABLE
    MPID_RMAFns RMAFns;
#endif
    /* These are COPIES of the values so that addresses to them
       can be returned as attributes.  They are initialized by the
       MPI_Win_get_attr function.

       These values are constant for the lifetime of the window, so
       this is thread-safe.
     */
    int  copyDispUnit;
    MPI_Aint copySize;

    char          name[MPI_MAX_OBJECT_NAME];

    MPIR_Win_flavor_t create_flavor;
    MPIR_Win_model_t  model;
    /* copies of create_flavor and model; presumably kept for the same
       attribute-address reason as copyDispUnit/copySize -- TODO confirm */
    MPIR_Win_flavor_t copyCreateFlavor;
    MPIR_Win_model_t  copyModel;

  /* Other, device-specific information */
#ifdef MPID_DEV_WIN_DECL
    MPID_DEV_WIN_DECL
#endif
} MPID_Win;
/* Allocator state for window objects (defined elsewhere) */
extern MPIU_Object_alloc_t MPID_Win_mem;
/* Preallocated win objects */
extern MPID_Win MPID_Win_direct[];
1681 
/* Special (negative) lock-state values for a window.  Non-negative values are
 * not enumerated here; the original note reads "LOCKED = 0, 1, ...", so they
 * presumably identify a locked rank (compare the lockRank member of MPID_Win)
 * -- TODO confirm against the users of this enum. */
enum MPID_Win_lock_states {
    MPID_WIN_STATE_LOCKED_ALL = -2,
    MPID_WIN_STATE_UNLOCKED = -1
    /* LOCKED = 0, 1, ... */
};
1687 
1688 /* ------------------------------------------------------------------------- */
1689 /* also in mpirma.h ?*/
1690 /* ------------------------------------------------------------------------- */
1691 
1692 /*
1693  * Good Memory (may be required for passive target operations on MPI_Win)
1694  */
1695 
/*@
  MPID_Alloc_mem - Allocate memory suitable for passive target RMA operations

  Input Parameter:
+ size - Number of bytes to allocate.
- info - Info object

  Return value:
  Pointer to the allocated memory.  If the memory is not available,
  returns null.

  Notes:
  This routine is used to implement 'MPI_Alloc_mem'.  It is for that reason
  that there is no communicator argument.

  This memory may `only` be freed with 'MPID_Free_mem'.

  This is a `local`, not a collective operation.  It functions more like a
  good form of 'malloc' than collective shared-memory allocators such as
  the 'shmalloc' found on SGI systems.

  Implementations of this routine may wish to use 'MPID_Memory_register'.
  However, this routine has slightly different requirements, so a separate
  entry point is provided.

  Question:
  Since this takes an info object, should there be an error routine in the
  case that the info object contains an error?

  Module:
  Win
  @*/
void *MPID_Alloc_mem( size_t size, MPID_Info *info );
1729 
/*@
  MPID_Free_mem - Frees memory allocated with 'MPID_Alloc_mem'

  Input Parameter:
. ptr - Pointer to memory allocated by 'MPID_Alloc_mem'.

  Return value:
  'MPI_SUCCESS' if memory was successfully freed; an MPI error code otherwise.

  Notes:
  The return value is provided because it may not be easy to validate the
  value of 'ptr' without attempting to free the memory.  Callers should
  therefore check the returned code rather than assume success.

  Module:
  Win
  @*/
int MPID_Free_mem( void *ptr );
1747 
/*@
  MPID_Mem_was_alloced - Return true if this memory was allocated with
  'MPID_Alloc_mem'

  Input Parameter:
. ptr  - Address of memory

  Return value:
  True if the memory was allocated with 'MPID_Alloc_mem', false otherwise.

  Notes:
  Earlier versions of this description also listed a 'size' argument (size of
  the region in bytes); the prototype below takes only 'ptr'.

  This routine may be needed by 'MPI_Win_create' to ensure that the memory
  for passive target RMA operations was allocated with 'MPI_Alloc_mem'.
  This may be used, for example, for ensuring that memory used with
  passive target operations was allocated with 'MPID_Alloc_mem'.

  Module:
  Win
  @*/
int MPID_Mem_was_alloced( void *ptr );  /* brad : this isn't used or implemented anywhere */
1769 
1770 /* ------------------------------------------------------------------------- */
1771 /* end of also in mpirma.h ? */
1772 /* ------------------------------------------------------------------------- */
1773 
1774 /* ------------------------------------------------------------------------- */
/* Reduction and accumulate operations */
/*E
  MPID_Op_kind - Enumerates types of MPI_Op types

  Notes:
  These are needed for implementing 'MPI_Accumulate', since only predefined
  operations are allowed for that operation.

  A gap in the enum values (15 through 31) was left to allow additional
  predefined operations to be inserted.  This might include future additions
  to MPI or experimental extensions (such as a Read-Modify-Write operation).

  Module:
  Collective-DS
  E*/
typedef enum MPID_Op_kind {
    /* predefined operations */
    MPID_OP_NULL = 0,
    MPID_OP_MAX = 1,
    MPID_OP_MIN = 2,
    MPID_OP_SUM = 3,
    MPID_OP_PROD = 4,
    MPID_OP_LAND = 5,
    MPID_OP_BAND = 6,
    MPID_OP_LOR = 7,
    MPID_OP_BOR = 8,
    MPID_OP_LXOR = 9,
    MPID_OP_BXOR = 10,
    MPID_OP_MAXLOC = 11,
    MPID_OP_MINLOC = 12,
    MPID_OP_REPLACE = 13,
    MPID_OP_NO_OP = 14,
    /* user-defined operations, placed after the gap */
    MPID_OP_USER_NONCOMMUTE = 32,
    MPID_OP_USER = 33
} MPID_Op_kind;
1798 
1799 /*S
1800   MPID_User_function - Definition of a user function for MPI_Op types.
1801 
1802   Notes:
1803   This includes a 'const' to make clear which is the 'in' argument and
1804   which the 'inout' argument, and to indicate that the 'count' and 'datatype'
1805   arguments are unchanged (they are addresses in an attempt to allow
1806   interoperation with Fortran).  It includes 'restrict' to emphasize that
1807   no overlapping operations are allowed.
1808 
1809   We need to include a Fortran version, since those arguments will
1810   have type 'MPI_Fint *' instead.  We also need to add a test to the
1811   test suite for this case; in fact, we need tests for each of the handle
1812   types to ensure that the transfered handle works correctly.
1813 
1814   This is part of the collective module because user-defined operations
1815   are valid only for the collective computation routines and not for
1816   RMA accumulate.
1817 
1818   Yes, the 'restrict' is in the correct location.  C compilers that
1819   support 'restrict' should be able to generate code that is as good as a
1820   Fortran compiler would for these functions.
1821 
1822   We should note on the manual pages for user-defined operations that
1823   'restrict' should be used when available, and that a cast may be
1824   required when passing such a function to 'MPI_Op_create'.
1825 
1826   Question:
1827   Should each of these function types have an associated typedef?
1828 
1829   Should there be a C++ function here?
1830 
1831   Module:
1832   Collective-DS
1833   S*/
1834 typedef union MPID_User_function {
1835     void (*c_function) ( const void *, void *,
1836 			 const int *, const MPI_Datatype * );
1837     void (*f77_function) ( const void *, void *,
1838 			  const MPI_Fint *, const MPI_Fint * );
1839 } MPID_User_function;
1840 /* FIXME: Should there be "restrict" in the definitions above, e.g.,
1841    (*c_function)( const void restrict * , void restrict *, ... )? */
1842 
1843 /*S
1844   MPID_Op - MPI_Op structure
1845 
1846   Notes:
1847   All of the predefined functions are commutative.  Only user functions may
1848   be noncummutative, so there are two separate op types for commutative and
1849   non-commutative user-defined operations.
1850 
1851   Operations do not require reference counts because there are no nonblocking
1852   operations that accept user-defined operations.  Thus, there is no way that
1853   a valid program can free an 'MPI_Op' while it is in use.
1854 
1855   Module:
1856   Collective-DS
1857   S*/
1858 typedef struct MPID_Op {
1859      MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
1860      MPID_Op_kind       kind;
1861      MPID_Lang_t        language;
1862      MPID_User_function function;
1863   } MPID_Op;
/* Number of entries in the MPID_Op_builtin table */
#define MPID_OP_N_BUILTIN 15
extern MPID_Op MPID_Op_builtin[MPID_OP_N_BUILTIN];
/* presumably the preallocated op objects, by analogy with MPID_Request_direct
   and MPID_Win_direct -- TODO confirm */
extern MPID_Op MPID_Op_direct[];
/* allocator state passed to MPIU_Handle_obj_free by MPIR_Op_release */
extern MPIU_Object_alloc_t MPID_Op_mem;
1868 
/* Reference counting for ops: statement-like wrappers around the generic
 * MPIU_Object_add_ref / MPIU_Object_release_ref macros. */
#define MPIR_Op_add_ref(op_)          \
    do {                              \
        MPIU_Object_add_ref(op_);     \
    } while (0)

#define MPIR_Op_release_ref(op_, inuse_)          \
    do {                                          \
        MPIU_Object_release_ref(op_, inuse_);     \
    } while (0)

/* Release one reference to the op and, when the release reports that the op
 * is no longer in use, return the object to MPID_Op_mem. */
#define MPIR_Op_release(op_p_)                              \
    do {                                                    \
        int in_use_;                                        \
        MPIR_Op_release_ref((op_p_), &in_use_);             \
        if (in_use_ == 0) {                                 \
            MPIU_Handle_obj_free(&MPID_Op_mem, (op_p_));    \
        }                                                   \
    } while (0)
1883 
1884 /* ------------------------------------------------------------------------- */
1885 
1886 /* ------------------------------------------------------------------------- */
1887 /* mpicoll.h (in src/mpi/coll?) */
1888 /* ------------------------------------------------------------------------- */
1889 
/* Collective operations */
/* Table of function pointers for the collective operations.  Three groups of
 * entries appear below:
 *   - blocking collectives, whose last two arguments are an MPID_Comm * and an
 *     int * (presumably an error flag -- TODO confirm);
 *   - MPI-3 nonblocking collectives, whose last two arguments are an
 *     MPID_Comm * and an MPID_Sched_t;
 *   - MPI-3 neighborhood collectives, in blocking form (ending with the
 *     MPID_Comm *) and nonblocking form (ending with an MPID_Sched_t).
 * The order of the members is part of the structure layout; note that
 * prev_coll_fns sits between the nonblocking and the neighborhood entries. */
typedef struct MPID_Collops {
    int ref_count;   /* Supports lazy copies */
    /* Contains pointers to the functions for the MPI collectives */
    int (*Barrier) (MPID_Comm *, int *);
    int (*Bcast) (void*, int, MPI_Datatype, int, MPID_Comm *, int *);
    int (*Gather) (const void*, int, MPI_Datatype, void*, int, MPI_Datatype,
                   int, MPID_Comm *, int *);
    int (*Gatherv) (const void*, int, MPI_Datatype, void*, const int *, const int *,
                    MPI_Datatype, int, MPID_Comm *, int *);
    int (*Scatter) (const void*, int, MPI_Datatype, void*, int, MPI_Datatype,
                    int, MPID_Comm *, int *);
    int (*Scatterv) (const void*, const int *, const int *, MPI_Datatype,
                     void*, int, MPI_Datatype, int, MPID_Comm *, int *);
    int (*Allgather) (const void*, int, MPI_Datatype, void*, int,
                      MPI_Datatype, MPID_Comm *, int *);
    int (*Allgatherv) (const void*, int, MPI_Datatype, void*, const int *,
                       const int *, MPI_Datatype, MPID_Comm *, int *);
    int (*Alltoall) (const void*, int, MPI_Datatype, void*, int, MPI_Datatype,
                               MPID_Comm *, int *);
    int (*Alltoallv) (const void*, const int *, const int *, MPI_Datatype,
                      void*, const int *, const int *, MPI_Datatype, MPID_Comm *,
                      int *);
    int (*Alltoallw) (const void*, const int *, const int *, const MPI_Datatype *, void*,
                      const int *, const int *, const MPI_Datatype *, MPID_Comm *, int *);
    int (*Reduce) (const void*, void*, int, MPI_Datatype, MPI_Op, int,
                   MPID_Comm *, int *);
    int (*Allreduce) (const void*, void*, int, MPI_Datatype, MPI_Op,
                      MPID_Comm *, int *);
    int (*Reduce_scatter) (const void*, void*, const int *, MPI_Datatype, MPI_Op,
                           MPID_Comm *, int *);
    int (*Scan) (const void*, void*, int, MPI_Datatype, MPI_Op, MPID_Comm *, int * );
    int (*Exscan) (const void*, void*, int, MPI_Datatype, MPI_Op, MPID_Comm *, int * );
    int (*Reduce_scatter_block) (const void*, void*, int, MPI_Datatype, MPI_Op,
                           MPID_Comm *, int *);

    /* MPI-3 nonblocking collectives */
    int (*Ibarrier)(MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ibcast)(void *buffer, int count, MPI_Datatype datatype, int root,
                  MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Igather)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                   int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr,
                   MPID_Sched_t s);
    int (*Igatherv)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                    const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root,
                    MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Iscatter)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                    int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr,
                    MPID_Sched_t s);
    int (*Iscatterv)(const void *sendbuf, const int *sendcounts, const int *displs,
                     MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
                     int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Iallgather)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                      int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                      MPID_Sched_t s);
    int (*Iallgatherv)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                       const int *recvcounts, const int *displs, MPI_Datatype recvtype,
                       MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ialltoall)(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                     int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                     MPID_Sched_t s);
    int (*Ialltoallv)(const void *sendbuf, const int *sendcounts, const int *sdispls,
                      MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
                      const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                      MPID_Sched_t s);
    int (*Ialltoallw)(const void *sendbuf, const int *sendcounts, const int *sdispls,
                      const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcounts,
                      const int *rdispls, const MPI_Datatype *recvtypes,
                      MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ireduce)(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                   int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Iallreduce)(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
                      MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ireduce_scatter)(const void *sendbuf, void *recvbuf, const int *recvcounts,
                           MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ireduce_scatter_block)(const void *sendbuf, void *recvbuf, int recvcount,
                                 MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr,
                                 MPID_Sched_t s);
    int (*Iscan)(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                 MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Iexscan)(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                   MPID_Comm *comm_ptr, MPID_Sched_t s);

    struct MPID_Collops *prev_coll_fns; /* when overriding this table, set this to point to the old table */

    /* MPI-3 neighborhood collectives (blocking & nonblocking) */
    /* Note: in the alltoallw variants the displacement arrays are MPI_Aint,
       whereas the alltoallv variants use int displacements. */
    int (*Neighbor_allgather)(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                              void *recvbuf, int recvcount, MPI_Datatype recvtype,
                              MPID_Comm *comm_ptr);
    int (*Neighbor_allgatherv)(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                               void *recvbuf, const int recvcounts[], const int displs[],
                               MPI_Datatype recvtype, MPID_Comm *comm_ptr);
    int (*Neighbor_alltoall)(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                             void *recvbuf, int recvcount, MPI_Datatype recvtype,
                             MPID_Comm *comm_ptr);
    int (*Neighbor_alltoallv)(const void *sendbuf, const int sendcounts[], const int sdispls[],
                              MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
                              const int rdispls[], MPI_Datatype recvtype, MPID_Comm *comm_ptr);
    int (*Neighbor_alltoallw)(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
                              const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
                              const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
                              MPID_Comm *comm_ptr);
    int (*Ineighbor_allgather)(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                               void *recvbuf, int recvcount, MPI_Datatype recvtype,
                               MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ineighbor_allgatherv)(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                void *recvbuf, const int recvcounts[], const int displs[],
                                MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ineighbor_alltoall)(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                              void *recvbuf, int recvcount, MPI_Datatype recvtype,
                              MPID_Comm *comm_ptr, MPID_Sched_t s);
    int (*Ineighbor_alltoallv)(const void *sendbuf, const int sendcounts[], const int sdispls[],
                               MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
                               const int rdispls[], MPI_Datatype recvtype, MPID_Comm *comm_ptr,
                               MPID_Sched_t s);
    int (*Ineighbor_alltoallw)(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
                               const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
                               const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
                               MPID_Comm *comm_ptr, MPID_Sched_t s);
} MPID_Collops;
2010 
/* Tag constant with value 1.
   NOTE(review): presumably the message tag reserved for the barrier
   collective -- confirm against the collective implementation. */
#define MPIR_BARRIER_TAG 1
2012 /* ------------------------------------------------------------------------- */
2013 /* end of mpicoll.h (in src/mpi/coll? */
2014 /* ------------------------------------------------------------------------- */
2015 
2016 /* ------------------------------------------------------------------------- */
2017 /* mpitopo.h (in src/mpi/topo? */
2018 /*
 * The following structure allows the device detailed control over the
2020  * functions that are used to implement the topology routines.  If either
2021  * the pointer to this structure is null or any individual entry is null,
2022  * the default function is used (this follows exactly the same rules as the
2023  * collective operations, provided in the MPID_Collops structure).
2024  */
2025 /* ------------------------------------------------------------------------- */
2026 
2027 typedef struct MPID_TopoOps {
2028     int (*cartCreate)( const MPID_Comm *, int, const int[], const int [],
2029 		       int, MPI_Comm * );
2030     int (*cartMap)   ( const MPID_Comm *, int, const int[], const int [],
2031 		       int * );
2032     int (*graphCreate)( const MPID_Comm *, int, const int[], const int [],
2033 			int, MPI_Comm * );
2034     int (*graphMap)   ( const MPID_Comm *, int, const int[], const int[],
2035 			int * );
2036 } MPID_TopoOps;
2037 /* ------------------------------------------------------------------------- */
2038 /* end of mpitopo.h (in src/mpi/topo? */
2039 /* ------------------------------------------------------------------------- */
2040 
2041 
2042 typedef struct MPID_CommOps {
2043     int (*split_type)(MPID_Comm *, int, int, MPID_Info *, MPID_Comm **);
2044 } MPID_CommOps;
2045 extern struct MPID_CommOps  *MPID_Comm_fns; /* Communicator creation functions */
2046 
2047 
2048 /* Per process data */
/* State values stored in the 'initialized' field of MPICH_PerProcess_t.
   The numeric values are assigned explicitly. */
typedef enum MPIR_MPI_State_t {
    MPICH_PRE_INIT       = 0,
    MPICH_WITHIN_MPI     = 1,
    MPICH_POST_FINALIZED = 2
} MPIR_MPI_State_t;
2051 
/* Storage for the values of the predefined attributes; every member is a
   plain int. */
typedef struct PreDefined_attrs {
    /* Application number provided by mpiexec (MPI-2) */
    int appnum;
    /* host */
    int host;
    /* standard io allowed */
    int io;
    /* last used error code (MPI-2) */
    int lastusedcode;
    /* Maximum message tag */
    int tag_ub;
    /* Universe size from mpiexec (MPI-2) */
    int universe;
    /* Wtime is global over processes in COMM_WORLD */
    int wtime_is_global;
} PreDefined_attrs;
2061 
2062 struct MPID_Datatype;
2063 
2064 typedef struct MPICH_PerProcess_t {
2065     MPIR_MPI_State_t  initialized;      /* Is MPI initalized? */
2066     int               do_error_checks;  /* runtime error check control */
2067     struct MPID_Comm  *comm_world;      /* Easy access to comm_world for
2068                                            error handler */
2069     struct MPID_Comm  *comm_self;       /* Easy access to comm_self */
2070     struct MPID_Comm  *comm_parent;     /* Easy access to comm_parent */
2071     struct MPID_Comm  *icomm_world;     /* An internal version of comm_world
2072 					   that is separate from user's
2073 					   versions */
2074     PreDefined_attrs  attrs;            /* Predefined attribute values */
2075     int               tagged_coll_mask; /* Tag space mask for tagged collectives */
2076 
2077     /* The topology routines dimsCreate is independent of any communicator.
2078        If this pointer is null, the default routine is used */
2079     int (*dimsCreate)( int, int, int *);
2080 
2081     /* Attribute dup functions.  Here for lazy initialization */
2082     int (*attr_dup)( int, MPID_Attribute *, MPID_Attribute ** );
2083     int (*attr_free)( int, MPID_Attribute ** );
2084     /* There is no win_attr_dup function because there can be no MPI_Win_dup
2085        function */
2086     /* Routine to get the messages corresponding to dynamically created
2087        error messages */
2088     const char *(*errcode_to_string)( int );
2089 #ifdef HAVE_CXX_BINDING
2090     /* Routines to call C++ functions from the C implementation of the
2091        MPI reduction and attribute routines */
2092     void (*cxx_call_op_fn)(const void *, void *, int, MPI_Datatype,
2093 			    MPI_User_function * );
2094     /* Error handling functions.  As for the attribute functions,
2095        we pass the integer file/comm/win, the address of the error code,
2096        and the C function to call (itself a function defined by the
2097        C++ interface and exported to C).  The first argument is used
2098        to specify the kind (comm,file,win) */
2099     void  (*cxx_call_errfn) ( int, int *, int *, void (*)(void) );
2100 #endif /* HAVE_CXX_BINDING */
2101 } MPICH_PerProcess_t;
2102 extern MPICH_PerProcess_t MPIR_Process;
2103 
2104 /* ------------------------------------------------------------------------- */
2105 /* In MPICH2, each function has an "enter" and "exit" macro.  These can be
2106  * used to add various features to each function at compile time, or they
2107  * can be set to empty to provide the fastest possible production version.
2108  *
2109  * There are at this time three choices of features (beyond the empty choice)
2110  * 1. timing (controlled by macros in mpitimerimpl.h)
2111  *    These collect data on when each function began and finished; the
2112  *    resulting data can be displayed using special programs
2113  * 2. Debug logging (selected with --enable-g=log)
2114  *    Invokes MPIU_DBG_MSG at the entry and exit for each routine
2115  * 3. Additional memory validation of the memory arena (--enable-g=memarena)
2116  */
2117 /* ------------------------------------------------------------------------- */
2118 /* allow the timing module the opportunity to define the macros */
2119 #include "mpifunc.h"
2120 #if !defined(NEEDS_FUNC_ENTER_EXIT_DEFS)
2121     /* If no timing choice is selected, this sets the entry/exit macros
2122        to empty */
2123 #   include "mpitimerimpl.h"
2124 #endif
2125 
#ifdef NEEDS_FUNC_ENTER_EXIT_DEFS
/* Each layer-specific enter/exit macro below forwards its argument to the
   generic MPIR_FUNC_ENTER / MPIR_FUNC_EXIT pair. */

/* mpich layer definitions */
#define MPID_MPI_FUNC_ENTER(a)              MPIR_FUNC_ENTER(a)
#define MPID_MPI_FUNC_EXIT(a)               MPIR_FUNC_EXIT(a)
#define MPID_MPI_PT2PT_FUNC_ENTER(a)        MPIR_FUNC_ENTER(a)
#define MPID_MPI_PT2PT_FUNC_EXIT(a)         MPIR_FUNC_EXIT(a)
#define MPID_MPI_PT2PT_FUNC_ENTER_FRONT(a)  MPIR_FUNC_ENTER(a)
#define MPID_MPI_PT2PT_FUNC_EXIT_FRONT(a)   MPIR_FUNC_EXIT(a)
#define MPID_MPI_PT2PT_FUNC_ENTER_BACK(a)   MPIR_FUNC_ENTER(a)
#define MPID_MPI_PT2PT_FUNC_EXIT_BACK(a)    MPIR_FUNC_EXIT(a)
#define MPID_MPI_PT2PT_FUNC_ENTER_BOTH(a)   MPIR_FUNC_ENTER(a)
#define MPID_MPI_PT2PT_FUNC_EXIT_BOTH(a)    MPIR_FUNC_EXIT(a)
#define MPID_MPI_COLL_FUNC_ENTER(a)         MPIR_FUNC_ENTER(a)
#define MPID_MPI_COLL_FUNC_EXIT(a)          MPIR_FUNC_EXIT(a)
#define MPID_MPI_RMA_FUNC_ENTER(a)          MPIR_FUNC_ENTER(a)
#define MPID_MPI_RMA_FUNC_EXIT(a)           MPIR_FUNC_EXIT(a)
#define MPID_MPI_INIT_FUNC_ENTER(a)         MPIR_FUNC_ENTER(a)
#define MPID_MPI_INIT_FUNC_EXIT(a)          MPIR_FUNC_EXIT(a)
#define MPID_MPI_FINALIZE_FUNC_ENTER(a)     MPIR_FUNC_ENTER(a)
#define MPID_MPI_FINALIZE_FUNC_EXIT(a)      MPIR_FUNC_EXIT(a)

/* device layer definitions */
#define MPIDI_FUNC_ENTER(a)                 MPIR_FUNC_ENTER(a)
#define MPIDI_FUNC_EXIT(a)                  MPIR_FUNC_EXIT(a)
#define MPIDI_PT2PT_FUNC_ENTER(a)           MPIR_FUNC_ENTER(a)
#define MPIDI_PT2PT_FUNC_EXIT(a)            MPIR_FUNC_EXIT(a)
#define MPIDI_PT2PT_FUNC_ENTER_FRONT(a)     MPIR_FUNC_ENTER(a)
#define MPIDI_PT2PT_FUNC_EXIT_FRONT(a)      MPIR_FUNC_EXIT(a)
#define MPIDI_PT2PT_FUNC_ENTER_BACK(a)      MPIR_FUNC_ENTER(a)
#define MPIDI_PT2PT_FUNC_EXIT_BACK(a)       MPIR_FUNC_EXIT(a)
#define MPIDI_PT2PT_FUNC_ENTER_BOTH(a)      MPIR_FUNC_ENTER(a)
#define MPIDI_PT2PT_FUNC_EXIT_BOTH(a)       MPIR_FUNC_EXIT(a)
#define MPIDI_COLL_FUNC_ENTER(a)            MPIR_FUNC_ENTER(a)
#define MPIDI_COLL_FUNC_EXIT(a)             MPIR_FUNC_EXIT(a)
#define MPIDI_RMA_FUNC_ENTER(a)             MPIR_FUNC_ENTER(a)
#define MPIDI_RMA_FUNC_EXIT(a)              MPIR_FUNC_EXIT(a)
#define MPIDI_INIT_FUNC_ENTER(a)            MPIR_FUNC_ENTER(a)
#define MPIDI_INIT_FUNC_EXIT(a)             MPIR_FUNC_EXIT(a)
#define MPIDI_FINALIZE_FUNC_ENTER(a)        MPIR_FUNC_ENTER(a)
#define MPIDI_FINALIZE_FUNC_EXIT(a)         MPIR_FUNC_EXIT(a)

/* evaporate the timing macros since timing is not selected */
#define MPIU_Timer_init(rank, size)
#define MPIU_Timer_finalize()
#endif /* NEEDS_FUNC_ENTER_EXIT_DEFS */
2171 
2172 /* Definitions for error handling and reporting */
2173 #include "mpierror.h"
2174 #include "mpierrs.h"
2175 
2176 /* Definitions for instrumentation (currently used within RMA code) */
2177 #include "mpiinstr.h"
2178 
/* FIXME: This routine is only used within mpi/src/err/errutil.c and
   smpd.  We may not want to export it.  */
/* NOTE(review): parameter names are descriptive guesses (output stream,
   MPI error code) -- confirm against the definition in errutil.c. */
void MPIR_Err_print_stack(FILE *fp, int errcode);
2182 
2183 /* ------------------------------------------------------------------------- */
2184 
/* FIXME: Move these to the communicator block; make sure that all
   objects have such hooks */
/* Fallback used when HAVE_DEV_COMM_HOOK is not defined: both hooks expand
   to MPI_SUCCESS and the argument is dropped without being evaluated. */
#ifndef HAVE_DEV_COMM_HOOK
#define MPID_Dev_comm_create_hook( a ) MPI_SUCCESS
#define MPID_Dev_comm_destroy_hook( a ) MPI_SUCCESS
#endif
2191 
2192 /* ------------------------------------------------------------------------- */
2193 /* FIXME: What is the scope of these functions?  Can they be moved into
2194    src/mpi/pt2pt? */
2195 /* ------------------------------------------------------------------------- */
2196 
/* Fill a status with the "empty" values: source MPI_ANY_SOURCE, tag
   MPI_ANY_TAG, count 0, cancelled FALSE.  Nothing is written when status_
   is MPI_STATUS_IGNORE.
   Do not set MPI_ERROR (only set if ERR_IN_STATUS is returned).
   status_ is evaluated several times, so it must have no side effects.
   The body is wrapped in do { } while (0) so that an invocation followed by
   ';' forms exactly one statement and can be used in an unbraced if/else. */
#define MPIR_Status_set_empty(status_)                  \
do {                                                    \
    if ((status_) != MPI_STATUS_IGNORE)                 \
    {                                                   \
        (status_)->MPI_SOURCE = MPI_ANY_SOURCE;         \
        (status_)->MPI_TAG = MPI_ANY_TAG;               \
        (status_)->count = 0;                           \
        (status_)->cancelled = FALSE;                   \
    }                                                   \
} while (0)
/* See MPI 1.1, section 3.11, Null Processes */
/* Fill a status with the values used for a null process: source
   MPI_PROC_NULL, tag MPI_ANY_TAG, count 0, cancelled FALSE.  Nothing is
   written when status_ is MPI_STATUS_IGNORE.
   Do not set MPI_ERROR (only set if ERR_IN_STATUS is returned).
   status_ is evaluated several times, so it must have no side effects.
   The body is wrapped in do { } while (0) so that an invocation followed by
   ';' forms exactly one statement and can be used in an unbraced if/else. */
#define MPIR_Status_set_procnull(status_)               \
do {                                                    \
    if ((status_) != MPI_STATUS_IGNORE)                 \
    {                                                   \
        (status_)->MPI_SOURCE = MPI_PROC_NULL;          \
        (status_)->MPI_TAG = MPI_ANY_TAG;               \
        (status_)->count = 0;                           \
        (status_)->cancelled = FALSE;                   \
    }                                                   \
} while (0)
2220 
/* Copy the status stored in a request into *status_, keeping the caller's
   MPI_ERROR value.  Nothing is written when status_ is MPI_STATUS_IGNORE.
   According to the MPI 1.1 standard page 22 lines 9-12, the MPI_ERROR field
   may not be modified except by the functions in section 3.7.5 which return
   MPI_ERR_IN_STATUSES (MPI_Wait{all,some} and MPI_Test{all,some}), so the
   field is saved before the structure copy and restored afterwards.
   Both arguments are evaluated more than once and must have no side effects.
   The body is wrapped in do { } while (0) so that an invocation followed by
   ';' forms exactly one statement and can be used in an unbraced if/else. */
#define MPIR_Request_extract_status(request_ptr_, status_)      \
do {                                                            \
    if ((status_) != MPI_STATUS_IGNORE)                         \
    {                                                           \
        int error__ = (status_)->MPI_ERROR;                     \
                                                                \
        *(status_) = (request_ptr_)->status;                    \
        (status_)->MPI_ERROR = error__;                         \
    }                                                           \
} while (0)
2234 /* ------------------------------------------------------------------------- */
2235 
2236 /* FIXME: The bindings should be divided into three groups:
2237    1. ADI3 routines.  These should have structure comment documentation, e.g.,
2238    the text from doc/adi3/adi3.c
2239    2. General utility routines.  These should have a short description
2240    3. Local utility routines, e.g., routines used within a single subdirectory.
2241    These should be moved into an include file in that subdirectory
2242 */
2243 /* Bindings for internal routines */
/*@ MPIR_Add_finalize - Add a routine to be called when MPI_Finalize is invoked

Input Parameters:
+ routine - Routine to call
. extra   - Void pointer to data to pass to the routine
- priority - Indicates the priority of this callback and controls the order
  in which callbacks are executed.  Use a priority of zero for most handlers;
  higher priorities will be executed first.

Notes:
  The routine 'MPID_Finalize' is executed with priority
  'MPIR_FINALIZE_CALLBACK_PRIO' (currently defined as 5).  Handlers with
  a higher priority execute before 'MPID_Finalize' is called; those with
  a lower priority after 'MPID_Finalize' is called.
@*/
void MPIR_Add_finalize(int (*routine)(void *), void *extra, int priority);
2259 
/* Priorities accepted by MPIR_Add_finalize, listed in ascending order.
   Callbacks with higher priorities are executed first. */
#define MPIR_FINALIZE_CALLBACK_DEFAULT_PRIO       0
#define MPIR_FINALIZE_CALLBACK_HANDLE_CHECK_PRIO  1
/* Priority at which MPID_Finalize itself is executed */
#define MPIR_FINALIZE_CALLBACK_PRIO               5
#define MPIR_FINALIZE_CALLBACK_MAX_PRIO           10
2264 
2265 /*int MPIR_Comm_attr_dup(MPID_Comm *, MPID_Attribute **);
2266   int MPIR_Comm_attr_delete(MPID_Comm *, MPID_Attribute *);*/
2267 int MPIR_Comm_copy( MPID_Comm *, int, MPID_Comm ** );
2268 int MPIR_Comm_copy_data(MPID_Comm *comm_ptr, MPID_Comm **outcomm_ptr);
2269 
/* Fortran keyvals are set with functions in mpi_f77interface.h */
/* The C++ setters below are declared only when the C++ binding is built. */
#ifdef HAVE_CXX_BINDING
void MPIR_Keyval_set_cxx(int, void (*)(void), void (*)(void));
void MPIR_Op_set_cxx(MPI_Op, void (*)(void));
void MPIR_Errhandler_set_cxx(MPI_Errhandler, void (*)(void));
#endif /* HAVE_CXX_BINDING */
2276 
2277 int MPIR_Group_create( int, MPID_Group ** );
2278 int MPIR_Group_release(MPID_Group *group_ptr);
2279 
2280 int MPIR_dup_fn ( MPI_Comm, int, void *, void *, void *, int * );
2281 /* marks a request as complete, extracting the status */
2282 int MPIR_Request_complete(MPI_Request *, MPID_Request *, MPI_Status *, int *);
2283 
2284 int MPIR_Request_get_error(MPID_Request *);
2285 /* run the progress engine until the given request is complete */
2286 int MPIR_Progress_wait_request(MPID_Request *req);
2287 
2288 /* The following routines perform the callouts to the user routines registered
2289    as part of a generalized request.  They handle any language binding issues
2290    that are necessary. They are used when completing, freeing, cancelling or
2291    extracting the status from a generalized request. */
2292 int MPIR_Grequest_cancel(MPID_Request * request_ptr, int complete);
2293 int MPIR_Grequest_query(MPID_Request * request_ptr);
2294 int MPIR_Grequest_free(MPID_Request * request_ptr);
2295 
2296 /* this routine was added to support our extension relaxing the progress rules
2297  * for generalized requests */
2298 int MPIR_Grequest_progress_poke(int count, MPID_Request **request_ptrs,
2299 		MPI_Status array_of_statuses[] );
2300 int MPIR_Grequest_waitall(int count, MPID_Request * const *  request_ptrs);
2301 
2302 /* ------------------------------------------------------------------------- */
2303 /* Prototypes for language-specific routines, such as routines to set
2304    Fortran keyval attributes */
2305 #ifdef HAVE_FORTRAN_BINDING
2306 #include "mpi_f77interface.h"
2307 #endif
2308 
2309 /* ADI Bindings */
2310 /*@
2311   MPID_Init - Initialize the device
2312 
2313   Input Parameters:
2314 + argc_p - Pointer to the argument count
2315 . argv_p - Pointer to the argument list
2316 - requested - Requested level of thread support.  Values are the same as
2317   for the 'required' argument to 'MPI_Init_thread', except that we define
2318   an enum for these values.
2319 
2320   Output Parameters:
2321 + provided - Provided level of thread support.  May be less than the
2322   requested level of support.
2323 . has_args - Set to true if 'argc_p' and 'argv_p' contain the command
2324   line arguments.  See below.
2325 - has_env  - Set to true if the environment of the process has been
2326   set as the user expects.  See below.
2327 
2328   Return value:
2329   Returns 'MPI_SUCCESS' on success and an MPI error code on failure.  Failure
2330   can happen when, for example, the device is unable  to start or contact the
2331   number of processes specified by the 'mpiexec' command.
2332 
2333   Notes:
2334   Null arguments for 'argc_p' and 'argv_p' `must` be valid (see MPI-2, section
2335   4.2)
2336 
2337   Multi-method devices should initialize each method within this call.
2338   They can use environment variables and/or command-line arguments
2339   to decide which methods to initialize (but note that they must not
2340   `depend` on using command-line arguments).
2341 
2342   This call also initializes all MPID data needed by the device.  This
2343   includes the 'MPID_Request's and any other data structures used by
2344   the device.
2345 
2346   The arguments 'has_args' and 'has_env' indicate whether the process was
2347   started with command-line arguments or environment variables.  In some
2348   cases, only the root process is started with these values; in others,
2349   the startup environment ensures that each process receives the
2350   command-line arguments and environment variables that the user expects.
2351   While the MPI standard makes no requirements that command line arguments or
2352   environment variables are provided to all processes, most users expect a
2353   common environment.  These variables allow an MPI implementation (that is
2354   based on ADI-3) to provide both of these by making use of MPI communication
2355   after 'MPID_Init' is called but before 'MPI_Init' returns to the user, if
2356   the process management environment does not provide this service.
2357 
2358 
2359   This routine is used to implement both 'MPI_Init' and 'MPI_Init_thread'.
2360 
2361   Setting the environment requires a 'setenv' function.  Some
2362   systems may not have this.  In that case, the documentation must make
2363   clear that the environment may not be propagated to the generated processes.
2364 
2365   Module:
2366   MPID_CORE
2367 
2368   Questions:
2369 
2370   The values for 'has_args' and 'has_env' are boolean.
2371   They could be more specific.  For
2372   example, the value could indicate the rank in 'MPI_COMM_WORLD' of a
2373   process that has the values; the value 'MPI_ANY_SOURCE' (or a '-1') could
2374   indicate that the value is available on all processes (including this one).
2375   We may want this since otherwise the processes may need to determine whether
2376   any process needs the command line.  Another option would be to use positive
2377   values in the same way that the 'color' argument is used in 'MPI_Comm_split';
2378   a negative value indicates the member of the processes with that color that
2379   has the values of the command line arguments (or environment).  This allows
2380   for non-SPMD programs.
2381 
2382   Do we require that the startup environment (e.g., whatever 'mpiexec' is
2383   using to start processes) is responsible for delivering
2384   the command line arguments and environment variables that the user expects?
2385   That is, if the user is running an SPMD program, and expects each process
2386   to get the same command line argument, who is responsible for this?
2387   The 'has_args' and 'has_env' values are intended to allow the ADI to
2388   handle this while taking advantage of any support that the process
2389   manager framework may provide.
2390 
2391   Alternately, how do we find out from the process management environment
2392   whether it took care of the environment or the command line arguments?
2393   Do we need a 'PMI_Env_query' function that can answer these questions
2394   dynamically (in case a different process manager is used through the same
2395   interface)?
2396 
2397   Can we fix the Fortran command-line arguments?  That is, can we arrange for
2398   'iargc' and 'getarg' (and the POSIX equivalents) to return the correct
2399   values?  See, for example, the Absoft implementations of 'getarg'.
2400   We could also contact PGI about the Portland Group compilers, and of
2401   course the 'g77' source code is available.
2402   Does each process have the same values for the environment variables
2403   when this routine returns?
2404 
2405   If we don''t require that all processes get the same argument list,
2406   we need to find out if they did anyway so that 'MPI_Init_thread' can
2407   fixup the list for the user.  This argues for another return value that
2408   flags how much of the environment the 'MPID_Init' routine set up
2409   so that the 'MPI_Init_thread' call can provide the rest.  The reason
2410   for this is that, even though the MPI standard does not require it,
2411   a user-friendly implementation should, in the SPMD mode, give each
2412   process the same environment and argument lists unless the user
2413   explicitly directed otherwise.
2414 
2415   How does this interface to PMI?  Do we need to know anything?  Should
2416   this call have an info argument to support PMI?
2417 
2418   The following questions involve how environment variables and command
2419   line arguments are used to control the behavior of the implementation.
2420   Many of these values must be determined at the time that 'MPID_Init'
2421   is called.  These all should be considered in the context of the
2422   parameter routines described in the MPICH2 Design Document.
2423 
2424   Are there recommended environment variable names?  For example, in ADI-2,
2425   there are many debugging options that are part of the common device.
2426   In MPI-2, we can''t require command line arguments, so any such options
2427   must also have environment variables.  E.g., 'MPICH_ADI_DEBUG' or
2428   'MPICH_ADI_DB'.
2429 
2430   Names that are explicitly prohibited?  For example, do we want to
2431   reserve any names that 'MPI_Init_thread' (as opposed to 'MPID_Init')
2432   might use?
2433 
2434   How does information on command-line arguments and environment variables
2435   recognized by the device get added to the documentation?
2436 
2437   What about control for other impact on the environment?  For example,
2438   what signals should the device catch (e.g., 'SIGFPE'? 'SIGTRAP'?)?
2439   Which of these should be optional (e.g., ignore or leave signal alone)
2440   or selectable (e.g., port to listen on)?  For example, catching 'SIGTRAP'
2441   causes problems for 'gdb', so we''d like to be able to leave 'SIGTRAP'
2442   unchanged in some cases.
2443 
2444   Another environment variable should control whether fault-tolerance is
2445   desired.  If fault-tolerance is selected, then some collective operations
2446   will need to use different algorithms and most fatal errors detected by the
2447   MPI implementation should abort only the affected process, not all processes.
2448   @*/
int MPID_Init(int *argc_p, char ***argv_p, int requested,
              int *provided, int *has_args, int *has_env);
2451 
2452 /* was:
2453  int MPID_Init( int *argc_p, char ***argv_p,
2454 	       int requested, int *provided,
2455 	       MPID_Comm **parent_comm, int *has_args, int *has_env ); */
2456 
2457 /*@
2458   MPID_InitCompleted - Notify the device that the MPI_Init or MPI_Initthread
2459   call has completed setting up MPI
2460 
2461  Notes:
2462  This call allows the device to complete any setup that it wishes to perform
2463  and for which it needs to access any of the structures (such as 'MPIR_Process')
2464  that are initialized after 'MPID_Init' is called.  If the device does not need
2465  any extra operations, then it may provide either an empty function or even
2466  define this as a macro with the value 'MPI_SUCCESS'.
2467   @*/
2468 int MPID_InitCompleted( void );
2469 
2470 /*@
2471   MPID_Finalize - Perform the device-specific termination of an MPI job
2472 
2473   Return Value:
2474   'MPI_SUCCESS' or a valid MPI error code.  Normally, this routine will
2475   return 'MPI_SUCCESS'.  Only in extrordinary circumstances can this
2476   routine fail; for example, if some process stops responding during the
2477   finalize step.  In this case, 'MPID_Finalize' should return an MPI
2478   error code indicating the reason that it failed.
2479 
2480   Notes:
2481 
2482   Module:
2483   MPID_CORE
2484 
2485   Questions:
2486   Need to check the MPI-2 requirements on 'MPI_Finalize' with respect to
2487   things like which process must remain after 'MPID_Finalize' is called.
2488   @*/
2489 int MPID_Finalize(void);
2490 /*@
2491   MPID_Abort - Abort at least the processes in the specified communicator.
2492 
2493   Input Parameters:
2494 + comm        - Communicator of processes to abort
2495 . mpi_errno   - MPI error code containing the reason for the abort
2496 . exit_code   - Exit code to return to the calling environment.  See notes.
2497 - error_msg   - Optional error message
2498 
2499   Return value:
2500   'MPI_SUCCESS' or an MPI error code.  Normally, this routine should not
2501   return, since the calling process must be a member of the communicator.
2502   However, under some circumstances, the 'MPID_Abort' might fail; in this
2503   case, returning an error indication is appropriate.
2504 
2505   Notes:
2506 
2507   In a fault-tolerant MPI implementation, this operation should abort `only`
2508   the processes in the specified communicator.  Any communicator that shares
2509   processes with the aborted communicator becomes invalid.  For more
2510   details, see (paper not yet written on fault-tolerant MPI).
2511 
2512   In particular, if the communicator is 'MPI_COMM_SELF', only the calling
2513   process should be aborted.
2514 
2515   The 'exit_code' is the exit code that this particular process will
2516   attempt to provide to the 'mpiexec' or other program invocation
2517   environment.  See 'mpiexec' for a discussion of how exit codes from
2518   many processes may be combined.
2519 
2520   If the error_msg field is non-NULL this string will be used as the message
  with the abort output.  Otherwise, the output message will be based on the
2522   error message associated with the mpi_errno.
2523 
2524   An external agent that is aborting processes can invoke this with either
2525   'MPI_COMM_WORLD' or 'MPI_COMM_SELF'.  For example, if the process manager
2526   wishes to abort a group of processes, it should cause 'MPID_Abort' to
2527   be invoked with 'MPI_COMM_SELF' on each process in the group.
2528 
2529   Question:
2530   An alternative design is to provide an 'MPID_Group' instead of a
2531   communicator.  This would allow a process manager to ask the ADI
2532   to kill an entire group of processes without needing a communicator.
2533   However, the implementation of 'MPID_Abort' will either do this by
2534   communicating with other processes or by requesting the process manager
2535   to kill the processes.  That brings up this question: should
2536   'MPID_Abort' use 'PMI' to kill processes?  Should it be required to
2537   notify the process manager?  What about persistent resources (such
2538   as SYSV segments or forked processes)?
2539 
2540   This suggests that for any persistent resource, an exit handler be
2541   defined.  These would be executed by 'MPID_Abort' or 'MPID_Finalize'.
2542   See the implementation of 'MPI_Finalize' for an example of exit callbacks.
2543   In addition, code that registered persistent resources could use persistent
2544   storage (i.e., a file) to record that information, allowing cleanup
2545   utilities (such as 'mpiexec') to remove any resources left after the
2546   process exits.
2547 
2548   'MPI_Finalize' requires that attributes on 'MPI_COMM_SELF' be deleted
2549   before anything else happens; this allows libraries to attach end-of-job
2550   actions to 'MPI_Finalize'.  It is valuable to have a similar
2551   capability on 'MPI_Abort', with the caveat that 'MPI_Abort' may not
2552   guarantee that the run-on-abort routines were called.  This provides a
2553   consistent way for the MPICH implementation to handle freeing any
2554   persistent resources.  However, such callbacks must be limited since
2555   communication may not be possible once 'MPI_Abort' is called.  Further,
2556   any callbacks must guarantee that they have finite termination.
2557 
2558   One possible extension would be to allow `users` to add actions to be
2559   run when 'MPI_Abort' is called, perhaps through a special attribute value
  applied to 'MPI_COMM_SELF'.  Note that it is incorrect to call the delete
2561   functions for the normal attributes on 'MPI_COMM_SELF' because MPI
2562   only specifies that those are run on 'MPI_Finalize' (i.e., normal
2563   termination).
2564 
2565   Module:
2566   MPID_CORE
2567   @*/
2568 
/* FIXME: the 4th argument isn't part of the original design and isn't documented */
/* NOTE(review): the doc comment above does list error_msg among the input
   parameters, so the "isn't documented" part of this FIXME may be stale --
   confirm before removing it. */

/* The prototype below is compiled out by the "#if 0" guard. */
# if 0
int MPID_Abort( MPID_Comm *comm, int mpi_errno, int exit_code, const char *error_msg );
#endif
/* FIXME: Should we turn off this flag and only declare MPID_Abort in mpiutil.h? */
/* We want to also declare MPID_Abort in mpiutil.h if mpiimpl.h is not used */
/* NOTE(review): HAS_MPID_ABORT_DECL is defined even though the prototype
   above is disabled; presumably mpiutil.h supplies the real declaration --
   confirm. */
#define HAS_MPID_ABORT_DECL
2577 
2578 int MPID_Open_port(MPID_Info *, char *);
/* NOTE(review): parameter name follows the MPI-level
   MPI_Close_port(port_name) -- confirm against the device implementation. */
int MPID_Close_port(const char *port_name);
2580 
2581 /*@
2582    MPID_Comm_accept - MPID entry point for MPI_Comm_accept
2583 
2584    Input Parameters:
2585 +  port_name - port name
2586 .  info - info
2587 .  root - root
2588 -  comm - communicator
2589 
2590    Output Parameters:
2591 .  MPI_Comm *newcomm - new communicator
2592 
2593   Return Value:
2594   'MPI_SUCCESS' or a valid MPI error code.
2595 @*/
2596 int MPID_Comm_accept(const char *, MPID_Info *, int, MPID_Comm *, MPID_Comm **);
2597 
2598 /*@
2599    MPID_Comm_connect - MPID entry point for MPI_Comm_connect
2600 
2601    Input Parameters:
2602 +  port_name - port name
2603 .  info - info
2604 .  root - root
2605 -  comm - communicator
2606 
2607    Output Parameters:
2608 .  newcomm_ptr - new intercommunicator
2609 
2610   Return Value:
2611   'MPI_SUCCESS' or a valid MPI error code.
2612 @*/
2613 int MPID_Comm_connect(const char *, MPID_Info *, int, MPID_Comm *, MPID_Comm **);
2614 
2615 int MPID_Comm_disconnect(MPID_Comm *);
2616 
2617 int MPID_Comm_spawn_multiple(int, char *[], char **[], const int [], MPID_Info* [],
2618                              int, MPID_Comm *, MPID_Comm **, int []);
2619 
2620 /*@
2621   MPID_Comm_group_failed - MPID entry point for MPI_Comm_group_failed
2622 
2623   Input Parameters:
2624 . comm - communicator
2625 
2626   Output Parameters
2627 . failed_group_ptr - group of failed processes
2628 
2629   Return Value:
2630   'MPI_SUCCESS' or a valid MPI error code.
2631 @*/
2632 int MPID_Comm_group_failed(MPID_Comm *comm, MPID_Group **failed_group_ptr);
2633 
2634 /*@
2635   MPID_Comm_remote_group_failed - MPID entry point for MPI_Comm_remote_group_failed
2636 
2637   Input Parameters:
2638 . comm - intercommunicator
2639 
2640   Output Parameters
2641 . failed_group_ptr - group of failed processes in comm's remote group
2642 
2643   Return Value:
2644   'MPI_SUCCESS' or a valid MPI error code.
2645 @*/
2646 int MPID_Comm_remote_group_failed(MPID_Comm *comm, MPID_Group **failed_group_ptr);
2647 
2648 /*@
2649   MPID_Comm_reenable_anysource - MPID entry point for MPI_Comm_reenable_anysource
2650 
2651   Input Parameters:
2652 . comm - communicator
2653 
2654   Output Parameters
2655 . failed_group_ptr - group of failed processes
2656 
2657   Return Value:
2658   'MPI_SUCCESS' or a valid MPI error code.
2659 @*/
2660 int MPID_Comm_reenable_anysource(MPID_Comm *comm, MPID_Group **failed_group_ptr);
2661 
2662 /*@
2663   MPID_Send - MPID entry point for MPI_Send
2664 
2665   Notes:
2666   The only difference between this and 'MPI_Send' is that the basic
2667   error checks (e.g., valid communicator, datatype, dest, and tag)
2668   have been made, the MPI opaque objects have been replaced by
2669   MPID objects, a context id offset is provided in addition to the
2670   communicator, and a request may be returned.  The context offset is
2671   added to the context of the communicator
  to get the context id used by the message.
2673   A request is returned only if the ADI implementation was unable to
2674   complete the send of the message.  In that case, the usual 'MPI_Wait'
2675   logic should be used to complete the request.  This approach is used to
2676   allow a simple implementation of the ADI.  The ADI is free to always
2677   complete the message and never return a request.
2678 
2679   Module:
2680   Communication
2681 
2682   @*/
2683 int MPID_Send( const void *buf, int count, MPI_Datatype datatype,
2684 	       int dest, int tag, MPID_Comm *comm, int context_offset,
2685 	       MPID_Request **request );
2686 
2687 /*@
2688   MPID_Rsend - MPID entry point for MPI_Rsend
2689 
2690   Notes:
2691   The only difference between this and 'MPI_Rsend' is that the basic
2692   error checks (e.g., valid communicator, datatype, dest, and tag)
2693   have been made, the MPI opaque objects have been replaced by
2694   MPID objects, a context id offset is provided in addition to the
2695   communicator, and a request may be returned.  The context offset is
2696   added to the context of the communicator
  to get the context id used by the message.
2698   A request is returned only if the ADI implementation was unable to
2699   complete the send of the message.  In that case, the usual 'MPI_Wait'
2700   logic should be used to complete the request.  This approach is used to
2701   allow a simple implementation of the ADI.  The ADI is free to always
2702   complete the message and never return a request.
2703 
2704   Module:
2705   Communication
2706 
2707   @*/
2708 int MPID_Rsend( const void *buf, int count, MPI_Datatype datatype,
2709 		int dest, int tag, MPID_Comm *comm, int context_offset,
2710 		MPID_Request **request );
2711 
2712 /*@
2713   MPID_Ssend - MPID entry point for MPI_Ssend
2714 
2715   Notes:
2716   The only difference between this and 'MPI_Ssend' is that the basic
2717   error checks (e.g., valid communicator, datatype, dest, and tag)
2718   have been made, the MPI opaque objects have been replaced by
2719   MPID objects, a context id offset is provided in addition to the
2720   communicator, and a request may be returned.  The context offset is
2721   added to the context of the communicator
  to get the context id used by the message.
2723   A request is returned only if the ADI implementation was unable to
2724   complete the send of the message.  In that case, the usual 'MPI_Wait'
2725   logic should be used to complete the request.  This approach is used to
2726   allow a simple implementation of the ADI.  The ADI is free to always
2727   complete the message and never return a request.
2728 
2729   Module:
2730   Communication
2731 
2732   @*/
2733 int MPID_Ssend( const void *buf, int count, MPI_Datatype datatype,
2734 		int dest, int tag, MPID_Comm *comm, int context_offset,
2735 		MPID_Request **request );
2736 
2737 /*@
2738   MPID_tBsend - Attempt a send and return if it would block
2739 
2740   Notes:
2741   This has the semantics of 'MPI_Bsend', except that it returns the internal
2742   error code 'MPID_WOULD_BLOCK' if the message can''t be sent immediately
2743   (t is for "try").
2744 
2745   The reason that this interface is chosen over a query to check whether
2746   a message `can` be sent is that the query approach is not
2747   thread-safe.  Since the decision on whether a message can be sent
2748   without blocking depends (among other things) on the state of flow
2749   control managed by the device, this approach also gives the device
2750   the greatest freedom in implementing flow control.  In particular,
2751   if another MPI process can change the flow control parameters, then
2752   even in a single-threaded implementation, it would not be safe to
2753   return, for example, a message size that could be sent with 'MPI_Bsend'.
2754 
2755   This routine allows an MPI implementation to optimize 'MPI_Bsend'
2756   for the case when the message can be delivered without blocking the
2757   calling process.  An ADI implementation is free to have this routine
2758   always return 'MPID_WOULD_BLOCK', but is encouraged not to.
2759 
2760   To allow the MPI implementation to avoid trying this routine when it
2761   is not implemented by the ADI, the C preprocessor constant 'MPID_HAS_TBSEND'
2762   should be defined if this routine has a nontrivial implementation.
2763 
2764   This is an optional routine.  The MPI code for 'MPI_Bsend' will attempt
2765   to call this routine only if the device defines 'MPID_HAS_TBSEND'.
2766 
2767   Module:
2768   Communication
2769   @*/
2770 int MPID_tBsend( const void *buf, int count, MPI_Datatype datatype,
2771 		 int dest, int tag, MPID_Comm *comm, int context_offset );
2772 
2773 /*@
2774   MPID_Isend - MPID entry point for MPI_Isend
2775 
2776   Notes:
2777   The only difference between this and 'MPI_Isend' is that the basic
2778   error checks (e.g., valid communicator, datatype, dest, and tag)
2779   have been made, the MPI opaque objects have been replaced by
2780   MPID objects, and a context id offset is provided in addition to the
2781   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2783 
2784   Module:
2785   Communication
2786 
2787   @*/
2788 int MPID_Isend( const void *buf, int count, MPI_Datatype datatype,
2789 		int dest, int tag, MPID_Comm *comm, int context_offset,
2790 		MPID_Request **request );
2791 
2792 /*@
2793   MPID_Irsend - MPID entry point for MPI_Irsend
2794 
2795   Notes:
2796   The only difference between this and 'MPI_Irsend' is that the basic
2797   error checks (e.g., valid communicator, datatype, dest, and tag)
2798   have been made, the MPI opaque objects have been replaced by
2799   MPID objects, and a context id offset is provided in addition to the
2800   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2802 
2803   Module:
2804   Communication
2805 
2806   @*/
2807 int MPID_Irsend( const void *buf, int count, MPI_Datatype datatype,
2808 		 int dest, int tag, MPID_Comm *comm, int context_offset,
2809 		 MPID_Request **request );
2810 
2811 /*@
2812   MPID_Issend - MPID entry point for MPI_Issend
2813 
2814   Notes:
2815   The only difference between this and 'MPI_Issend' is that the basic
2816   error checks (e.g., valid communicator, datatype, dest, and tag)
2817   have been made, the MPI opaque objects have been replaced by
2818   MPID objects, and a context id offset is provided in addition to the
2819   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2821 
2822   Module:
2823   Communication
2824 
2825   @*/
2826 int MPID_Issend( const void *buf, int count, MPI_Datatype datatype,
2827 		 int dest, int tag, MPID_Comm *comm, int context_offset,
2828 		 MPID_Request **request );
2829 
2830 /*@
2831   MPID_Recv - MPID entry point for MPI_Recv
2832 
2833   Notes:
2834   The only difference between this and 'MPI_Recv' is that the basic
2835   error checks (e.g., valid communicator, datatype, source, and tag)
2836   have been made, the MPI opaque objects have been replaced by
2837   MPID objects, a context id offset is provided in addition to the
2838   communicator, and a request may be returned.  The context offset is added
  to the context of the communicator to get the context id used by the message.
2840   As in 'MPID_Send', the request is returned only if the operation did not
2841   complete.  Conversely, the status object is populated with valid information
2842   only if the operation completed.
2843 
2844   Module:
2845   Communication
2846 
2847   @*/
2848 int MPID_Recv( void *buf, int count, MPI_Datatype datatype,
2849 	       int source, int tag, MPID_Comm *comm, int context_offset,
2850 	       MPI_Status *status, MPID_Request **request );
2851 
2852 
2853 /*@
2854   MPID_Irecv - MPID entry point for MPI_Irecv
2855 
2856   Notes:
2857   The only difference between this and 'MPI_Irecv' is that the basic
2858   error checks (e.g., valid communicator, datatype, source, and tag)
2859   have been made, the MPI opaque objects have been replaced by
2860   MPID objects, and a context id offset is provided in addition to the
2861   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2863 
2864   Module:
2865   Communication
2866 
2867   @*/
2868 int MPID_Irecv( void *buf, int count, MPI_Datatype datatype,
2869 		int source, int tag, MPID_Comm *comm, int context_offset,
2870 		MPID_Request **request );
2871 
2872 /*@
2873   MPID_Send_init - MPID entry point for MPI_Send_init
2874 
2875   Notes:
2876   The only difference between this and 'MPI_Send_init' is that the basic
2877   error checks (e.g., valid communicator, datatype, dest, and tag)
2878   have been made, the MPI opaque objects have been replaced by
2879   MPID objects, and a context id offset is provided in addition to the
2880   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2882 
2883   Module:
2884   Communication
2885 
2886   @*/
2887 int MPID_Send_init( const void *buf, int count, MPI_Datatype datatype,
2888 		    int dest, int tag, MPID_Comm *comm, int context_offset,
2889 		    MPID_Request **request );
2890 
2891 int MPID_Bsend_init(const void *, int, MPI_Datatype, int, int, MPID_Comm *,
2892 		   int, MPID_Request **);
2893 /*@
2894   MPID_Rsend_init - MPID entry point for MPI_Rsend_init
2895 
2896   Notes:
2897   The only difference between this and 'MPI_Rsend_init' is that the basic
2898   error checks (e.g., valid communicator, datatype, dest, and tag)
2899   have been made, the MPI opaque objects have been replaced by
2900   MPID objects, and a context id offset is provided in addition to the
2901   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2903 
2904   Module:
2905   Communication
2906 
2907   @*/
2908 int MPID_Rsend_init( const void *buf, int count, MPI_Datatype datatype,
2909 		     int dest, int tag, MPID_Comm *comm, int context_offset,
2910 		     MPID_Request **request );
2911 /*@
2912   MPID_Ssend_init - MPID entry point for MPI_Ssend_init
2913 
2914   Notes:
2915   The only difference between this and 'MPI_Ssend_init' is that the basic
2916   error checks (e.g., valid communicator, datatype, dest, and tag)
2917   have been made, the MPI opaque objects have been replaced by
2918   MPID objects, and a context id offset is provided in addition to the
2919   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2921 
2922   Module:
2923   Communication
2924 
2925   @*/
2926 int MPID_Ssend_init( const void *buf, int count, MPI_Datatype datatype,
2927 		     int dest, int tag, MPID_Comm *comm, int context_offset,
2928 		     MPID_Request **request );
2929 
2930 /*@
2931   MPID_Recv_init - MPID entry point for MPI_Recv_init
2932 
2933   Notes:
2934   The only difference between this and 'MPI_Recv_init' is that the basic
2935   error checks (e.g., valid communicator, datatype, source, and tag)
2936   have been made, the MPI opaque objects have been replaced by
2937   MPID objects, and a context id offset is provided in addition to the
2938   communicator.  This offset is added to the context of the communicator
  to get the context id used by the message.
2940 
2941   Module:
2942   Communication
2943 
2944   @*/
2945 int MPID_Recv_init( void *buf, int count, MPI_Datatype datatype,
2946 		    int source, int tag, MPID_Comm *comm, int context_offset,
2947 		    MPID_Request **request );
2948 
2949 /*@
2950   MPID_Startall - MPID entry point for MPI_Startall
2951 
2952   Notes:
2953   The only difference between this and 'MPI_Startall' is that the basic
2954   error checks (e.g., count) have been made, and the MPI opaque objects
2955   have been replaced by pointers to MPID objects.
2956 
2957   Rationale:
2958   This allows the device to schedule communication involving multiple requests,
2959   whereas an implementation built on just 'MPID_Start' would force the
2960   ADI to initiate the communication in the order encountered.
2961   @*/
2962 int MPID_Startall(int count, MPID_Request *requests[]);
2963 
2964 /*@
2965    MPID_Probe - Block until a matching request is found and return information
2966    about it
2967 
2968   Input Parameters:
2969 + source - rank to match (or 'MPI_ANY_SOURCE')
2970 . tag - Tag to match (or 'MPI_ANY_TAG')
2971 . comm - communicator to match.
2972 - context_offset - context id offset of communicator to match
2973 
2974   Output Parameter:
2975 . status - 'MPI_Status' set as defined by 'MPI_Probe'
2976 
2977 
2978   Return Value:
2979   Error code.
2980 
2981   Notes:
2982   Note that the values returned in 'status' will be valid for a subsequent
2983   MPI receive operation only if no other thread attempts to receive the same
2984   message.
2985   (See the
2986   discussion of probe in Section 8.7.2 Clarifications of the MPI-2 standard.)
2987 
2988   Providing the 'context_offset' is necessary at this level to support the
2989   way in which the MPICH implementation uses context ids in the implementation
2990   of other operations.  The communicator is present to allow the device
2991   to use message-queues attached to particular communicators or connections
2992   between processes.
2993 
2994   Module:
2995   Request
2996 
2997   @*/
2998 int MPID_Probe(int, int, MPID_Comm *, int, MPI_Status *);
2999 /*@
3000    MPID_Iprobe - Look for a matching request in the receive queue
3001    but do not remove or return it
3002 
3003   Input Parameters:
3004 + source - rank to match (or 'MPI_ANY_SOURCE')
3005 . tag - Tag to match (or 'MPI_ANY_TAG')
3006 . comm - communicator to match.
3007 - context_offset - context id offset of communicator to match
3008 
3009   Output Parameter:
3010 + flag - true if a matching request was found, false otherwise.
3011 - status - 'MPI_Status' set as defined by 'MPI_Iprobe' (only valid when return
3012   flag is true).
3013 
3014   Return Value:
3015   Error Code.
3016 
3017   Notes:
3018   Note that the values returned in 'status' will be valid for a subsequent
3019   MPI receive operation only if no other thread attempts to receive the same
3020   message.
3021   (See the
3022   discussion of probe in Section 8.7.2 (Clarifications) of the MPI-2 standard.)
3023 
3024   Providing the 'context_offset' is necessary at this level to support the
3025   way in which the MPICH implementation uses context ids in the implementation
3026   of other operations.  The communicator is present to allow the device
3027   to use message-queues attached to particular communicators or connections
3028   between processes.
3029 
3030   Devices that rely solely on polling to make progress should call
3031   MPID_Progress_poke() (or some equivalent function) if a matching request
  could not be found.  This ensures that progress continues to be made even if
3033   the application is calling MPI_Iprobe() from within a loop not containing
3034   calls to any other MPI functions.
3035 
3036   Module:
3037   Request
3038 
3039   @*/
3040 int MPID_Iprobe(int, int, MPID_Comm *, int, int *, MPI_Status *);
3041 
3042 /*@
3043    MPID_Mprobe - Block until a matching request is found and return information
3044    about it, including a message handle for later reception.
3045 
3046   Input Parameters:
3047 + source - rank to match (or 'MPI_ANY_SOURCE')
3048 . tag - Tag to match (or 'MPI_ANY_TAG')
3049 . comm - communicator to match.
3050 - context_offset - context id offset of communicator to match
3051 
3052   Output Parameter:
3053 + message - 'MPID_Request' (logically a message) set as defined by 'MPI_Mprobe'
3054 - status - 'MPI_Status' set as defined by 'MPI_Mprobe'
3055 
3056   Return Value:
3057   Error code.
3058 
3059   Providing the 'context_offset' is necessary at this level to support the
3060   way in which the MPICH implementation uses context ids in the implementation
3061   of other operations.  The communicator is present to allow the device
3062   to use message-queues attached to particular communicators or connections
3063   between processes.
3064 
3065   Module:
3066   Request
3067 
3068   @*/
3069 int MPID_Mprobe(int source, int tag, MPID_Comm *comm, int context_offset,
3070                 MPID_Request **message, MPI_Status *status);
3071 
3072 /*@
3073    MPID_Improbe - Look for a matching request in the receive queue and return
3074    information about it, including a message handle for later reception.
3075 
3076   Input Parameters:
3077 + source - rank to match (or 'MPI_ANY_SOURCE')
3078 . tag - Tag to match (or 'MPI_ANY_TAG')
3079 . comm - communicator to match.
3080 - context_offset - context id offset of communicator to match
3081 
3082   Output Parameter:
3083 + flag - 'flag' set as defined by 'MPI_Improbe'
3084 . message - 'MPID_Request' (logically a message) set as defined by 'MPI_Improbe'
3085 - status - 'MPI_Status' set as defined by 'MPI_Improbe'
3086 
3087   Return Value:
3088   Error code.
3089 
3090   Providing the 'context_offset' is necessary at this level to support the
3091   way in which the MPICH implementation uses context ids in the implementation
3092   of other operations.  The communicator is present to allow the device
3093   to use message-queues attached to particular communicators or connections
3094   between processes.
3095 
3096   Module:
3097   Request
3098 
3099   @*/
3100 int MPID_Improbe(int source, int tag, MPID_Comm *comm, int context_offset,
3101                  int *flag, MPID_Request **message, MPI_Status *status);
3102 
3103 /*@
3104    MPID_Imrecv - Begin receiving the message indicated by the given message
3105    handle and return a request object for later completion.
3106 
3107   Input Parameters:
3108 + count - number of elements to receive
3109 . datatype - datatype of each recv buffer element
3110 - message - 'MPID_Request' (logically a message) set as defined by 'MPI_Mprobe'
3111 
3112   Output Parameter:
3113 + buf - receive buffer
- rreqp - request object for completing the recv
3115 
3116   Return Value:
3117   Error code.
3118 
3119   Module:
3120   Request
3121 
3122   NOTE: under most implementations the request object returned will
3123   probably be some modified version of the "message" object passed in.
3124 
3125   @*/
3126 int MPID_Imrecv(void *buf, int count, MPI_Datatype datatype,
3127                 MPID_Request *message, MPID_Request **rreqp);
3128 
3129 /*@
3130    MPID_Mrecv - Receive the message indicated by the given message handle.
3131 
3132   Input Parameters:
3133 + count - number of elements to receive
3134 . datatype - datatype of each recv buffer element
3135 - message - 'MPID_Request' (logically a message) set as defined by 'MPI_Mprobe'
3136 
3137   Output Parameter:
3138 + buf - receive buffer
3139 - status - 'MPI_Status' set as defined by 'MPI_Mrecv'
3140 
3141   Return Value:
3142   Error code.
3143 
3144   Module:
3145   Request
3146 
  NOTE: unlike 'MPID_Imrecv', this routine returns no request object to the
  caller; the outcome of the receive is reported through 'status'.
3149 
3150   @*/
3151 int MPID_Mrecv(void *buf, int count, MPI_Datatype datatype,
3152                MPID_Request *message, MPI_Status *status);
3153 
3154 /*@
3155   MPID_Cancel_send - Cancel the indicated send request
3156 
3157   Input Parameter:
3158 . request - Send request to cancel
3159 
3160   Return Value:
3161   MPI error code.
3162 
3163   Notes:
3164   Cancel is a tricky operation, particularly for sends.  Read the
3165   discussion in the MPI-1 and MPI-2 documents carefully.  This call
3166   only requests that the request be cancelled; a subsequent wait
3167   or test must first succeed (i.e., the request completion counter must be
3168   zeroed).
3169 
3170   Module:
3171   Request
3172 
3173   @*/
3174 int MPID_Cancel_send(MPID_Request *);
3175 /*@
3176   MPID_Cancel_recv - Cancel the indicated recv request
3177 
3178   Input Parameter:
3179 . request - Receive request to cancel
3180 
3181   Return Value:
3182   MPI error code.
3183 
3184   Notes:
3185   This cancels a pending receive request.  In many cases, this is implemented
3186   by simply removing the request from a pending receive request queue.
3187   However, some ADI implementations may maintain these queues in special
3188   places, such as within a NIC (Network Interface Card).
3189   This call only requests that the request be cancelled; a subsequent wait
3190   or test must first succeed (i.e., the request completion counter must be
3191   zeroed).
3192 
3193   Module:
3194   Request
3195 
3196   @*/
3197 int MPID_Cancel_recv(MPID_Request *);
3198 
3199 /* MPI-2 RMA Routines */
3200 
3201 int MPID_Win_create(void *, MPI_Aint, int, MPID_Info *, MPID_Comm *,
3202                     MPID_Win **);
3203 int MPID_Win_free(MPID_Win **);
3204 
3205 int MPID_Put(void *, int, MPI_Datatype, int, MPI_Aint, int,
3206              MPI_Datatype, MPID_Win *);
3207 int MPID_Get(void *, int, MPI_Datatype, int, MPI_Aint, int,
3208              MPI_Datatype, MPID_Win *);
3209 int MPID_Accumulate(void *, int, MPI_Datatype, int, MPI_Aint, int,
3210                     MPI_Datatype, MPI_Op, MPID_Win *);
3211 
3212 int MPID_Win_fence(int, MPID_Win *);
3213 int MPID_Win_post(MPID_Group *group_ptr, int assert, MPID_Win *win_ptr);
3214 int MPID_Win_start(MPID_Group *group_ptr, int assert, MPID_Win *win_ptr);
3215 int MPID_Win_test(MPID_Win *win_ptr, int *flag);
3216 int MPID_Win_wait(MPID_Win *win_ptr);
3217 int MPID_Win_complete(MPID_Win *win_ptr);
3218 
3219 int MPID_Win_lock(int lock_type, int dest, int assert, MPID_Win *win_ptr);
3220 int MPID_Win_unlock(int dest, MPID_Win *win_ptr);
3221 
3222 /* MPI-3 RMA Routines */
3223 
3224 int MPID_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info *info,
3225                       MPID_Comm *comm, void *baseptr, MPID_Win **win);
3226 int MPID_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Info *info_ptr, MPID_Comm *comm_ptr,
3227                              void **base_ptr, MPID_Win **win_ptr);
3228 int MPID_Win_shared_query(MPID_Win *win, int rank, MPI_Aint *size, int *disp_unit,
3229                           void *baseptr);
3230 int MPID_Win_create_dynamic(MPID_Info *info, MPID_Comm *comm, MPID_Win **win);
3231 int MPID_Win_attach(MPID_Win *win, void *base, MPI_Aint size);
3232 int MPID_Win_detach(MPID_Win *win, const void *base);
3233 
3234 int MPID_Get_accumulate(const void *origin_addr, int origin_count,
3235                         MPI_Datatype origin_datatype, void *result_addr, int result_count,
3236                         MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
3237                         int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win);
3238 int MPID_Fetch_and_op(const void *origin_addr, void *result_addr,
3239                       MPI_Datatype datatype, int target_rank, MPI_Aint target_disp,
3240                       MPI_Op op, MPID_Win *win);
3241 int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
3242                           void *result_addr, MPI_Datatype datatype, int target_rank,
3243                           MPI_Aint target_disp, MPID_Win *win);
3244 int MPID_Rput(const void *origin_addr, int origin_count,
3245               MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
3246               int target_count, MPI_Datatype target_datatype, MPID_Win *win,
3247               MPID_Request **request);
3248 int MPID_Rget(void *origin_addr, int origin_count,
3249               MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
3250               int target_count, MPI_Datatype target_datatype, MPID_Win *win,
3251               MPID_Request **request);
3252 int MPID_Raccumulate(const void *origin_addr, int origin_count,
3253                      MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
3254                      int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win,
3255                      MPID_Request **request);
3256 int MPID_Rget_accumulate(const void *origin_addr, int origin_count,
3257                          MPI_Datatype origin_datatype, void *result_addr, int result_count,
3258                          MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
3259                          int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win,
3260                          MPID_Request **request);
3261 
/* Remaining MPI-3 window synchronization entry points.  Presumably each one
   is the device counterpart of the MPI_Win_* routine with the same suffix
   (lock_all, unlock_all, flush, ...) -- confirm against the device code. */
int MPID_Win_lock_all(int assert, MPID_Win *win);
int MPID_Win_unlock_all(MPID_Win *win);
/* The two flush variants taking 'rank' act on one rank; the *_all variants
   take only the window. */
int MPID_Win_flush(int rank, MPID_Win *win);
int MPID_Win_flush_all(MPID_Win *win);
int MPID_Win_flush_local(int rank, MPID_Win *win);
int MPID_Win_flush_local_all(MPID_Win *win);
int MPID_Win_sync(MPID_Win *win);
3269 
3270 
3271 /*@
3272   MPID_Progress_start - Begin a block of operations that check the completion
3273   counters in requests.
3274 
3275   Input parameters:
3276 . state - pointer to a progress state variable
3277 
3278   Notes:
3279   This routine is informs the progress engine that a block of code follows that
3280   will examine the completion counter of some 'MPID_Request' objects and then
3281   call 'MPID_Progress_wait' zero or more times followed by a call to
3282   'MPID_Progress_end'.
3283 
3284   The progress state variable must be specific to the thread calling it.  If at
3285   all possible, the state should be declared as an auto variable and thus
3286   allocated on the stack of the current thread.  Thread specific storage could
3287   be used instead, but doing such would incur additional (and typically
3288   unnecessary) overhead.
3289 
3290   This routine is needed to properly implement blocking tests when
3291   multithreaded progress engines are used.  In a single-threaded implementation
3292   of the ADI, this may be defined as an empty macro.
3293 
3294   Module:
3295   Communication
3296   @*/
3297 void MPID_Progress_start(MPID_Progress_state * state);
3298 /*@
3299   MPID_Progress_wait - Wait for some communication since 'MPID_Progress_start'
3300 
3301     Input parameters:
3302 .   state - pointer to the progress state initialized by MPID_Progress_start
3303 
3304     Return value:
3305     An mpi error code.
3306 
3307     Notes:
3308     This instructs the progress engine to wait until some communication event
3309     happens since 'MPID_Progress_start' was called.  This call blocks the
3310     calling thread (only, not the process).
3311 
3312   Module:
3313   Communication
3314  @*/
3315 int MPID_Progress_wait(MPID_Progress_state * state);
3316 /*@
3317   MPID_Progress_end - End a block of operations begun with 'MPID_Progress_start'
3318 
3319   Input parameters:
3320   . state - pointer to the progress state variable passed to
3321     'MPID_Progress_start'
3322 
3323    Notes:
3324    This routine instructs the progress engine to end the block begun with
3325    'MPID_Progress_start'.  The progress engine is not required to check for any
3326    pending communication.
3327 
3328    The purpose of this call is to release any locks initiated by
3329    'MPID_Progess_start' or 'MPID_Progess_wait'.  In a single threaded ADI
3330    implementation, this may be defined as an empty macro.
3331 
3332   Module:
3333   Communication
3334    @*/
3335 void MPID_Progress_end(MPID_Progress_state * stae);
3336 /*@
3337   MPID_Progress_test - Check for communication
3338 
3339   Return value:
3340   An mpi error code.
3341 
3342   Notes:
3343   Unlike 'MPID_Progress_wait', this routine is nonblocking.  Therefore, it
3344   does not require the use of 'MPID_Progress_start' and 'MPID_Progress_end'.
3345 
3346   Module:
3347   Communication
3348   @*/
3349 int MPID_Progress_test(void);
3350 /*@
3351   MPID_Progress_poke - Allow a progress engine to check for pending
3352   communication
3353 
3354   Return value:
3355   An mpi error code.
3356 
3357   Notes:
3358   This routine provides a way to invoke the progress engine in a polling
3359   implementation of the ADI.  This routine must be nonblocking.
3360 
3361   A multithreaded implementation is free to define this as an empty macro.
3362 
3363   Module:
3364   Communication
3365   @*/
3366 int MPID_Progress_poke(void);
3367 
3368 /*@
3369   MPID_Request_create - Create and return a bare request
3370 
3371   Return value:
3372   A pointer to a new request object.
3373 
3374   Notes:
3375   This routine is intended for use by 'MPI_Grequest_start' only.  Note that
3376   once a request is created with this routine, any progress engine must assume
3377   that an outside function can complete a request with
3378   'MPID_Request_set_completed'.
3379 
3380   The request object returned by this routine should be initialized such that
3381   ref_count is one and handle contains a valid handle referring to the object.
3382   @*/
3383 MPID_Request * MPID_Request_create(void);
3384 void MPID_Request_set_completed(MPID_Request *);
3385 /*@
3386   MPID_Request_release - Release a request
3387 
3388   Input Parameter:
3389 . request - request to release
3390 
3391   Notes:
3392   This routine is called to release a reference to request object.  If
3393   the reference count of the request object has reached zero, the object will
3394   be deallocated.
3395 
3396   Module:
3397   Request
3398 @*/
3399 void MPID_Request_release(MPID_Request *);
3400 
/* Record describing a class of generalized requests: an object header plus
   the set of callbacks associated with the class.  The three MPI_Grequest_*
   function types are the standard generalized-request callbacks; the two
   MPIX_Grequest_* types are extensions (poll/wait). */
typedef struct MPID_Grequest_class {
     MPIU_OBJECT_HEADER; /* adds handle and ref_count fields */
     MPI_Grequest_query_function *query_fn;   /* query callback */
     MPI_Grequest_free_function *free_fn;     /* free callback */
     MPI_Grequest_cancel_function *cancel_fn; /* cancel callback */
     MPIX_Grequest_poll_function *poll_fn;    /* poll callback (MPIX extension) */
     MPIX_Grequest_wait_function *wait_fn;    /* wait callback (MPIX extension) */
     /* link to another class record; presumably chains all classes into a
        singly linked list -- confirm against the code that sets it */
     struct MPID_Grequest_class *next;
} MPID_Grequest_class;
3410 
3411 
/* types and other internal definitions that must be kept out of mpi.h */
3413 
/* forward decls */
/* The full definition of struct MPIR_T_pvar_info appears further down. */
struct MPIR_T_pvar_info;

/* Placeholder for the enumeration type; it currently carries no real data
   (see the TODO below). */
struct MPIR_T_enum {
    /* TODO replace this struct's contents with a real implementation once we
     * actually have/support an enum type */
    int dummy;
};
/* Handle for a control variable: a thin wrapper around a pointer to a
 * struct MPIR_Param_t (not defined in this section). */
3422 struct MPIR_T_cvar_handle {
3423     struct MPIR_Param_t *p; /* NOTE(review): presumably the parameter this
                                 * handle refers to -- confirm */
3424 };
/* A performance-variable session: currently just the head of the list of
 * pvar handles attached to it. */
3425 struct MPIR_T_pvar_session {
3426     /* a utlist-managed list (see mpl_utlist.h) -- _must_ be initialized to NULL
3427      * at alloc time or the macros won't work */
3428     struct MPIR_T_pvar_handle *hlist;
3429 
3430     /* TODO does anything else need to go in here at this stage? */
3431 };
3432 
3433 
/* Function type for a pvar handle-creation callback; struct MPIR_T_pvar_info
 * stores one as create_fn among its "fields for handle_alloc".
 * NOTE(review): parameter roles are inferred from their names only --
 * obj_handle is presumably the object the pvar is bound to, handle the handle
 * being set up, and *countp an output count.  Confirm against an
 * implementation. */
3434 typedef int MPIR_T_pvar_handle_creator_fn(void *obj_handle,
3435                                           struct MPIR_T_pvar_handle *handle,
3436                                           int *countp);
/* Selects how a pvar's reads and writes are carried out; stored in the
 * impl_kind member of struct MPIR_T_pvar_info. */
3437 enum MPIR_T_pvar_impl_kind {
3438     /* generic read/write impl is fine, just deref pointer */
3439     MPIR_T_PVAR_IMPL_SIMPLE,
3440 
3441     /* read/write are dispatched to callbacks to load/store variable values */
3442     MPIR_T_PVAR_IMPL_CB
3443 };
3444 
3445 /* These are descriptors that lower level initialization code creates and feeds
3446  * into the overall MPIX_T_pvar_ system in order to permit the upper level code
3447  * to implement MPIX_T_pvar_{get_num,get_info,handle_alloc}.  The members mirror
 * the arguments of MPIR_T_pvar_add, declared later in this file. */
3448 struct MPIR_T_pvar_info {
3449     int idx; /* pvar index value for pvar_get_info and friends */
3450 
3451     /* fields for get_info */
3452     char *name;
3453     enum MPIR_T_verbosity_t verbosity;
3454     enum MPIR_T_pvar_class_t varclass;
3455     MPI_Datatype dtype;
3456     struct MPIR_T_enum *etype;
3457     char *desc;
3458     enum MPIR_T_bind_t binding;
3459     int readonly;   /* NOTE(review): readonly/continuous/atomic look like */
3460     int continuous; /* boolean flags stored as int -- confirm             */
3461     int atomic;
3462 
3463     /* fields for handle_alloc */
3464     enum MPIR_T_pvar_impl_kind impl_kind; /* SIMPLE (deref) or CB (callbacks) */
3465     void *var_state; /* opaque per-variable state; meaning depends on the pvar */
3466     MPIR_T_pvar_handle_creator_fn *create_fn; /* handle-creation callback */
3467 };
3468 
/* State for one allocated pvar handle.  Handles are doubly linked (next/prev)
 * into the hlist of the session they belong to. */
3469 struct MPIR_T_pvar_handle {
3470     /* for linked list of handles attached to the pvar_session */
3471     struct MPIR_T_pvar_handle *next;
3472     struct MPIR_T_pvar_handle *prev;
3473 
3474     struct MPIR_T_pvar_info *info;       /* descriptor of the underlying pvar */
3475     struct MPIR_T_pvar_session *session; /* session this handle is attached to */
3476     int count; /* NOTE(review): presumably the element count reported at
                * handle creation (cf. countp in the creator callback) -- confirm */
3477 
3478     int bytes; /* for _IMPL_SIMPLE */
3479 
3480     /* for _IMPL_CB types this vtable prevents us from having to duplicate
3481      * multiple sets of function pointers in each handle at the expense of an
3482      * extra pointer indirection */
3483     struct MPIR_T_pvar_hnd_vtable *vtable;
3484     void *handle_state; /* opaque per-handle state */
3485     int free_handle_state; /* boolean -- true iff the "handle_state" pointer
3486                             * should be freed when this handle is freed */
3487 };
3488 
3489 /* vtable structure for handle "objects".  Implements all major handle operations.
 * Every entry receives the session and the handle and returns int; read, write
 * and readreset additionally take a caller-supplied buffer. */
3490 struct MPIR_T_pvar_hnd_vtable {
3491     int (*free)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle);
3492     int (*start)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle);
3493     int (*stop)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle);
3494     int (*read)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle, void *buf);
3495     int (*write)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle, void *buf);
3496     int (*reset)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle);
3497     int (*readreset)(struct MPIR_T_pvar_session *session, struct MPIR_T_pvar_handle *handle, void *buf);
3498 };
3499 
3500 /* Called by lower-level initialization code to add pvars to the global list.
3501  * Will cause the value returned by MPIX_T_pvar_get_num to be incremented and
3502  * sets up that new index to work with get_info, handle_alloc, etc.
 *
 * The arguments from name through create_fn correspond one-to-one to the
 * members of struct MPIR_T_pvar_info (enumtype -> etype, bind -> binding).
 * NOTE(review): index_p presumably receives the index assigned to the new
 * pvar, and the int result is presumably an MPI error code -- confirm. */
3503 int MPIR_T_pvar_add(const char *name,
3504                     enum MPIR_T_verbosity_t verbosity,
3505                     enum MPIR_T_pvar_class_t varclass,
3506                     MPI_Datatype dtype,
3507                     struct MPIR_T_enum *enumtype,
3508                     const char *desc,
3509                     enum MPIR_T_bind_t bind,
3510                     int readonly,
3511                     int continuous,
3512                     int atomic,
3513                     enum MPIR_T_pvar_impl_kind impl_kind,
3514                     void *var_state,
3515                     MPIR_T_pvar_handle_creator_fn *create_fn,
3516                     int *index_p);
3517 
/* Helpers for the registered pvar list.
 * NOTE(review): the per-line descriptions below are inferred from names and
 * signatures only; confirm against the definitions. */
3518 int MPIR_T_get_num_pvars(int *num); /* presumably stores the pvar count in *num */
3519 int MPIR_T_get_pvar_info_by_idx(int idx, struct MPIR_T_pvar_info **info_p); /* presumably looks up the descriptor for index idx */
3520 int MPIR_T_finalize_pvars(void); /* presumably tears down the pvar list */
3521 void MPIU_Tool_strncpy(char *dst, const char *src, int *len); /* len is a pointer, so it may be in/out -- confirm */
3522 
3523 /*TTopoOverview.tex
3524  *
3525  * The MPI collective and topology routines can benefit from information
3526  * about the topology of the underlying interconnect.  Unfortunately, there
3527  * is no best form for the representation (the MPI-1 Forum tried to define
3528  * such a representation, but was unable to).  One useful decomposition
3529  * that has been used in cluster environments is a hierarchical decomposition.
3530  *
3531  * The other obviously useful topology information would match the needs of
3532  * 'MPI_Cart_create'.  However, it may be simpler for the device to
3533  * implement this routine directly.
3534  *
3535  * Other useful information could be the topology information that matches
3536  * the needs of the collective operation, such as spanning trees and rings.
3537  * These may be added to ADI3 later.
3538  *
3539  * Question: Should we define a cart create function?  Dims create?
3540  *
3541  * Usage:
3542  * This routine has nothing to do with the choice of communication method
3543  * that an implementation of the ADI may make.  It is intended only to
3544  * communicate information on the hierarchy of processes, if any, to
3545  * the implementation of the collective communication routines.  This routine
3546  * may also be useful for the MPI Graph topology functions.
3547  *
3548  T*/
3549 
3550 /*@
3551   MPID_Topo_cluster_info - Return information on the hierarchy of
3552   interconnections
3553 
3554   Input Parameter:
3555 . comm - Communicator to study.  May be 'NULL', in which case 'MPI_COMM_WORLD'
3556   is the effective communicator.
3557 
3558   Output Parameters:
3559 + levels - The number of levels in the hierarchy.
3560   To simplify the use of this routine, the maximum value is
3561   'MPID_TOPO_CLUSTER_MAX_LEVELS' (typically 8 or less).
3562 . my_cluster - For each level, the id of the cluster that the calling process
3563   belongs to.
3564 - my_rank - For each level, the rank of the calling process in its cluster
3565 
3566   Notes:
3567   This routine returns a description of the system in terms of nested
3568   clusters of processes.  Levels are numbered from zero.  At each level,
3569   each process may belong to no more than one cluster; if a process is in any
3570   cluster at level i, it must be in some cluster at level i-1.
3571 
3572   The communicator argument allows this routine to be used in the dynamic
3573   process case (i.e., with communicators that are created after 'MPI_Init'
3574   and that involve processes that are not part of 'MPI_COMM_WORLD').
3575 
3576   For non-hierarchical systems, this routine simply returns a single
3577   level containing all processes.
3578 
3579   Sample Outputs:
3580   For a single, switch-connected cluster or a uniform-memory-access (UMA)
3581   symmetric multiprocessor (SMP), the return values could be
3582 .vb
3583     level       my_cluster         my_rank
3584     0           0                  rank in comm_world
3585 .ve
3586   This is also a valid response for `any` device.
3587 
3588   For a switch-connected cluster of 2 processor SMPs
3589 .vb
3590     level       my_cluster         my_rank
3591     0           0                  rank in comm_world
3592     1           0 to p/2           0 or 1
3593 .ve
3594  where each process on the same SMP has the same value for
3595  'my_cluster[1]' and a different value for 'my_rank[1]'.
3596 
3597   For two SMPs connected by a network,
3598 .vb
3599     level       my_cluster         my_rank
3600     0           0                  rank in comm_world
3601     1           0 or 1             0 to # on SMP
3602 .ve
3603 
3604   An example with more than 2 levels is a collection of clusters, each with
3605   SMP nodes.
3606 
3607   Limitations:
3608   This approach does not provide a representation for topologies that
3609   are not hierarchical.  For example, a mesh interconnect is a single-level
3610   cluster in this view.
3611 
3612   Module:
3613   Topology
3614   @*/
3615 int MPID_Topo_cluster_info( MPID_Comm *comm,
3616 			    int *levels, int my_cluster[], int my_rank[] );
/* The description above documents the arguments but not the int result.
 * NOTE(review): presumably an MPI error code -- confirm. */
3617 
3618 /*@
3619   MPID_Get_processor_name - Return the name of the current processor
3620 
3621   Input Parameter:
3622 . namelen - Length of name
3623 
3624   Output Parameters:
3625 + name - A unique specifier for the actual (as opposed to virtual) node. This
3626   must be an array of size at least 'MPI_MAX_PROCESSOR_NAME'.
3627 - resultlen - Length (in characters) of the name.  If this pointer is null,
3628    this value is not set.
3629 
3630   Notes:
3631   The name returned should identify a particular piece of hardware;
3632   the exact format is implementation defined.  This name may or may not
3633   be the same as might be returned by 'gethostname', 'uname', or 'sysinfo'.
3634 
3635   This routine is essentially an MPID version of 'MPI_Get_processor_name' .
3636   It must be part of the device because not all environments support calls
3637   to return the processor name.  The additional argument (input name
3638   length) is used to provide better error checking and to ensure that
3639   the input buffer is large enough (rather than assuming that it is
3640   'MPI_MAX_PROCESSOR_NAME' long).
3641   @*/
3642 int MPID_Get_processor_name( char *name, int namelen, int *resultlen);
/* The description above documents the arguments but not the int result.
 * NOTE(review): presumably an MPI error code -- confirm. */
3643 
/* Takes a pointer to an error-handler object and returns nothing.
 * NOTE(review): by its name this frees/releases the object; whether it honors
 * a reference count is not visible here -- confirm. */
3644 void MPID_Errhandler_free(MPID_Errhandler *errhan_ptr);
3645 
3646 /*@
3647   MPID_Get_universe_size - Return the number of processes that the current
3648   process management environment can handle
3649 
3650   Output Parameters:
3651 . universe_size - the universe size; MPIR_UNIVERSE_SIZE_NOT_AVAILABLE if the
3652   size cannot be determined
3653 
3654   Return value:
3655   An MPI error code.
3656 @*/
3657 int MPID_Get_universe_size(int  * universe_size);
3658 
/* Sentinel universe-size values.  MPIR_UNIVERSE_SIZE_NOT_AVAILABLE is what
 * MPID_Get_universe_size reports when the size cannot be determined.
 * The replacement lists are parenthesized so the negative literal always
 * expands as a single operand inside a larger expression. */
#define MPIR_UNIVERSE_SIZE_NOT_SET (-1)
#define MPIR_UNIVERSE_SIZE_NOT_AVAILABLE (-2)
3661 
3662 /*
3663  * FIXME: VCs should not be exposed to the top layer, which implies that these routines should not be exposed either.  Instead,
3664  * the creation, duplication and destruction of communicator objects should be communicated to the device, allowing the device to
3665  * manage the underlying connections in a way that is appropriate (and efficient).
3666  */
3667 
3668 /*@
3669   MPID_VCRT_Create - Create a virtual connection reference table

  Notes:
  NOTE(review): 'size' is presumably the number of entries in the new table
  and 'vcrt_ptr' the location that receives the new table; the int result is
  presumably an MPI error code.  Confirm against the device implementation.
3670   @*/
3671 int MPID_VCRT_Create(int size, MPID_VCRT *vcrt_ptr);
3672 
3673 /*@
3674   MPID_VCRT_Add_ref - Add a reference to a VCRT

  Notes:
  The table is passed by value as an 'MPID_VCRT'.  The matching release
  routine is 'MPID_VCRT_Release'.
3675   @*/
3676 int MPID_VCRT_Add_ref(MPID_VCRT vcrt);
3677 
3678 /*@
3679   MPID_VCRT_Release - Release a reference to a VCRT
3680 
3681   Notes:
3682   The 'isDisconnect' argument allows this routine to handle the special
3683   case of 'MPI_Comm_disconnect', which needs to take special action
3684   if all references to a VC are removed.  It is an 'int' used as a flag
  (NOTE(review): presumably nonzero when called on behalf of
  'MPI_Comm_disconnect' -- confirm).
3685   @*/
3686 int MPID_VCRT_Release(MPID_VCRT vcrt, int isDisconnect);
3687 
3688 /*@
3689   MPID_VCRT_Get_ptr -

  Notes:
  NOTE(review): the summary line was left empty in the original.  From the
  signature, this appears to return through 'vc_pptr' a pointer to the
  'MPID_VCR' entries held by the table 'vcrt' -- confirm and fill in the
  summary.
3690   @*/
3691 int MPID_VCRT_Get_ptr(MPID_VCRT vcrt, MPID_VCR **vc_pptr);
3692 
3693 /*@
3694   MPID_VCR_Dup - Create a duplicate reference to a virtual connection

  Notes:
  'orig_vcr' is the existing reference; the duplicate is returned through
  'new_vcr' (assumed from the pointer type -- NOTE(review): confirm).
3695   @*/
3696 int MPID_VCR_Dup(MPID_VCR orig_vcr, MPID_VCR * new_vcr);
3697 
3698 /*@
3699    MPID_VCR_Get_lpid - Get the local process id that corresponds to a
3700    virtual connection reference.
3701 
3702    Notes:
3703    The local process ids are described elsewhere.  Basically, each one is
3704    a nonnegative number by which this process can refer to another process
3705    to which it is connected.  These are local process ids because different
3706    processes may use different ids to identify the same target process.
3707   @*/
3708 int MPID_VCR_Get_lpid(MPID_VCR vcr, int * lpid_ptr);
3709 
3710 /* prototypes and declarations for the MPID_Sched interface for nonblocking
3711  * collectives */
3712 #include "mpir_nbc.h"
3713 
3714 #include "mpiimplthreadpost.h"
3715 
3716 /* Include definitions from the device which require items defined by this
3717    file (mpiimpl.h). */
3718 #include "mpidpost.h"
3719 
3720 /* tunable parameter values */
3721 #include "mpich_param_vals.h"
3722 
3723 /* Tags for point to point operations which implement collective and other
3724    internal operations.  The fixed tags below run consecutively from 1 to 29;
   MPIR_FIRST_NBC_TAG (30) is the next value after them. */
3725 #define MPIR_BARRIER_TAG               1
3726 #define MPIR_BCAST_TAG                 2
3727 #define MPIR_GATHER_TAG                3
3728 #define MPIR_GATHERV_TAG               4
3729 #define MPIR_SCATTER_TAG               5
3730 #define MPIR_SCATTERV_TAG              6
3731 #define MPIR_ALLGATHER_TAG             7
3732 #define MPIR_ALLGATHERV_TAG            8
3733 #define MPIR_ALLTOALL_TAG              9
3734 #define MPIR_ALLTOALLV_TAG            10
3735 #define MPIR_REDUCE_TAG               11
3736 #define MPIR_USER_REDUCE_TAG          12
3737 #define MPIR_USER_REDUCEA_TAG         13
3738 #define MPIR_ALLREDUCE_TAG            14
3739 #define MPIR_USER_ALLREDUCE_TAG       15
3740 #define MPIR_USER_ALLREDUCEA_TAG      16
3741 #define MPIR_REDUCE_SCATTER_TAG       17
3742 #define MPIR_USER_REDUCE_SCATTER_TAG  18
3743 #define MPIR_USER_REDUCE_SCATTERA_TAG 19
3744 #define MPIR_SCAN_TAG                 20
3745 #define MPIR_USER_SCAN_TAG            21
3746 #define MPIR_USER_SCANA_TAG           22
3747 #define MPIR_LOCALCOPY_TAG            23
3748 #define MPIR_EXSCAN_TAG               24
3749 #define MPIR_ALLTOALLW_TAG            25
3750 #define MPIR_TOPO_A_TAG               26
3751 #define MPIR_TOPO_B_TAG               27
3752 #define MPIR_REDUCE_SCATTER_BLOCK_TAG 28
3753 #define MPIR_ERROR_TAG                29
/* NOTE(review): presumably the first tag value handed out to nonblocking
 * collectives, keeping them clear of the fixed tags above -- confirm. */
3754 #define MPIR_FIRST_NBC_TAG            30
3755 
3756 /* These functions are used in the implementation of collective and
3757    other internal operations. They are wrappers around MPID send/recv
3758    functions. They do sends/receives by setting the context offset to
3759    MPID_CONTEXT_INTRA(INTER)_COLL.

   Signature notes: the communicator is passed as an MPI_Comm handle, not an
   MPID_Comm pointer.  MPIC_Isend/MPIC_Irecv hand back an MPI_Request handle,
   whereas MPIC_Wait takes an MPID_Request pointer.  MPIR_Localcopy is
   declared in this group but takes no communicator, rank or tag. */
3760 int MPIC_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
3761               MPI_Comm comm);
3762 int MPIC_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag,
3763               MPI_Comm comm, MPI_Status *status);
3764 int MPIC_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
3765                MPI_Comm comm);
3766 int MPIC_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3767                   int dest, int sendtag, void *recvbuf, int recvcount,
3768                   MPI_Datatype recvtype, int source, int recvtag,
3769                   MPI_Comm comm, MPI_Status *status);
3770 int MPIC_Sendrecv_replace(void *buf, int count, MPI_Datatype type,
3771                           int dest, int sendtag,
3772                           int source, int recvtag,
3773                           MPI_Comm comm, MPI_Status *status);
3774 int MPIR_Localcopy(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3775                    void *recvbuf, int recvcount, MPI_Datatype recvtype);
3776 int MPIC_Irecv(void *buf, int count, MPI_Datatype datatype, int
3777                source, int tag, MPI_Comm comm, MPI_Request *request);
3778 int MPIC_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
3779                MPI_Comm comm, MPI_Request *request);
3780 int MPIC_Wait(MPID_Request * request_ptr);
3781 int MPIC_Probe(int source, int tag, MPI_Comm comm, MPI_Status *status);
3782 
3783 /* FT versions of the MPIC_ functions */
/* Each _ft prototype repeats the argument list of its MPIC_ counterpart and
 * appends a trailing int *errflag -- except MPIC_Irecv_ft, which is declared
 * without one.  MPIC_Waitall_ft operates on arrays of MPI_Request handles and
 * MPI_Status objects.
 * NOTE(review): errflag is presumably an in/out error indicator shared across
 * the calls of one collective; confirm against the definitions. */
3784 int MPIC_Send_ft(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
3785                  MPI_Comm comm, int *errflag);
3786 int MPIC_Recv_ft(void *buf, int count, MPI_Datatype datatype, int source, int tag,
3787                  MPI_Comm comm, MPI_Status *status, int *errflag);
3788 int MPIC_Ssend_ft(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
3789                   MPI_Comm comm, int *errflag);
3790 int MPIC_Sendrecv_ft(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3791                      int dest, int sendtag, void *recvbuf, int recvcount,
3792                      MPI_Datatype recvtype, int source, int recvtag,
3793                      MPI_Comm comm, MPI_Status *status, int *errflag);
3794 int MPIC_Sendrecv_replace_ft(void *buf, int count, MPI_Datatype datatype,
3795                              int dest, int sendtag,
3796                              int source, int recvtag,
3797                              MPI_Comm comm, MPI_Status *status, int *errflag);
3798 int MPIC_Isend_ft(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
3799                   MPI_Comm comm, MPI_Request *request, int *errflag);
3800 int MPIC_Irecv_ft(void *buf, int count, MPI_Datatype datatype, int source,
3801                   int tag, MPI_Comm comm, MPI_Request *request);
3802 int MPIC_Waitall_ft(int numreq, MPI_Request requests[], MPI_Status statuses[], int *errflag);
3803 
3804 
/* The fourteen predefined reduction functions (cf. MPIR_PREDEF_OP_COUNT
 * below).  All share the argument list (void *, void *, int *,
 * MPI_Datatype *); pointers to functions of this shape are what
 * MPIR_Op_table holds (MPI_User_function *).
 * NOTE(review): per the MPI standard's MPI_User_function the arguments are
 * (invec, inoutvec, len, datatype) -- confirm the names in the definitions. */
3805 void MPIR_MAXF  ( void *, void *, int *, MPI_Datatype * ) ;
3806 void MPIR_MINF  ( void *, void *, int *, MPI_Datatype * ) ;
3807 void MPIR_SUM  ( void *, void *, int *, MPI_Datatype * ) ;
3808 void MPIR_PROD  ( void *, void *, int *, MPI_Datatype * ) ;
3809 void MPIR_LAND  ( void *, void *, int *, MPI_Datatype * ) ;
3810 void MPIR_BAND  ( void *, void *, int *, MPI_Datatype * ) ;
3811 void MPIR_LOR  ( void *, void *, int *, MPI_Datatype * ) ;
3812 void MPIR_BOR  ( void *, void *, int *, MPI_Datatype * ) ;
3813 void MPIR_LXOR  ( void *, void *, int *, MPI_Datatype * ) ;
3814 void MPIR_BXOR  ( void *, void *, int *, MPI_Datatype * ) ;
3815 void MPIR_MAXLOC  ( void *, void *, int *, MPI_Datatype * ) ;
3816 void MPIR_MINLOC  ( void *, void *, int *, MPI_Datatype * ) ;
3817 void MPIR_REPLACE  ( void *, void *, int *, MPI_Datatype * ) ;
3818 void MPIR_NO_OP  ( void *, void *, int *, MPI_Datatype * ) ;
3819 
/* One datatype-check routine per predefined reduction function, each taking
 * an MPI_Datatype and returning int -- the shape of MPIR_Op_check_dtype_fn,
 * declared below.
 * NOTE(review): the int result is presumably an MPI error code indicating
 * whether the datatype is acceptable for the operation -- confirm. */
3820 int MPIR_MAXF_check_dtype  ( MPI_Datatype ) ;
3821 int MPIR_MINF_check_dtype ( MPI_Datatype ) ;
3822 int MPIR_SUM_check_dtype  ( MPI_Datatype ) ;
3823 int MPIR_PROD_check_dtype  ( MPI_Datatype ) ;
3824 int MPIR_LAND_check_dtype  ( MPI_Datatype ) ;
3825 int MPIR_BAND_check_dtype  ( MPI_Datatype ) ;
3826 int MPIR_LOR_check_dtype  ( MPI_Datatype ) ;
3827 int MPIR_BOR_check_dtype  ( MPI_Datatype ) ;
3828 int MPIR_LXOR_check_dtype ( MPI_Datatype ) ;
3829 int MPIR_BXOR_check_dtype  ( MPI_Datatype ) ;
3830 int MPIR_MAXLOC_check_dtype  ( MPI_Datatype ) ;
3831 int MPIR_MINLOC_check_dtype  ( MPI_Datatype ) ;
3832 int MPIR_REPLACE_check_dtype  ( MPI_Datatype ) ;
3833 int MPIR_NO_OP_check_dtype  ( MPI_Datatype ) ;
3834 
/* Count of predefined reduction operations (fourteen reduction functions and
 * fourteen check routines are declared above). */
3835 #define MPIR_PREDEF_OP_COUNT 14
/* Table of reduction-function pointers; defined in another translation unit. */
3836 extern MPI_User_function *MPIR_Op_table[];
3837 
/* Function type of the per-operation datatype checks, and the table of them. */
3838 typedef int (MPIR_Op_check_dtype_fn) ( MPI_Datatype );
3839 extern MPIR_Op_check_dtype_fn *MPIR_Op_check_dtype_table[];
3840 
/* Map an op handle to its table entry: the low four bits of the handle, minus
 * one, are used as the index.  No range check is performed, so a handle whose
 * low four bits are zero would index element -1; callers must pass only
 * handles for which this mapping is valid.
 * NOTE(review): presumably intended only for predefined (builtin) op handles
 * -- confirm. */
3841 #define MPIR_OP_HDL_TO_FN(op) MPIR_Op_table[((op)&0xf) - 1]
3842 #define MPIR_OP_HDL_TO_DTYPE_FN(op) MPIR_Op_check_dtype_table[((op)&0xf) - 1]
3843 
/* Smaller of two values.  Defined only if no earlier definition exists.
 * Both arguments appear twice in the expansion, so do not pass expressions
 * with side effects. */
#ifndef MPIR_MIN
#define MPIR_MIN(a,b) ( ((a) > (b)) ? (b) : (a) )
#endif /* MPIR_MIN */

/* Larger of two values.  Defined only if no earlier definition exists.
 * Both arguments appear twice in the expansion, so do not pass expressions
 * with side effects. */
#ifndef MPIR_MAX
#define MPIR_MAX(a,b) ( ((b) > (a)) ? (b) : (a) )
#endif /* MPIR_MAX */
3851 
/* NOTE(review): both descriptions are inferred from names and signatures only;
 * confirm against the definitions. */
3852 int MPIR_Type_is_rma_atomic(MPI_Datatype type); /* presumably a boolean test on the datatype */
3853 int MPIR_Compare_equal(const void *a, const void *b, MPI_Datatype type); /* presumably compares *a and *b as values of 'type'; neither is modified (const) */
3854 
/* Allgather: four entry points (_impl, unsuffixed, _intra, _inter) declared
 * with one common argument list.  They take the communicator as an MPID_Comm
 * pointer and end with an int *errflag.
 * NOTE(review): how the four variants divide the work is not visible in this
 * header -- see their definitions. */
3855 int MPIR_Allgather_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3856                         void *recvbuf, int recvcount, MPI_Datatype recvtype,
3857                         MPID_Comm *comm_ptr, int *errflag );
3858 int MPIR_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3859                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
3860                    MPID_Comm *comm_ptr, int *errflag );
3861 int MPIR_Allgather_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3862                          void *recvbuf, int recvcount, MPI_Datatype recvtype,
3863                          MPID_Comm *comm_ptr, int *errflag );
3864 int MPIR_Allgather_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3865                          void *recvbuf, int recvcount, MPI_Datatype recvtype,
3866                          MPID_Comm *comm_ptr, int *errflag );
3867 int MPIR_Allgatherv_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3868                          void *recvbuf, const int *recvcounts, const int *displs,
3869                          MPI_Datatype recvtype, MPID_Comm *comm_ptr, int *errflag );
3870 int MPIR_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3871                     void *recvbuf, const int *recvcounts, const int *displs,
3872                     MPI_Datatype recvtype, MPID_Comm *comm_ptr, int *errflag );
3873 int MPIR_Allgatherv_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3874                           void *recvbuf, const int *recvcounts, const int *displs,
3875                           MPI_Datatype recvtype, MPID_Comm *comm_pt, int *errflag );
3876 int MPIR_Allgatherv_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3877                           void *recvbuf, const int *recvcounts, const int *displs,
3878                           MPI_Datatype recvtype, MPID_Comm *comm_ptr, int *errflag );
/* Allreduce entry points (_impl, unsuffixed, _intra, _inter); all four share
 * the argument list (sendbuf, recvbuf, count, datatype, op, comm_ptr,
 * errflag). */
3879 int MPIR_Allreduce_impl(const void *sendbuf, void *recvbuf, int count,
3880                         MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3881 int MPIR_Allreduce(const void *sendbuf, void *recvbuf, int count,
3882                    MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3883 int MPIR_Allreduce_intra(const void *sendbuf, void *recvbuf, int count,
3884                          MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3885 int MPIR_Allreduce_inter(const void *sendbuf, void *recvbuf, int count,
3886                         MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
/* Alltoall entry points (_impl, unsuffixed, _intra, _inter); all four share
 * one argument list with scalar send/receive counts and a single datatype per
 * direction. */
3887 int MPIR_Alltoall_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3888                        void *recvbuf, int recvcount, MPI_Datatype recvtype,
3889                        MPID_Comm *comm_ptr, int *errflag);
3890 int MPIR_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3891                   void *recvbuf, int recvcount, MPI_Datatype recvtype,
3892                   MPID_Comm *comm_ptr, int *errflag);
3893 int MPIR_Alltoall_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3894                         void *recvbuf, int recvcount, MPI_Datatype recvtype,
3895                         MPID_Comm *comm_ptr, int *errflag);
3896 int MPIR_Alltoall_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
3897                         void *recvbuf, int recvcount, MPI_Datatype recvtype,
3898                         MPID_Comm *comm_ptr, int *errflag);
/* Alltoallv entry points (_impl, unsuffixed, _intra, _inter); counts and
 * displacements are read-only int arrays, with a single datatype per
 * direction. */
3899 int MPIR_Alltoallv_impl(const void *sendbuf, const int *sendcnts, const int *sdispls,
3900                         MPI_Datatype sendtype, void *recvbuf, const int *recvcnts,
3901                         const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
3902                         int *errflag);
3903 int MPIR_Alltoallv(const void *sendbuf, const int *sendcnts, const int *sdispls,
3904                    MPI_Datatype sendtype, void *recvbuf, const int *recvcnts,
3905                    const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, int *errflag);
3906 int MPIR_Alltoallv_intra(const void *sendbuf, const int *sendcnts, const int *sdispls,
3907                          MPI_Datatype sendtype, void *recvbuf, const int *recvcnts,
3908                          const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr,
3909                          int *errflag);
3910 int MPIR_Alltoallv_inter(const void *sendbuf, const int *sendcnts, const int *sdispls,
3911                          MPI_Datatype sendtype, void *recvbuf, const int *recvcnts,
3912                          const int *rdispls, MPI_Datatype recvtype,
3913                          MPID_Comm *comm_ptr, int *errflag);
/* Alltoallw entry points (_impl, unsuffixed, _intra, _inter); in addition to
 * count and displacement arrays, the send and receive datatypes are themselves
 * read-only arrays (sendtypes / recvtypes). */
3914 int MPIR_Alltoallw_impl(const void *sendbuf, const int *sendcnts, const int *sdispls,
3915                         const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcnts,
3916                         const int *rdispls, const MPI_Datatype *recvtypes, MPID_Comm *comm_ptr,
3917                         int *errflag);
3918 int MPIR_Alltoallw(const void *sendbuf, const int *sendcnts, const int *sdispls,
3919                    const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcnts,
3920                    const int *rdispls, const MPI_Datatype *recvtypes, MPID_Comm *comm_ptr,
3921                    int *errflag);
3922 int MPIR_Alltoallw_intra(const void *sendbuf, const int *sendcnts, const int *sdispls,
3923                          const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcnts,
3924                          const int *rdispls, const MPI_Datatype *recvtypes, MPID_Comm *comm_ptr,
3925                          int *errflag);
3926 int MPIR_Alltoallw_inter(const void *sendbuf, const int *sendcnts, const int *sdispls,
3927                          const MPI_Datatype *sendtypes, void *recvbuf,
3928                          const int *recvcnts, const int *rdispls, const MPI_Datatype *recvtypes,
3929                          MPID_Comm *comm_ptr, int *errflag);
/* Bcast entry points (_inter, _intra, unsuffixed, _impl); all four share the
 * argument list (buffer, count, datatype, root, comm_ptr, errflag).  The
 * buffer is a non-const void *, unlike the const sendbuf arguments of the
 * other collectives declared in this section. */
3930 int MPIR_Bcast_inter(void *buffer, int count, MPI_Datatype datatype,
3931 		     int root, MPID_Comm *comm_ptr, int *errflag);
3932 int MPIR_Bcast_intra (void *buffer, int count, MPI_Datatype datatype, int
3933                       root, MPID_Comm *comm_ptr, int *errflag);
3934 int MPIR_Bcast (void *buffer, int count, MPI_Datatype datatype, int
3935                 root, MPID_Comm *comm_ptr, int *errflag);
3936 int MPIR_Bcast_impl (void *buffer, int count, MPI_Datatype datatype, int
3937                 root, MPID_Comm *comm_ptr, int *errflag);
/* Exscan: only the unsuffixed and _impl entry points are declared here (no
 * _intra/_inter prototypes appear in this section). */
3938 int MPIR_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3939                 MPI_Op op, MPID_Comm *comm_ptr, int *errflag );
3940 int MPIR_Exscan_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3941                      MPI_Op op, MPID_Comm *comm_ptr, int *errflag );
/* Gather entry points (_impl, unsuffixed, _intra, _inter); all four share one
 * argument list, with the root rank passed just before comm_ptr. */
3942 int MPIR_Gather_impl (const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3943                       void *recvbuf, int recvcnt, MPI_Datatype recvtype,
3944                       int root, MPID_Comm *comm_ptr, int *errflag);
3945 int MPIR_Gather (const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3946                  void *recvbuf, int recvcnt, MPI_Datatype recvtype,
3947                  int root, MPID_Comm *comm_ptr, int *errflag);
3948 int MPIR_Gather_intra (const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3949                        void *recvbuf, int recvcnt, MPI_Datatype recvtype,
3950                        int root, MPID_Comm *comm_ptr, int *errflag);
3951 int MPIR_Gather_inter (const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3952                        void *recvbuf, int recvcnt, MPI_Datatype recvtype,
3953                        int root, MPID_Comm *comm_ptr, int *errflag );
/* Gatherv: only the unsuffixed and _impl entry points are declared here.
 * recvcnts and displs are read-only int arrays. */
3954 int MPIR_Gatherv (const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3955                   void *recvbuf, const int *recvcnts, const int *displs,
3956                   MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, int *errflag);
3957 int MPIR_Gatherv_impl (const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3958                        void *recvbuf, const int *recvcnts, const int *displs,
3959                        MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, int *errflag);
/* Reduce_scatter entry points (_impl, unsuffixed, _intra, _inter); the
 * receive counts are passed as a read-only int array (recvcnts). */
3960 int MPIR_Reduce_scatter_impl(const void *sendbuf, void *recvbuf, const int *recvcnts,
3961                              MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3962 int MPIR_Reduce_scatter(const void *sendbuf, void *recvbuf, const int *recvcnts,
3963                         MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3964 int MPIR_Reduce_scatter_intra(const void *sendbuf, void *recvbuf, const int *recvcnts,
3965                               MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3966 int MPIR_Reduce_scatter_inter(const void *sendbuf, void *recvbuf, const int *recvcnts,
3967                               MPI_Datatype datatype, MPI_Op op,
3968                               MPID_Comm *comm_ptr, int *errflag);
/* Reduce_scatter_block entry points (_impl, unsuffixed, _intra, _inter); the
 * receive count is a single int (recvcount) rather than an array. */
3969 int MPIR_Reduce_scatter_block_impl(const void *sendbuf, void *recvbuf, int recvcount,
3970                                    MPI_Datatype datatype, MPI_Op op, MPID_Comm
3971                                    *comm_ptr, int *errflag );
3972 int MPIR_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
3973                               MPI_Datatype datatype, MPI_Op op, MPID_Comm
3974                               *comm_ptr, int *errflag );
3975 int MPIR_Reduce_scatter_block_intra(const void *sendbuf, void *recvbuf, int recvcount,
3976                                     MPI_Datatype datatype, MPI_Op op, MPID_Comm
3977                                     *comm_ptr, int *errflag );
3978 int MPIR_Reduce_scatter_block_inter(const void *sendbuf, void *recvbuf, int recvcount,
3979                                     MPI_Datatype datatype, MPI_Op op, MPID_Comm
3980                                     *comm_ptr, int *errflag);
/* Reduce entry points (_impl, unsuffixed, _intra, _inter); all four share the
 * argument list (sendbuf, recvbuf, count, datatype, op, root, comm_ptr,
 * errflag). */
3981 int MPIR_Reduce_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3982                      MPI_Op op, int root, MPID_Comm *comm_ptr, int *errflag );
3983 int MPIR_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3984                 MPI_Op op, int root, MPID_Comm *comm_ptr, int *errflag );
3985 int MPIR_Reduce_intra(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3986                       MPI_Op op, int root, MPID_Comm *comm_ptr, int *errflag );
3987 int MPIR_Reduce_inter (const void *sendbuf, void *recvbuf, int count, MPI_Datatype
3988                        datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, int *errflag);
/* Scan: only the _impl and unsuffixed entry points are declared here; their
 * argument list matches the Exscan prototypes declared in this section. */
3989 int MPIR_Scan_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3990                    MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
3991 int MPIR_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
3992               MPI_Op op, MPID_Comm *comm_ptr, int *errflag);
/* Scatter entry points (_impl, unsuffixed, _intra, _inter); all four share
 * one argument list, with the root rank passed just before comm_ptr. */
3993 int MPIR_Scatter_impl(const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3994                       void *recvbuf, int recvcnt, MPI_Datatype recvtype,
3995                       int root, MPID_Comm *comm_ptr, int *errflag );
3996 int MPIR_Scatter(const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
3997                  void *recvbuf, int recvcnt, MPI_Datatype recvtype,
3998                  int root, MPID_Comm *comm_ptr, int *errflag );
3999 int MPIR_Scatter_intra(const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
4000                        void *recvbuf, int recvcnt, MPI_Datatype recvtype,
4001                        int root, MPID_Comm *comm_ptr, int *errflag );
4002 int MPIR_Scatter_inter(const void *sendbuf, int sendcnt, MPI_Datatype sendtype,
4003                        void *recvbuf, int recvcnt, MPI_Datatype recvtype,
4004                        int root, MPID_Comm *comm_ptr, int *errflag );
4005 int MPIR_Scatterv_impl (const void *sendbuf, const int *sendcnts, const int *displs,
4006                         MPI_Datatype sendtype, void *recvbuf, int recvcnt,
4007                         MPI_Datatype recvtype, int root, MPID_Comm
4008                         *comm_ptr, int *errflag);
4009 int MPIR_Scatterv (const void *sendbuf, const int *sendcnts, const int *displs,
4010                    MPI_Datatype sendtype, void *recvbuf, int recvcnt,
4011                    MPI_Datatype recvtype, int root, MPID_Comm
4012                    *comm_ptr, int *errflag);
4013 int MPIR_Barrier_impl( MPID_Comm *comm_ptr, int *errflag);
4014 int MPIR_Barrier( MPID_Comm *comm_ptr, int *errflag);
4015 int MPIR_Barrier_intra( MPID_Comm *comm_ptr, int *errflag);
4016 int MPIR_Barrier_inter( MPID_Comm *comm_ptr, int *errflag);
4017 
4018 int MPIR_Reduce_local_impl(const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype, MPI_Op op);
4019 
4020 int MPIR_Setup_intercomm_localcomm( MPID_Comm * );
4021 
4022 int MPIR_Comm_create( MPID_Comm ** );
4023 
4024 /* comm_create helper functions, used by both comm_create and comm_create_group */
4025 int MPIR_Comm_create_calculate_mapping(MPID_Group  *group_ptr,
4026                                        MPID_Comm   *comm_ptr,
4027                                        MPID_VCR   **mapping_vcr_out,
4028                                        int        **mapping_out);
4029 int MPIR_Comm_create_create_and_map_vcrt(int n,
4030                                          int *mapping,
4031                                          MPID_VCR *mapping_vcr,
4032                                          MPID_VCRT *out_vcrt,
4033                                          MPID_VCR **out_vcr);
4034 
4035 int MPIR_Comm_commit( MPID_Comm * );
4036 
4037 int MPIR_Comm_is_node_aware( MPID_Comm * );
4038 
4039 int MPIR_Comm_is_node_consecutive( MPID_Comm *);
4040 
4041 void MPIR_Free_err_dyncodes( void );
4042 
4043 int MPIR_Comm_idup_impl(MPID_Comm *comm_ptr, MPID_Comm **newcomm, MPID_Request **reqp);
4044 
4045 int MPIR_Allreduce_group(void *sendbuf, void *recvbuf, int count,
4046                          MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr,
4047                          MPID_Group *group_ptr, int tag, int *errflag);
4048 int MPIR_Allreduce_group_intra(void *sendbuf, void *recvbuf, int count,
4049                                MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr,
4050                                MPID_Group *group_ptr, int tag, int *errflag);
4051 
4052 
4053 int MPIR_Barrier_group(MPID_Comm *comm_ptr, MPID_Group *group_ptr, int tag, int *errflag);
4054 
4055 
4056 /* topology impl functions */
4057 int MPIR_Dist_graph_neighbors_count_impl(MPID_Comm *comm_ptr, int *indegree, int *outdegree, int *weighted);
4058 int MPIR_Dist_graph_neighbors_impl(MPID_Comm *comm_ptr,
4059                                    int maxindegree, int sources[], int sourceweights[],
4060                                    int maxoutdegree, int destinations[], int destweights[]);
4061 int MPIR_Graph_neighbors_count_impl(MPID_Comm *comm_ptr, int rank, int *nneighbors);
4062 int MPIR_Graph_neighbors_impl(MPID_Comm *comm_ptr, int rank, int maxneighbors, int *neighbors);
4063 int MPIR_Cart_shift_impl(MPID_Comm *comm_ptr, int direction, int displ, int *source, int *dest);
4064 
4065 /* begin impl functions for NBC */
4066 int MPIR_Ibarrier_impl(MPID_Comm *comm_ptr, MPI_Request *request);
4067 int MPIR_Ibcast_impl(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPI_Request *request);
4068 int MPIR_Igather_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPI_Request *request);
4069 int MPIR_Igatherv_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPI_Request *request);
4070 int MPIR_Iscatter_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPI_Request *request);
4071 int MPIR_Iscatterv_impl(const void *sendbuf, const int *sendcounts, const int *displs, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPI_Request *request);
4072 int MPIR_Iallgather_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4073 int MPIR_Iallgatherv_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4074 int MPIR_Ialltoall_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4075 int MPIR_Ialltoallv_impl(const void *sendbuf, const int *sendcounts, const int *sdispls, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4076 int MPIR_Ialltoallw_impl(const void *sendbuf, const int *sendcounts, const int *sdispls, const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcounts, const int *rdispls, const MPI_Datatype *recvtypes, MPID_Comm *comm_ptr, MPI_Request *request);
4077 int MPIR_Ireduce_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPI_Request *request);
4078 int MPIR_Iallreduce_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
4079 int MPIR_Ireduce_scatter_impl(const void *sendbuf, void *recvbuf, const int *recvcounts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
4080 int MPIR_Ireduce_scatter_block_impl(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
4081 int MPIR_Iscan_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
4082 int MPIR_Iexscan_impl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPI_Request *request);
4083 /* end impl functions for NBC */
4084 
4085 /* begin impl functions for neighborhood collectives */
4086 int MPIR_Ineighbor_allgather_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4087 int MPIR_Ineighbor_allgatherv_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4088 int MPIR_Ineighbor_alltoall_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4089 int MPIR_Ineighbor_alltoallv_impl(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPI_Request *request);
4090 int MPIR_Ineighbor_alltoallw_impl(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPID_Comm *comm_ptr, MPI_Request *request);
4091 int MPIR_Neighbor_allgather_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4092 int MPIR_Neighbor_allgatherv_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4093 int MPIR_Neighbor_alltoall_impl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4094 int MPIR_Neighbor_alltoallv_impl(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4095 int MPIR_Neighbor_alltoallw_impl(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPID_Comm *comm_ptr);
4096 /* end impl functions for neighborhood collectives */
4097 
4098 /* neighborhood collective default algorithms */
4099 int MPIR_Neighbor_allgather_default(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4100 int MPIR_Neighbor_allgatherv_default(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4101 int MPIR_Neighbor_alltoall_default(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4102 int MPIR_Neighbor_alltoallv_default(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPID_Comm *comm_ptr);
4103 int MPIR_Neighbor_alltoallw_default(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPID_Comm *comm_ptr);
4104 int MPIR_Ineighbor_allgather_default(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4105 int MPIR_Ineighbor_allgatherv_default(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int displs[], MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4106 int MPIR_Ineighbor_alltoall_default(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4107 int MPIR_Ineighbor_alltoallv_default(const void *sendbuf, const int sendcounts[], const int sdispls[], MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[], MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4108 int MPIR_Ineighbor_alltoallw_default(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPID_Comm *comm_ptr, MPID_Sched_t s);
4109 
4110 /* nonblocking collective default algorithms */
4111 int MPIR_Ibcast_intra(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4112 int MPIR_Ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4113 int MPIR_Ibcast_binomial(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4114 int MPIR_Ibcast_SMP(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4115 int MPIR_Iscatter_for_bcast(void *tmp_buf, int root, MPID_Comm *comm_ptr, int nbytes, MPID_Sched_t s);
4116 int MPIR_Ibcast_scatter_rec_dbl_allgather(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4117 int MPIR_Ibcast_scatter_ring_allgather(void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4118 int MPIR_Ibarrier_intra(MPID_Comm *comm_ptr, MPID_Sched_t s);
4119 int MPIR_Ibarrier_inter(MPID_Comm *comm_ptr, MPID_Sched_t s);
4120 int MPIR_Ireduce_intra(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4121 int MPIR_Ireduce_inter(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4122 int MPIR_Ireduce_binomial(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4123 int MPIR_Ireduce_redscat_gather(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4124 int MPIR_Ireduce_SMP(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4125 int MPIR_Ialltoallv_intra(const void *sendbuf, const int *sendcounts, const int *sdispls, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4126 int MPIR_Ialltoallv_inter(const void *sendbuf, const int *sendcounts, const int *sdispls, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *rdispls, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4127 int MPIR_Iallreduce_intra(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4128 int MPIR_Iallreduce_inter(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4129 int MPIR_Iallreduce_naive(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4130 int MPIR_Iallreduce_SMP(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4131 int MPIR_Iallreduce_redscat_allgather(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4132 int MPIR_Iallreduce_rec_dbl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4133 int MPIR_Igather_binomial(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4134 int MPIR_Igather_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4135 int MPIR_Igather_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4136 int MPIR_Iscatter_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4137 int MPIR_Iscatter_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4138 int MPIR_Iscatterv(const void *sendbuf, const int *sendcounts, const int *displs, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4139 int MPIR_Ireduce_scatter_intra(const void *sendbuf, void *recvbuf, const int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4140 int MPIR_Ireduce_scatter_inter(const void *sendbuf, void *recvbuf, const int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4141 int MPIR_Ireduce_scatter_rec_dbl(const void *sendbuf, void *recvbuf, const int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4142 int MPIR_Ireduce_scatter_rec_hlv(const void *sendbuf, void *recvbuf, const int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4143 int MPIR_Ireduce_scatter_pairwise(const void *sendbuf, void *recvbuf, const int *recvcnts, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4144 int MPIR_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root, MPID_Comm *comm_ptr, MPID_Sched_t s);
4145 int MPIR_Ireduce_scatter_block_intra(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4146 int MPIR_Ireduce_scatter_block_inter(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4147 int MPIR_Ireduce_scatter_block_rec_hlv(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4148 int MPIR_Ireduce_scatter_block_pairwise(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4149 int MPIR_Ireduce_scatter_block_rec_dbl(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4150 int MPIR_Ireduce_scatter_block_noncomm(const void *sendbuf, void *recvbuf, int recvcount, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4151 int MPIR_Ialltoall_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4152 int MPIR_Ialltoall_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4153 int MPIR_Ialltoall_inplace(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4154 int MPIR_Ialltoall_bruck(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4155 int MPIR_Ialltoall_perm_sr(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4156 int MPIR_Ialltoall_pairwise(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4157 int MPIR_Iallgather_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4158 int MPIR_Iallgather_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4159 int MPIR_Iallgather_rec_dbl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4160 int MPIR_Iallgather_bruck(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4161 int MPIR_Iallgather_ring(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4162 int MPIR_Iallgatherv_rec_dbl(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4163 int MPIR_Iallgatherv_bruck(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4164 int MPIR_Iallgatherv_ring(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4165 int MPIR_Iallgatherv_intra(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4166 int MPIR_Iallgatherv_inter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s);
4167 int MPIR_Iscan_rec_dbl(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4168 int MPIR_Iscan_SMP(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4169 int MPIR_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPID_Comm *comm_ptr, MPID_Sched_t s);
4170 int MPIR_Ialltoallw_intra(const void *sendbuf, const int *sendcounts, const int *sdispls, const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcounts, const int *rdispls, const MPI_Datatype *recvtypes, MPID_Comm *comm_ptr, MPID_Sched_t s);
4171 int MPIR_Ialltoallw_inter(const void *sendbuf, const int *sendcounts, const int *sdispls, const MPI_Datatype *sendtypes, void *recvbuf, const int *recvcounts, const int *rdispls, const MPI_Datatype *recvtypes, MPID_Comm *comm_ptr, MPID_Sched_t s);
4172 
4173 /* begin impl functions for MPI_T (MPIX_T_ right now) */
4174 int MPIR_T_init_thread_impl(int required, int *provided);
4175 int MPIR_T_finalize_impl(void);
4176 int MPIR_T_enum_get_info_impl(MPIX_T_enum enumtype, int num, char *name, int *name_len);
4177 int MPIR_T_enum_get_item_impl(MPIX_T_enum enumtype, int num, int *value, char *name, int *name_len);
4178 int MPIR_T_cvar_get_num_impl(int *num_cvar);
4179 int MPIR_T_cvar_get_info_impl(int cvar_index, char *name, int *name_len, int *verbosity, MPI_Datatype *datatype, MPIX_T_enum *enumtype, char *desc, int *desc_len, int *binding, int *scope);
4180 int MPIR_T_cvar_handle_alloc_impl(int cvar_index, void *obj_handle, MPIX_T_cvar_handle *handle, int *count);
4181 int MPIR_T_cvar_handle_free_impl(MPIX_T_cvar_handle *handle);
4182 int MPIR_T_cvar_read_impl(MPIX_T_cvar_handle handle, void *buf);
4183 int MPIR_T_cvar_write_impl(MPIX_T_cvar_handle handle, void *buf);
4184 int MPIR_T_pvar_get_num_impl(int *num_pvar);
4185 int MPIR_T_pvar_get_info_impl(int pvar_index, char *name, int *name_len, int *verbosity, int *var_class, MPI_Datatype *datatype, MPIX_T_enum *enumtype, char *desc, int *desc_len, int *binding, int *readonly, int *continuous, int *atomic);
4186 int MPIR_T_pvar_session_create_impl(MPIX_T_pvar_session *session);
4187 int MPIR_T_pvar_session_free_impl(MPIX_T_pvar_session *session);
4188 int MPIR_T_pvar_handle_alloc_impl(MPIX_T_pvar_session session, int pvar_index, void *obj_handle, MPIX_T_pvar_handle *handle, int *count);
4189 int MPIR_T_pvar_handle_free_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle *handle);
4190 int MPIR_T_pvar_start_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle handle);
4191 int MPIR_T_pvar_stop_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle handle);
4192 int MPIR_T_pvar_read_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle handle, void *buf);
4193 int MPIR_T_pvar_write_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle handle, void *buf);
4194 int MPIR_T_pvar_reset_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle handle);
4195 int MPIR_T_pvar_readreset_impl(MPIX_T_pvar_session session, MPIX_T_pvar_handle handle, void *buf);
4196 int MPIR_T_category_get_num_impl(int *num_cat);
4197 int MPIR_T_category_get_info_impl(int cat_index, char *name, int *name_len, char *desc, int *desc_len, int *num_controlvars, int *num_pvars, int *num_categories);
4198 int MPIR_T_category_get_cvars_impl(int cat_index, int len, int indices[]);
/* Category query for performance variables.  cat_index is a scalar category
 * index, matching the cvars/categories sibling prototypes and the MPI-3.0
 * MPI_T_category_get_pvars binding; indices[] has room for len entries. */
int MPIR_T_category_get_pvars_impl(int cat_index, int len, int indices[]);
/* Remaining category queries: one fills an index array of length len for a
 * given category index, the other writes through an int *stamp. */
int MPIR_T_category_get_categories_impl(int cat_index, int len,
    int indices[]);
int MPIR_T_category_changed_impl(int *stamp);
/* end impl functions for MPI_T (MPIX_T_ right now) */
4203 
4204 int MPIR_T_is_initialized(void);
4205 
4206 /* random initializers */
4207 int MPIR_Group_init(void);
4208 int MPIR_Comm_init(MPID_Comm *);
4209 
4210 
/* Collective functions must not be called from multiple threads.  These two
   hooks are placed in the collective communication calls so that this kind
   of user error can be checked; for now both expand to nothing, so their
   argument is never evaluated. */
#define MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER(comm)
#define MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT(comm)
4216 
/* Miscellaneous */
void MPIU_SetTimeout(int);

/* Supply a vsnprintf prototype only when configure reports that the function
   exists but lacks a declaration, and vsnprintf is not itself a macro. */
#if defined(HAVE_VSNPRINTF) && defined(NEEDS_VSNPRINTF_DECL)
#  if !defined(vsnprintf)
int vsnprintf(char *str, size_t size, const char *format, va_list ap);
#  endif
#endif
4223 
4224 /* Routines for determining local and remote processes */
4225 
4226 int MPIU_Find_local_and_external(struct MPID_Comm *comm, int *local_size_p, int *local_rank_p, int **local_ranks_p,
4227                                  int *external_size_p, int *external_rank_p, int **external_ranks_p,
4228                                  int **intranode_table, int **internode_table_p);
4229 int MPIU_Get_internode_rank(MPID_Comm *comm_ptr, int r);
4230 int MPIU_Get_intranode_rank(MPID_Comm *comm_ptr, int r);
4231 
/* Trivial accessor macros */

/* Read the rank and local_size members through a communicator pointer. */
#define MPIR_Comm_rank(comm) ((comm)->rank)
#define MPIR_Comm_size(comm) ((comm)->local_size)
/* Forward to the MPID_Datatype_get_*_macro helpers, passing the dereferenced
 * result pointer as the second argument. */
#define MPIR_Type_extent_impl(dtype, extent_out) MPID_Datatype_get_extent_macro(dtype, *(extent_out))
#define MPIR_Type_size_impl(dtype, size_out) MPID_Datatype_get_size_macro(dtype, *(size_out))
/* Copy the status object's cancelled member into *flag.  The replacement
 * list is fully parenthesized so the macro acts as one expression even when
 * it is embedded in a larger one (e.g. negated or used as an operand). */
#define MPIR_Test_cancelled_impl(status, flag) (*(flag) = (status)->cancelled)
4239 
4240 /* MPIR_ functions.  These are versions of MPI_ functions appropriate for calling within MPI */
4241 int MPIR_Cancel_impl(MPID_Request *request_ptr);
4242 struct MPIR_Topology;
4243 void MPIR_Cart_rank_impl(struct MPIR_Topology *cart_ptr, const int *coords, int *rank);
4244 int MPIR_Cart_create_impl(MPID_Comm *comm_ptr, int ndims, const int dims[],
4245                           const int periods[], int reorder, MPI_Comm *comm_cart);
4246 int MPIR_Cart_map_impl(const MPID_Comm *comm_ptr, int ndims, const int dims[],
4247                        const int periodic[], int *newrank);
4248 int MPIR_Close_port_impl(const char *port_name);
4249 int MPIR_Open_port_impl(MPID_Info *info_ptr, char *port_name);
4250 void MPIR_Info_get_impl(MPID_Info *info_ptr, const char *key, int valuelen, char *value, int *flag);
4251 void MPIR_Info_get_nkeys_impl(MPID_Info *info_ptr, int *nkeys);
4252 int MPIR_Info_get_nthkey_impl(MPID_Info *info, int n, char *key);
4253 void MPIR_Info_get_valuelen_impl(MPID_Info *info_ptr, const char *key, int *valuelen, int *flag);
4254 int MPIR_Comm_delete_attr_impl(MPID_Comm *comm_ptr, MPID_Keyval *keyval_ptr);
4255 int MPIR_Comm_create_keyval_impl(MPI_Comm_copy_attr_function *comm_copy_attr_fn,
4256                                  MPI_Comm_delete_attr_function *comm_delete_attr_fn,
4257                                  int *comm_keyval, void *extra_state);
4258 int MPIR_Comm_accept_impl(const char * port_name, MPID_Info * info_ptr, int root,
4259                           MPID_Comm * comm_ptr, MPID_Comm ** newcomm_ptr);
4260 int MPIR_Comm_connect_impl(const char * port_name, MPID_Info * info_ptr, int root,
4261                            MPID_Comm * comm_ptr, MPID_Comm ** newcomm_ptr);
4262 int MPIR_Comm_create_errhandler_impl(MPI_Comm_errhandler_function *function,
4263                                      MPI_Errhandler *errhandler);
4264 int MPIR_Comm_dup_impl(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr);
4265 int MPIR_Comm_free_impl(MPID_Comm * comm_ptr);
4266 void MPIR_Comm_free_keyval_impl(int keyval);
4267 void MPIR_Comm_get_errhandler_impl(MPID_Comm *comm_ptr, MPID_Errhandler **errhandler_ptr);
4268 void MPIR_Comm_set_errhandler_impl(MPID_Comm *comm_ptr, MPID_Errhandler *errhandler_ptr);
4269 void MPIR_Comm_get_name_impl(MPID_Comm *comm, char *comm_name, int *resultlen);
4270 int MPIR_Intercomm_merge_impl(MPID_Comm *comm_ptr, int high, MPID_Comm **new_intracomm_ptr);
4271 int MPIR_Intercomm_create_impl(MPID_Comm *local_comm_ptr, int local_leader,
4272                                MPID_Comm *peer_comm_ptr, int remote_leader, int tag,
4273                                MPID_Comm **new_intercomm_ptr);
4274 int MPIR_Comm_group_impl(MPID_Comm *comm_ptr, MPID_Group **group_ptr);
4275 int MPIR_Comm_remote_group_impl(MPID_Comm *comm_ptr, MPID_Group **group_ptr);
4276 int MPIR_Comm_group_failed_impl(MPID_Comm *comm, MPID_Group **failed_group_ptr);
4277 int MPIR_Comm_remote_group_failed_impl(MPID_Comm *comm, MPID_Group **failed_group_ptr);
4278 int MPIR_Comm_split_impl(MPID_Comm *comm_ptr, int color, int key, MPID_Comm **newcomm_ptr);
4279 int MPIR_Comm_split_type_impl(MPID_Comm *comm_ptr, int split_type, int key, MPID_Info *info_ptr,
4280                               MPID_Comm **newcomm_ptr);
4281 int MPIR_Group_compare_impl(MPID_Group *group_ptr1, MPID_Group *group_ptr2, int *result);
4282 int MPIR_Group_difference_impl(MPID_Group *group_ptr1, MPID_Group *group_ptr2, MPID_Group **new_group_ptr);
4283 int MPIR_Group_excl_impl(MPID_Group *group_ptr, int n, const int *ranks, MPID_Group **new_group_ptr);
4284 int MPIR_Group_free_impl(MPID_Group *group_ptr);
4285 int MPIR_Group_incl_impl(MPID_Group *group_ptr, int n, const int *ranks, MPID_Group **new_group_ptr);
4286 int MPIR_Group_intersection_impl(MPID_Group *group_ptr1, MPID_Group *group_ptr2, MPID_Group **new_group_ptr);
/* Prototypes for the MPIR_*_impl routines declared in this part of the header.
 * NOTE(review): the _impl suffix presumably marks the internal routine behind
 * the MPI_ binding of the same name -- confirm against the binding sources.
 * Routines declared with an int return type presumably return an MPI error
 * code (TODO confirm); those declared void return nothing to the caller. */

/* Group routines: operate on MPID_Group objects; routines that build a group
 * hand it back through the trailing MPID_Group ** argument. */
4287 int MPIR_Group_range_excl_impl(MPID_Group *group_ptr, int n, int ranges[][3], MPID_Group **new_group_ptr);
4288 int MPIR_Group_range_incl_impl(MPID_Group *group_ptr, int n, int ranges[][3], MPID_Group **new_group_ptr);
4289 int MPIR_Group_translate_ranks_impl(MPID_Group *group_ptr1, int n, const int *ranks1,
4290                                      MPID_Group *group_ptr2, int *ranks2);
4291 int MPIR_Group_union_impl(MPID_Group *group_ptr1, MPID_Group *group_ptr2, MPID_Group **new_group_ptr);
/* Status query and generalized-request routines. */
4292 void MPIR_Get_count_impl(const MPI_Status *status, MPI_Datatype datatype, int *count);
4293 void MPIR_Grequest_complete_impl(MPID_Request *request_ptr);
4294 int MPIR_Grequest_start_impl(MPI_Grequest_query_function *query_fn,
4295                              MPI_Grequest_free_function *free_fn,
4296                              MPI_Grequest_cancel_function *cancel_fn,
4297                              void *extra_state, MPID_Request **request_ptr);
/* Topology routine. */
4298 int MPIR_Graph_map_impl(const MPID_Comm *comm_ptr, int nnodes,
4299                         const int indx[], const int edges[], int *newrank);
/* Datatype routines: these take MPI_Datatype handles directly; constructors
 * write the new handle through their final MPI_Datatype * argument. */
4300 int MPIR_Type_commit_impl(MPI_Datatype *datatype);
4301 int MPIR_Type_create_struct_impl(int count,
4302                                  const int array_of_blocklengths[],
4303                                  const MPI_Aint array_of_displacements[],
4304                                  const MPI_Datatype array_of_types[],
4305                                  MPI_Datatype *newtype);
4306 int MPIR_Type_create_indexed_block_impl(int count,
4307                                         int blocklength,
4308                                         const int array_of_displacements[],
4309                                         MPI_Datatype oldtype,
4310                                         MPI_Datatype *newtype);
4311 int MPIR_Type_create_hindexed_block_impl(int count, int blocklength,
4312                                          const MPI_Aint array_of_displacements[],
4313                                          MPI_Datatype oldtype, MPI_Datatype *newtype);
4314 int MPIR_Type_contiguous_impl(int count,
4315                               MPI_Datatype old_type,
4316                               MPI_Datatype *new_type_p);
4317 void MPIR_Type_get_extent_impl(MPI_Datatype datatype, MPI_Aint *lb, MPI_Aint *extent);
4318 void MPIR_Type_get_true_extent_impl(MPI_Datatype datatype, MPI_Aint *true_lb, MPI_Aint *true_extent);
4319 void MPIR_Type_get_envelope_impl(MPI_Datatype datatype, int *num_integers, int *num_addresses,
4320                                  int *num_datatypes, int *combiner);
4321 int MPIR_Type_hvector_impl(int count, int blocklen, MPI_Aint stride, MPI_Datatype old_type, MPI_Datatype *newtype_p);
4322 int MPIR_Type_indexed_impl(int count, const int blocklens[], const int indices[],
4323                            MPI_Datatype old_type, MPI_Datatype *newtype);
4324 void MPIR_Type_free_impl(MPI_Datatype *datatype);
4325 int MPIR_Type_vector_impl(int count, int blocklength, int stride, MPI_Datatype old_type, MPI_Datatype *newtype_p);
4326 int MPIR_Type_struct_impl(int count, const int blocklens[], const MPI_Aint indices[], const MPI_Datatype old_types[], MPI_Datatype *newtype);
/* Pack/unpack routines; MPIR_Pack_impl and MPIR_Unpack_impl take a caller-owned
 * int *position argument. */
4327 int MPIR_Pack_impl(const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, int outcount, int *position);
4328 void MPIR_Pack_size_impl(int incount, MPI_Datatype datatype, int *size);
4329 int MPIR_Unpack_impl(const void *inbuf, int insize, int *position,
4330                      void *outbuf, int outcount, MPI_Datatype datatype);
4331 void MPIR_Type_lb_impl(MPI_Datatype datatype, MPI_Aint *displacement);
/* Point-to-point and request-completion routines: these take MPI_Request
 * handles rather than MPID_Request pointers. */
4332 int MPIR_Ibsend_impl(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
4333                      MPID_Comm *comm_ptr, MPI_Request *request);
4334 int MPIR_Test_impl(MPI_Request *request, int *flag, MPI_Status *status);
4335 int MPIR_Wait_impl(MPI_Request *request, MPI_Status *status);
4336 int MPIR_Waitall_impl(int count, MPI_Request array_of_requests[],
4337                       MPI_Status array_of_statuses[]);
/* Communicator attribute routine. */
4338 int MPIR_Comm_set_attr_impl(MPID_Comm *comm_ptr, int comm_keyval, void *attribute_val,
4339                             MPIR_AttrType attrType);
4340 
4341 
4342 /* The "fastpath" version of MPIR_Request_complete.  It only handles
4343  * MPID_REQUEST_SEND and MPID_REQUEST_RECV kinds, and it does not attempt to
4344  * deal with status structures under the assumption that bleeding fast code will
4345  * pass either MPI_STATUS_IGNORE or MPI_STATUSES_IGNORE as appropriate.  This
4346  * routine (or some variation of it) is an unfortunately necessary stunt to
4347  * get high message rates on key benchmarks for high-end systems.
4348  */
4349 #undef FUNCNAME
4350 #define FUNCNAME MPIR_Request_complete_fastpath
4351 #undef FCNAME
4352 #define FCNAME MPIU_QUOTE(FUNCNAME)
/* Complete a send or receive request without filling in any MPI_Status.
 *
 * request     - in/out: user-visible handle; overwritten with MPI_REQUEST_NULL.
 * request_ptr - the request object behind *request; its kind must be
 *               MPID_REQUEST_SEND or MPID_REQUEST_RECV (checked by assertion).
 *
 * Returns the value stored in request_ptr->status.MPI_ERROR.
 */
static inline int MPIR_Request_complete_fastpath(MPI_Request *request, MPID_Request *request_ptr)
{
    int mpi_errno;

    MPIU_Assert(request_ptr->kind == MPID_REQUEST_SEND || request_ptr->kind == MPID_REQUEST_RECV);

    if (request_ptr->kind == MPID_REQUEST_SEND) {
        /* FIXME: are Ibsend requests added to the send queue? */
        MPIR_SENDQ_FORGET(request_ptr);
    }

    /* SEND and RECV share the rest of the completion path; only the SENDQ
     * hook above differs.  The error code is captured before the release
     * call; presumably the release may free the object, so request_ptr must
     * not be dereferenced afterwards (NOTE(review): confirm). */
    mpi_errno = request_ptr->status.MPI_ERROR;
    MPID_Request_release(request_ptr);
    *request = MPI_REQUEST_NULL;

    /* Return directly rather than through the usual fn_exit/fn_fail labels
     * to reduce jumps and compiler confusion on this hot path. */
    return mpi_errno;
}
4373 
4374 /* avoid conflicts in source files with old-style "char FCNAME[]" vars */
4375 #undef FUNCNAME
4376 #undef FCNAME
4377 
4378 #endif /* MPIIMPL_H_INCLUDED */
4379