1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #ifndef MPIR_NBC_H_INCLUDED
7 #define MPIR_NBC_H_INCLUDED
8 
9 /* This specifies the interface that must be exposed by the ADI in order to
10  * support MPI-3 non-blocking collectives.  MPIR_Sched_ routines are all
11  * permitted to be inlines.  They are not permitted to be macros.
12  *
13  * Most (currently all) devices will just use the default implementation that
14  * lives in "src/mpid/common/sched" */
15 
16 /* The device must supply a typedef for MPIR_Sched_t.  MPIR_Sched_t is a handle
17  * to the schedule (often a pointer under the hood), not the actual schedule.
 * This makes it easy to cheaply pass the schedule between functions.
19  *
20  * The device must also define a constant (possibly a macro) for an invalid
21  * schedule: MPIR_SCHED_NULL */
22 
23 /* Context/tag strategy for send/recv ops:
24  * -------------------------------
25  *
26  * Blocking collectives were able to more or less safely separate all
27  * communication between different collectives by using a fixed tag per
28  * operation.  This prevents some potentially very surprising message matching
29  * patterns when two different collectives are posted on the same communicator
30  * in rapid succession.  But this strategy probably won't work for NBC because
31  * multiple operations of any combination of types can be outstanding at the
32  * same time.
33  *
34  * The MPI-3 draft standard says that all collective ops must be collectively
35  * posted in a consistent order w.r.t. other collective operations, including
36  * nonblocking collectives.  This means that we can just use a counter to assign
37  * tag values that is incremented at each collective start.  We can jump through
38  * some hoops to make sure that the blocking collective code is left
39  * undisturbed, but it's cleaner to just update them to use the new counter
40  * mechanism as well.
41  */
42 
/* Sentinel handle meaning "no schedule" (see the note above: the device may
 * instead supply its own definition of this constant). */
#define MPIR_SCHED_NULL (NULL)
44 
/* Open question: should tag allocation be rolled into Sched_start?  Keeping it
 * separate potentially allows more parallelism in the future, but it also
 * pushes more work onto the clients of this interface. */
/* Returns the next tag to use for a collective on comm_ptr in (*tag).
 * NOTE(review): the int return is presumably an MPI error code, matching the
 * mpi_errno handling used for the other MPIR_Sched_ routines below --
 * confirm against the implementation. */
int MPIR_Sched_next_tag(MPIR_Comm * comm_ptr, int *tag);
49 
/* the device must provide a typedef for MPIR_Sched_t in mpidpre.h */

/* creates a new opaque schedule object and returns a handle to it in (*sp) */
int MPIR_Sched_create(MPIR_Sched_t * sp);
/* clones orig and returns a handle to the new schedule in (*cloned) */
int MPIR_Sched_clone(MPIR_Sched_t orig, MPIR_Sched_t * cloned);
/* sets (*sp) to MPIR_SCHED_NULL and gives you back a request pointer in (*req).
 * The caller is giving up ownership of the opaque schedule object.
 *
 * comm should be the primary (user) communicator with which this collective is
 * associated, even if other hidden communicators are used for a subset of the
 * operations.  It will be used for error handling and similar operations.
 *
 * tag is the value obtained from MPIR_Sched_next_tag above (presumably used
 * for matching the schedule's point-to-point traffic -- see the tag strategy
 * note at the top of this file). */
int MPIR_Sched_start(MPIR_Sched_t * sp, MPIR_Comm * comm, int tag, MPIR_Request ** req);
63 
/* Schedule-entry constructors.  Each appends an operation to schedule s; the
 * operation executes when the schedule runs (after MPIR_Sched_start), not at
 * call time.  Each returns an MPI error code (see MPIR_SCHED_BARRIER below
 * for the typical mpi_errno handling pattern). */

/* send and recv take a comm ptr to enable hierarchical collectives */
int MPIR_Sched_send(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest,
                    MPIR_Comm * comm, MPIR_Sched_t s);
int MPIR_Sched_recv(void *buf, MPI_Aint count, MPI_Datatype datatype, int src, MPIR_Comm * comm,
                    MPIR_Sched_t s);

/* just like MPI_Issend, can't complete until the matching recv is posted */
int MPIR_Sched_ssend(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest,
                     MPIR_Comm * comm, MPIR_Sched_t s);

/* local reduction into inoutbuf.  NOTE(review): operand order is assumed to
 * follow MPI_Reduce_local (inoutbuf = inbuf op inoutbuf) -- confirm against
 * the implementation. */
int MPIR_Sched_reduce(const void *inbuf, void *inoutbuf, MPI_Aint count, MPI_Datatype datatype,
                      MPI_Op op, MPIR_Sched_t s);
/* packing/unpacking can be accomplished by passing MPI_PACKED as either intype
 * or outtype */
int MPIR_Sched_copy(const void *inbuf, MPI_Aint incount, MPI_Datatype intype,
                    void *outbuf, MPI_Aint outcount, MPI_Datatype outtype, MPIR_Sched_t s);
/* require that all previously added ops are complete before subsequent ops
 * may begin to execute */
int MPIR_Sched_barrier(MPIR_Sched_t s);
83 
/* A convenience macro for the extremely common case that "mpi_errno" is the
 * variable used for tracking error state and MPIR_ERR_POP is needed.  This
 * declutters the NBC code substantially.
 *
 * Requires a local "mpi_errno" in scope.  On failure MPIR_ERR_CHECK takes the
 * enclosing function's error path (NOTE(review): presumably "goto fn_fail",
 * the same label MPIR_SCHED_CHKPMEM_MALLOC below jumps to -- confirm). */
#define MPIR_SCHED_BARRIER(sched_)              \
    do {                                        \
        mpi_errno = MPIR_Sched_barrier(sched_); \
        MPIR_ERR_CHECK(mpi_errno);              \
    } while (0)
92 
/* Defers evaluating (*count) until the entry actually begins to execute.  This
 * permits algorithms that accumulate/dissipate bytes as rounds progress without
 * excessive (re)calculation of counts for/from other processes.
 *
 * Because evaluation is deferred, the caller must keep the pointed-to count
 * alive and valid until the entry executes.
 *
 * A corresponding _recv_defer function is not currently provided because there
 * is no known use case.  The recv count is just an upper bound, not an exact
 * amount to be received, so an oversized recv is used instead of deferral. */
int MPIR_Sched_send_defer(const void *buf, const MPI_Aint * count, MPI_Datatype datatype, int dest,
                          MPIR_Comm * comm, MPIR_Sched_t s);
/* Just like MPIR_Sched_recv except it populates the given status object with
 * the received count and error information, much like a normal recv.  Often
 * useful in conjunction with MPIR_Sched_send_defer. */
int MPIR_Sched_recv_status(void *buf, MPI_Aint count, MPI_Datatype datatype, int src,
                           MPIR_Comm * comm, MPI_Status * status, MPIR_Sched_t s);

/* buffer management, fancy reductions, etc */
/* schedules an arbitrary callback; cb_state is passed through to the callback
 * when the entry executes */
int MPIR_Sched_cb(MPIR_Sched_cb_t * cb_p, void *cb_state, MPIR_Sched_t s);
/* two-state variant of MPIR_Sched_cb */
int MPIR_Sched_cb2(MPIR_Sched_cb2_t * cb_p, void *cb_state, void *cb_state2, MPIR_Sched_t s);
111 
/* TODO: develop a caching infrastructure for use by the upper level as well,
 * hopefully s.t. uthash can be used somehow */

/* common callback utility functions */
/* Callback suitable for MPIR_Sched_cb that frees the buffer passed as "state";
 * used by MPIR_SCHED_CHKPMEM_COMMIT below to release temporary buffers once
 * the schedule is finished with them. */
int MPIR_Sched_cb_free_buf(MPIR_Comm * comm, int tag, void *state);
117 
/* an upgraded version of MPIR_CHKPMEM_MALLOC/_DECL/_REAP/_COMMIT that adds
 * corresponding cleanup callbacks to the given schedule at _COMMIT time */
/* Declares the local tracking state used by the other MPIR_SCHED_CHKPMEM_
 * macros: a fixed-capacity stack of up to n_ allocated pointers, its stack
 * pointer (next free slot), and -- for assertion checking only -- the
 * capacity n_ itself. */
#define MPIR_SCHED_CHKPMEM_DECL(n_)                               \
    void *(mpir_sched_chkpmem_stk_[n_]) = { NULL };               \
    int mpir_sched_chkpmem_stk_sp_=0;                             \
    MPIR_AssertDeclValue(const int mpir_sched_chkpmem_stk_sz_,n_)
124 
/* Allocates nbytes_ with MPL_malloc, stores the result in pointer_, and on
 * success pushes it onto the tracking stack declared by
 * MPIR_SCHED_CHKPMEM_DECL (asserting the stack is not full).  If allocation
 * of a nonzero size fails, the error is recorded in rc_ via
 * MPIR_CHKMEM_SETERR and stmt_ is executed (typically "goto fn_fail").  A
 * NULL result for a zero-byte request is deliberately not treated as an
 * error: neither branch runs and nothing is pushed. */
#define MPIR_SCHED_CHKPMEM_MALLOC_ORSTMT(pointer_,type_,nbytes_,rc_,name_,class_,stmt_)  \
    do {                                                                          \
        (pointer_) = (type_)MPL_malloc(nbytes_,class_);                           \
        if (pointer_) {                                                           \
            MPIR_Assert(mpir_sched_chkpmem_stk_sp_ < mpir_sched_chkpmem_stk_sz_); \
            mpir_sched_chkpmem_stk_[mpir_sched_chkpmem_stk_sp_++] = (pointer_);   \
        } else if ((nbytes_) > 0) {                                               \
            MPIR_CHKMEM_SETERR((rc_),(nbytes_),(name_));                          \
            stmt_;                                                                \
        }                                                                         \
    } while (0)
136 
/* Common-case wrapper for MPIR_SCHED_CHKPMEM_MALLOC_ORSTMT that jumps to the
 * enclosing function's fn_fail label on allocation failure. */
#define MPIR_SCHED_CHKPMEM_MALLOC(pointer_,type_,nbytes_,rc_,name_,class_) \
    MPIR_SCHED_CHKPMEM_MALLOC_ORSTMT(pointer_,type_,nbytes_,rc_,name_,class_,goto fn_fail)
139 
/* Error-path cleanup: pop and MPL_free every tracked allocation without
 * adding anything to the schedule.  The sched_ argument is accepted for
 * symmetry with _COMMIT but is unused here. */
#define MPIR_SCHED_CHKPMEM_REAP(sched_)                                    \
    do {                                                                   \
        while (mpir_sched_chkpmem_stk_sp_ > 0) {                           \
            --mpir_sched_chkpmem_stk_sp_;                                  \
            MPL_free(mpir_sched_chkpmem_stk_[mpir_sched_chkpmem_stk_sp_]); \
        }                                                                  \
    } while (0)
147 
/* Commit the tracked allocations to the schedule: insert a barrier so all
 * previously scheduled operations finish with the buffers, then schedule a
 * free-callback (MPIR_Sched_cb_free_buf) for each tracked pointer.  On
 * success the tracking stack is empty and ownership of the buffers has
 * passed to the schedule.  Requires a local "mpi_errno" and the error path
 * implied by MPIR_SCHED_BARRIER/MPIR_ERR_CHECK (see above).
 *
 * Fix: the barrier previously expanded as MPIR_SCHED_BARRIER(s), silently
 * relying on the caller's schedule variable being named "s"; it now uses the
 * macro's own sched_ argument. */
#define MPIR_SCHED_CHKPMEM_COMMIT(sched_)                                                      \
    do {                                                                                       \
        MPIR_SCHED_BARRIER(sched_);                                                            \
        while (mpir_sched_chkpmem_stk_sp_ > 0) {                                               \
            mpi_errno = MPIR_Sched_cb(&MPIR_Sched_cb_free_buf,                                 \
                                      (mpir_sched_chkpmem_stk_[--mpir_sched_chkpmem_stk_sp_]), \
                                      (sched_));                                               \
            MPIR_ERR_CHECK(mpi_errno);                                                         \
        }                                                                                      \
    } while (0)
158 
159 #endif /* MPIR_NBC_H_INCLUDED */
160