/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#ifndef MPIR_NBC_H_INCLUDED
#define MPIR_NBC_H_INCLUDED

/* This specifies the interface that must be exposed by the ADI in order to
 * support MPI-3 non-blocking collectives.  MPIR_Sched_ routines are all
 * permitted to be inlines.  They are not permitted to be macros.
 *
 * Most (currently all) devices will just use the default implementation that
 * lives in "src/mpid/common/sched" */

/* The device must supply a typedef for MPIR_Sched_t.  MPIR_Sched_t is a handle
 * to the schedule (often a pointer under the hood), not the actual schedule.
 * This makes it easy to pass the schedule cheaply between functions.
 *
 * The device must also define a constant (possibly a macro) for an invalid
 * schedule: MPIR_SCHED_NULL */

/* Context/tag strategy for send/recv ops:
 * ---------------------------------------
 *
 * Blocking collectives were able to more or less safely separate all
 * communication between different collectives by using a fixed tag per
 * operation.  This prevents some potentially very surprising message matching
 * patterns when two different collectives are posted on the same communicator
 * in rapid succession.  But this strategy probably won't work for NBC because
 * multiple operations of any combination of types can be outstanding at the
 * same time.
 *
 * The MPI-3 draft standard says that all collective ops must be collectively
 * posted in a consistent order w.r.t. other collective operations, including
 * nonblocking collectives.  This means that we can just use a counter to assign
 * tag values that is incremented at each collective start.  We can jump through
 * some hoops to make sure that the blocking collective code is left
 * undisturbed, but it's cleaner to just update them to use the new counter
 * mechanism as well.
 */

#define MPIR_SCHED_NULL (NULL)

/* Open question: should tag allocation be rolled into Sched_start?  Keeping it
 * separate potentially allows more parallelism in the future, but it also
 * pushes more work onto the clients of this interface. */
int MPIR_Sched_next_tag(MPIR_Comm * comm_ptr, int *tag);

/* the device must provide a typedef for MPIR_Sched_t in mpidpre.h */

/* creates a new opaque schedule object and returns a handle to it in (*sp) */
int MPIR_Sched_create(MPIR_Sched_t * sp);
/* clones orig and returns a handle to the new schedule in (*cloned) */
int MPIR_Sched_clone(MPIR_Sched_t orig, MPIR_Sched_t * cloned);
/* sets (*sp) to MPIR_SCHED_NULL and gives you back a request pointer in (*req).
 * The caller is giving up ownership of the opaque schedule object.
 *
 * comm should be the primary (user) communicator with which this collective is
 * associated, even if other hidden communicators are used for a subset of the
 * operations.  It will be used for error handling and similar operations. */
int MPIR_Sched_start(MPIR_Sched_t * sp, MPIR_Comm * comm, int tag, MPIR_Request ** req);
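
/* A minimal usage sketch (illustrative only, not part of this interface) of
 * how a nonblocking collective typically builds and launches a schedule.  The
 * function name and the ring exchange pattern are hypothetical, and cleanup on
 * failure is elided beyond the usual fn_fail convention:
 *
 *     int MPIR_Iexample_sched(const void *sendbuf, void *recvbuf, MPI_Aint count,
 *                             MPI_Datatype datatype, MPIR_Comm *comm_ptr,
 *                             MPIR_Request **req)
 *     {
 *         int mpi_errno = MPI_SUCCESS;
 *         int tag = -1;
 *         MPIR_Sched_t s = MPIR_SCHED_NULL;
 *         int left = (comm_ptr->rank - 1 + comm_ptr->local_size) % comm_ptr->local_size;
 *         int right = (comm_ptr->rank + 1) % comm_ptr->local_size;
 *
 *         mpi_errno = MPIR_Sched_next_tag(comm_ptr, &tag);
 *         MPIR_ERR_CHECK(mpi_errno);
 *         mpi_errno = MPIR_Sched_create(&s);
 *         MPIR_ERR_CHECK(mpi_errno);
 *
 *         mpi_errno = MPIR_Sched_send(sendbuf, count, datatype, right, comm_ptr, s);
 *         MPIR_ERR_CHECK(mpi_errno);
 *         mpi_errno = MPIR_Sched_recv(recvbuf, count, datatype, left, comm_ptr, s);
 *         MPIR_ERR_CHECK(mpi_errno);
 *         mpi_errno = MPIR_Sched_barrier(s);
 *         MPIR_ERR_CHECK(mpi_errno);
 *
 *         mpi_errno = MPIR_Sched_start(&s, comm_ptr, tag, req);
 *         MPIR_ERR_CHECK(mpi_errno);
 *
 *       fn_exit:
 *         return mpi_errno;
 *       fn_fail:
 *         goto fn_exit;
 *     }
 *
 * After MPIR_Sched_start returns, s has been reset to MPIR_SCHED_NULL and the
 * schedule is owned by the request. */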

/* send and recv take a comm ptr to enable hierarchical collectives */
int MPIR_Sched_send(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest,
                    MPIR_Comm * comm, MPIR_Sched_t s);
int MPIR_Sched_recv(void *buf, MPI_Aint count, MPI_Datatype datatype, int src, MPIR_Comm * comm,
                    MPIR_Sched_t s);

/* just like MPI_Issend, can't complete until the matching recv is posted */
int MPIR_Sched_ssend(const void *buf, MPI_Aint count, MPI_Datatype datatype, int dest,
                     MPIR_Comm * comm, MPIR_Sched_t s);

int MPIR_Sched_reduce(const void *inbuf, void *inoutbuf, MPI_Aint count, MPI_Datatype datatype,
                      MPI_Op op, MPIR_Sched_t s);
/* packing/unpacking can be accomplished by passing MPI_PACKED as either intype
 * or outtype */
int MPIR_Sched_copy(const void *inbuf, MPI_Aint incount, MPI_Datatype intype,
                    void *outbuf, MPI_Aint outcount, MPI_Datatype outtype, MPIR_Sched_t s);
/* require that all previously added ops are complete before subsequent ops
 * may begin to execute */
int MPIR_Sched_barrier(MPIR_Sched_t s);

/* A convenience macro for the extremely common case that "mpi_errno" is the
 * variable used for tracking error state and MPIR_ERR_POP is needed.  This
 * declutters the NBC code substantially. */
#define MPIR_SCHED_BARRIER(sched_)                  \
    do {                                            \
        mpi_errno = MPIR_Sched_barrier(sched_);     \
        MPIR_ERR_CHECK(mpi_errno);                  \
    } while (0)

/* Defers evaluating (*count) until the entry actually begins to execute.  This
 * permits algorithms that accumulate/dissipate bytes as rounds progress without
 * excessive (re)calculation of counts for/from other processes.
 *
 * A corresponding _recv_defer function is not currently provided because there
 * is no known use case.  The recv count is just an upper bound, not an exact
 * amount to be received, so an oversized recv is used instead of deferral. */
int MPIR_Sched_send_defer(const void *buf, const MPI_Aint * count, MPI_Datatype datatype, int dest,
                          MPIR_Comm * comm, MPIR_Sched_t s);
/* Just like MPIR_Sched_recv except it populates the given status object with
 * the received count and error information, much like a normal recv.  Often
 * useful in conjunction with MPIR_Sched_send_defer. */
int MPIR_Sched_recv_status(void *buf, MPI_Aint count, MPI_Datatype datatype, int src,
                           MPIR_Comm * comm, MPI_Status * status, MPIR_Sched_t s);

/* buffer management, fancy reductions, etc */
int MPIR_Sched_cb(MPIR_Sched_cb_t * cb_p, void *cb_state, MPIR_Sched_t s);
int MPIR_Sched_cb2(MPIR_Sched_cb2_t * cb_p, void *cb_state, void *cb_state2, MPIR_Sched_t s);

/* TODO: develop a caching infrastructure for use by the upper level as well,
 * hopefully s.t. uthash can be used somehow */

/* common callback utility functions */
int MPIR_Sched_cb_free_buf(MPIR_Comm * comm, int tag, void *state);
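
/* An illustrative sketch (not part of this interface) of scheduling cleanup of
 * a temporary buffer once all entries that use it have completed, assuming
 * MPIR_Sched_cb_t matches the signature of MPIR_Sched_cb_free_buf above and
 * that "s", "mpi_errno", and the fn_fail label are in scope:
 *
 *     void *tmp_buf = MPL_malloc(nbytes, MPL_MEM_BUFFER);
 *     ... schedule sends/recvs/reduces that use tmp_buf ...
 *     MPIR_SCHED_BARRIER(s);
 *     mpi_errno = MPIR_Sched_cb(&MPIR_Sched_cb_free_buf, tmp_buf, s);
 *     MPIR_ERR_CHECK(mpi_errno);
 *
 * The MPIR_SCHED_CHKPMEM_* macros below automate a similar
 * allocate-then-schedule-cleanup pattern. */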

/* an upgraded version of MPIR_CHKPMEM_MALLOC/_DECL/_REAP/_COMMIT that adds
 * corresponding cleanup callbacks to the given schedule at _COMMIT time */
#define MPIR_SCHED_CHKPMEM_DECL(n_)                                 \
    void *(mpir_sched_chkpmem_stk_[n_]) = { NULL };                 \
    int mpir_sched_chkpmem_stk_sp_ = 0;                             \
    MPIR_AssertDeclValue(const int mpir_sched_chkpmem_stk_sz_, n_)

#define MPIR_SCHED_CHKPMEM_MALLOC_ORSTMT(pointer_,type_,nbytes_,rc_,name_,class_,stmt_) \
    do {                                                                                \
        (pointer_) = (type_) MPL_malloc(nbytes_, class_);                               \
        if (pointer_) {                                                                 \
            MPIR_Assert(mpir_sched_chkpmem_stk_sp_ < mpir_sched_chkpmem_stk_sz_);       \
            mpir_sched_chkpmem_stk_[mpir_sched_chkpmem_stk_sp_++] = (pointer_);         \
        } else if ((nbytes_) > 0) {                                                     \
            MPIR_CHKMEM_SETERR((rc_), (nbytes_), (name_));                              \
            stmt_;                                                                      \
        }                                                                               \
    } while (0)

#define MPIR_SCHED_CHKPMEM_MALLOC(pointer_,type_,nbytes_,rc_,name_,class_) \
    MPIR_SCHED_CHKPMEM_MALLOC_ORSTMT(pointer_,type_,nbytes_,rc_,name_,class_,goto fn_fail)

/* just cleanup, don't add anything to the schedule */
#define MPIR_SCHED_CHKPMEM_REAP(sched_)                                         \
    do {                                                                        \
        while (mpir_sched_chkpmem_stk_sp_ > 0) {                                \
            MPL_free(mpir_sched_chkpmem_stk_[--mpir_sched_chkpmem_stk_sp_]);    \
        }                                                                       \
    } while (0)

#define MPIR_SCHED_CHKPMEM_COMMIT(sched_)                                                       \
    do {                                                                                        \
        MPIR_SCHED_BARRIER(sched_);                                                             \
        while (mpir_sched_chkpmem_stk_sp_ > 0) {                                                \
            mpi_errno = MPIR_Sched_cb(&MPIR_Sched_cb_free_buf,                                  \
                                      (mpir_sched_chkpmem_stk_[--mpir_sched_chkpmem_stk_sp_]),  \
                                      (sched_));                                                \
            MPIR_ERR_CHECK(mpi_errno);                                                          \
        }                                                                                       \
    } while (0)

#endif /* MPIR_NBC_H_INCLUDED */