/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "ofi_impl.h"

/* ------------------------------------------------------------------------ */
/* ofi_wc_to_vc                                                             */
/* This routine converts tag information from an incoming preposted receive */
/* into the VC associated with the message.  During dynamic task management */
/* a small list of temporary VC's is used while the VC's are created; that  */
/* search is linear, but the list should stay short and its entries are     */
/* eventually destroyed by the upper layers.  Otherwise the tag is split    */
/* into a PG "number", which is a hash of the data contained in the process */
/* group, and a source rank.  The source/pg number pair is enough to look   */
/* up the VC.                                                               */
/* ------------------------------------------------------------------------ */
static inline MPIDI_VC_t *ofi_wc_to_vc(cq_tagged_entry_t * wc)
{
    int pgid = 0, port = 0;
    MPIDI_VC_t *vc = NULL;
    MPIDI_PG_t *pg = NULL;
    uint64_t match_bits = wc->tag;
    int wc_pgid;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_OFI_TAG_TO_VC);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_OFI_TAG_TO_VC);
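    /* API_SET_1 encodes the PG id in the match bits themselves; later API
     * sets deliver it in the completion entry's remote CQ data field.
     */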
    if (gl_data.api_set == API_SET_1) {
        wc_pgid = get_pgid(match_bits);
    } else {
        wc_pgid = wc->data;
    }

    if (NO_PGID == wc_pgid) {
        /* -------------------------------------------------------------------- */
        /* Dynamic path -- this uses a linear search, but the number of cm      */
        /* vc's is small and they should be ephemeral.  The lookup is therefore */
        /* fast, and it is not normally on the critical path anyway.            */
        /* -------------------------------------------------------------------- */
        port = get_port(match_bits);
        vc = gl_data.cm_vcs;
        while (vc && vc->port_name_tag != port) {
            vc = VC_OFI(vc)->next;
        }
        if (NULL == vc) {
            MPIR_Assertp(0);
        }
    } else {
        /* -------------------------------------------------------------------- */
        /* If there are no connection management VC's, this is the normal path. */
        /* Generate the PG number hash for each known process group and compare */
        /* it to the pg number in the tag.  The number of PG's should be small. */
        /* -------------------------------------------------------------------- */
        pg = gl_data.pg_p;
        while (pg) {
            MPIDI_PG_IdToNum(pg, &pgid);
            if (wc_pgid == pgid) {
                break;
            }
            pg = pg->next;
        }
        if (pg) {
            MPIDI_PG_Get_vc(pg, get_psource(match_bits), &vc);
        } else {
            MPIR_Assert(0);
        }
    }
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_OFI_TAG_TO_VC);
    return vc;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_conn_req_callback                                           */
/* A new process has been created and is connected to the current world.    */
/* The address of the new process is exchanged via the business card        */
/* instead of being exchanged up front during the creation of the first     */
/* world.  The new connection routine is usually invoked when two worlds    */
/* are started via dynamic tasking.                                         */
/* This routine:                                                            */
/*     * reposts the persistent connection management receive request       */
/*     * mallocs/creates/initializes the VC                                 */
/*     * grabs the address name from the business card                      */
/*     * uses fi_av_insert to insert the addr into the address vector       */
/* This is marked as a "connection management" vc, and may be destroyed     */
/* by the upper layers.  We handle the cm vc's slightly differently than    */
/* other VC's because they may not be part of a process group.              */
/* ------------------------------------------------------------------------ */
static inline int MPID_nem_ofi_conn_req_callback(cq_tagged_entry_t * wc, MPIR_Request * rreq)
{
    int ret, len, mpi_errno = MPI_SUCCESS;
    char bc[MPIDI_OFI_KVSAPPSTRLEN];

    MPIDI_VC_t *vc;
    char *addr = NULL;
    fi_addr_t direct_addr;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_CONN_REQ_CALLBACK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_CONN_REQ_CALLBACK);

    MPIR_Memcpy(bc, rreq->dev.user_buf, wc->len);
    bc[wc->len] = '\0';
    MPIR_Assert(gl_data.conn_req == rreq);
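    /* Repost the persistent connection-request receive right away so the
     * next incoming connection request can be matched while this one is
     * being processed.
     */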
    FI_RC_RETRY(fi_trecv(gl_data.endpoint,
                         gl_data.conn_req->dev.user_buf,
                         MPIDI_OFI_KVSAPPSTRLEN,
                         gl_data.mr,
                         FI_ADDR_UNSPEC,
                         MPIDI_OFI_CONN_REQ,
                         GET_RCD_IGNORE_MASK(),
                         (void *) &(REQ_OFI(gl_data.conn_req)->ofi_context)), trecv);

    addr = MPL_malloc(gl_data.bound_addrlen, MPL_MEM_ADDRESS);
    MPIR_Assertp(addr);

    vc = MPL_malloc(sizeof(MPIDI_VC_t), MPL_MEM_ADDRESS);
    MPIR_Assertp(vc);

    MPIDI_VC_Init(vc, NULL, 0);
    MPIDI_CH3I_NM_OFI_RC(MPIDI_GetTagFromPort(bc, &vc->port_name_tag));
    ret = MPL_str_get_binary_arg(bc, "OFI", addr, gl_data.bound_addrlen, &len);
    MPIR_ERR_CHKANDJUMP((ret != MPL_SUCCESS && ret != MPL_ERR_STR_NOMEM) ||
                        (size_t) len != gl_data.bound_addrlen,
                        mpi_errno, MPI_ERR_OTHER, "**business_card");

    FI_RC(fi_av_insert(gl_data.av, addr, 1, &direct_addr, 0ULL, NULL), avmap);
    VC_OFI(vc)->direct_addr = direct_addr;
    VC_OFI(vc)->ready = 1;
    VC_OFI(vc)->is_cmvc = 1;
    VC_OFI(vc)->next = gl_data.cm_vcs;
    gl_data.cm_vcs = vc;

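    /* Hand the new VC to CH3's accept queue; an accept on the port named
     * by port_name_tag will pick it up from there.
     */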
    MPIDI_CH3I_Acceptq_enqueue(vc, vc->port_name_tag);
    MPIDI_CH3I_INCR_PROGRESS_COMPLETION_COUNT;
  fn_exit:
    MPL_free(addr);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_CONN_REQ_CALLBACK);
    return mpi_errno;
  fn_fail:
    MPL_free(vc);
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_handle_packet                                               */
/* The "parent" request tracks the state of the entire rendezvous.          */
/* As "child" requests complete, the cc counter is decremented.             */
/* Notify CH3 that we have an incoming packet (if cc hits 1); otherwise     */
/* decrement the ref counter via request completion.                        */
/* ------------------------------------------------------------------------ */
static inline int MPID_nem_ofi_handle_packet(cq_tagged_entry_t * wc ATTRIBUTE((unused)),
                                             MPIR_Request * rreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t *vc;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_HANDLE_PACKET);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_HANDLE_PACKET);
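    /* A completion count of 1 means this is the last outstanding child
     * event: the whole payload is in the pack buffer and can go to CH3.
     */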
    if (MPIR_cc_get(rreq->cc) == 1) {
        vc = REQ_OFI(rreq)->vc;
        MPIR_Assert(vc);
        MPIDI_CH3I_NM_OFI_RC(MPID_nem_handle_pkt
                             (vc, REQ_OFI(rreq)->pack_buffer, REQ_OFI(rreq)->pack_buffer_size));
        MPL_free(REQ_OFI(rreq)->pack_buffer);
    }
    MPIDI_CH3I_NM_OFI_RC(MPID_Request_complete(rreq));
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_HANDLE_PACKET);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_cts_send_callback                                           */
/* A wrapper around MPID_nem_ofi_handle_packet that decrements              */
/* the parent request's counter, and cleans up the CTS request              */
/* ------------------------------------------------------------------------ */
static inline int MPID_nem_ofi_cts_send_callback(cq_tagged_entry_t * wc, MPIR_Request * sreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_CTS_SEND_CALLBACK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_CTS_SEND_CALLBACK);
    MPIDI_CH3I_NM_OFI_RC(MPID_nem_ofi_handle_packet(wc, REQ_OFI(sreq)->parent));
    MPIDI_CH3I_NM_OFI_RC(MPID_Request_complete(sreq));
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_CTS_SEND_CALLBACK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_preposted_callback                                          */
/* This callback handles incoming "SendContig" messages (see ofi_msg.c)     */
/* for the send routines.  This implements the CTS response and the RTS     */
/* handler.  The steps are as follows:                                      */
/*   * Create a parent data request and post a receive into a pack buffer   */
/*   * Create a child request and send the CTS packet                       */
/*   * Re-post the RTS receive and handler to handle the next message       */
/* ------------------------------------------------------------------------ */
static inline int MPID_nem_ofi_preposted_callback(cq_tagged_entry_t * wc, MPIR_Request * rreq)
{
    int c, mpi_errno = MPI_SUCCESS;
    size_t pkt_len;
    char *pack_buffer = NULL;
    MPIDI_VC_t *vc;
    MPIR_Request *new_rreq, *sreq;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_PREPOSTED_CALLBACK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_PREPOSTED_CALLBACK);

    vc = ofi_wc_to_vc(wc);
    MPIR_Assert(vc);
    VC_READY_CHECK(vc);

    pkt_len = REQ_OFI(rreq)->msg_bytes;
    pack_buffer = (char *) MPL_malloc(pkt_len, MPL_MEM_BUFFER);
    /* If the pack buffer is NULL, post a zero-byte receive and let OFI
     * handle the truncation in the progress loop.
     */
    if (pack_buffer == NULL)
        pkt_len = 0;
    c = 1;
    MPID_nem_ofi_create_req(&new_rreq, 1);
    MPIR_cc_incr(new_rreq->cc_ptr, &c);
    new_rreq->dev.OnDataAvail = NULL;
    new_rreq->dev.next = NULL;
    REQ_OFI(new_rreq)->event_callback = MPID_nem_ofi_handle_packet;
    REQ_OFI(new_rreq)->vc = vc;
    REQ_OFI(new_rreq)->pack_buffer = pack_buffer;
    REQ_OFI(new_rreq)->pack_buffer_size = pkt_len;
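    /* Post the receive for the payload; it matches the sender's original
     * tag with the CTS|DATA protocol bits added.
     */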
    FI_RC_RETRY(fi_trecv(gl_data.endpoint,
                         REQ_OFI(new_rreq)->pack_buffer,
                         REQ_OFI(new_rreq)->pack_buffer_size,
                         gl_data.mr,
                         VC_OFI(vc)->direct_addr,
                         wc->tag | MPIDI_OFI_MSG_CTS | MPIDI_OFI_MSG_DATA, 0,
                         &(REQ_OFI(new_rreq)->ofi_context)), trecv);

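    /* Send a zero-byte CTS back to the sender; its completion callback
     * decrements the parent data request via cts_send_callback.
     */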
    MPID_nem_ofi_create_req(&sreq, 1);
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.next = NULL;
    REQ_OFI(sreq)->event_callback = MPID_nem_ofi_cts_send_callback;
    REQ_OFI(sreq)->parent = new_rreq;
    FI_RC_RETRY(fi_tsend(gl_data.endpoint,
                         NULL,
                         0,
                         gl_data.mr,
                         VC_OFI(vc)->direct_addr,
                         wc->tag | MPIDI_OFI_MSG_CTS, &(REQ_OFI(sreq)->ofi_context)), tsend);
    MPIR_Assert(gl_data.persistent_req == rreq);

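    /* Repost the persistent RTS receive so the next rendezvous message
     * can be matched.
     */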
    FI_RC_RETRY(fi_trecv(gl_data.endpoint,
                         &REQ_OFI(rreq)->msg_bytes,
                         sizeof REQ_OFI(rreq)->msg_bytes,
                         gl_data.mr,
                         FI_ADDR_UNSPEC,
                         MPIDI_OFI_MSG_RTS,
                         GET_RCD_IGNORE_MASK(), &(REQ_OFI(rreq)->ofi_context)), trecv);
    /* Return a proper error to MPI to indicate out of memory condition */
    MPIR_ERR_CHKANDJUMP1(pack_buffer == NULL, mpi_errno, MPI_ERR_OTHER,
                         "**nomem", "**nomem %s", "Pack Buffer alloc");
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_PREPOSTED_CALLBACK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_connect_to_root_callback                                    */
/* Complete and clean up the request                                        */
/* ------------------------------------------------------------------------ */
int MPID_nem_ofi_connect_to_root_callback(cq_tagged_entry_t * wc ATTRIBUTE((unused)),
                                          MPIR_Request * sreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_CONNECT_TO_ROOT_CALLBACK);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_CONNECT_TO_ROOT_CALLBACK);

    MPL_free(REQ_OFI(sreq)->pack_buffer);

    MPIDI_CH3I_NM_OFI_RC(MPID_Request_complete(sreq));

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_CONNECT_TO_ROOT_CALLBACK);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_cm_init                                                     */
/* This is a utility routine that sets up persistent connection management  */
/* requests and a persistent data request to handle rendezvous SendContig   */
/* messages.                                                                */
/* ------------------------------------------------------------------------ */
int MPID_nem_ofi_cm_init(MPIDI_PG_t * pg_p, int pg_rank ATTRIBUTE((unused)))
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Request *persistent_req, *conn_req;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_CM_INIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_CM_INIT);

    /* ------------------------------------- */
    /* Set up CH3 and netmod data structures */
    /* ------------------------------------- */
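    /* The any-source hooks differ by API set: API_SET_1 carries the source
     * rank in the match bits, while later API sets carry it as CQ data.
     */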
    if (gl_data.api_set == API_SET_1) {
        MPIDI_CH3I_NM_OFI_RC(MPIDI_CH3I_Register_anysource_notification
                             (MPID_nem_ofi_anysource_posted, MPID_nem_ofi_anysource_matched));
        MPIDI_Anysource_iprobe_fn = MPID_nem_ofi_anysource_iprobe;
        MPIDI_Anysource_improbe_fn = MPID_nem_ofi_anysource_improbe;
    } else {
        MPIDI_CH3I_NM_OFI_RC(MPIDI_CH3I_Register_anysource_notification
                             (MPID_nem_ofi_anysource_posted_2, MPID_nem_ofi_anysource_matched));
        MPIDI_Anysource_iprobe_fn = MPID_nem_ofi_anysource_iprobe_2;
        MPIDI_Anysource_improbe_fn = MPID_nem_ofi_anysource_improbe_2;
    }
    gl_data.pg_p = pg_p;

    /* ------------------------------------- */
    /* Post a persistent request to handle   */
    /* rendezvous (RTS) SendContig messages  */
    /* ------------------------------------- */
    MPID_nem_ofi_create_req(&persistent_req, 1);
    persistent_req->dev.OnDataAvail = NULL;
    persistent_req->dev.next = NULL;
    REQ_OFI(persistent_req)->vc = NULL;
    REQ_OFI(persistent_req)->event_callback = MPID_nem_ofi_preposted_callback;
    FI_RC_RETRY(fi_trecv(gl_data.endpoint,
                         &REQ_OFI(persistent_req)->msg_bytes,
                         sizeof REQ_OFI(persistent_req)->msg_bytes,
                         gl_data.mr,
                         FI_ADDR_UNSPEC,
                         MPIDI_OFI_MSG_RTS,
                         GET_RCD_IGNORE_MASK(),
                         (void *) &(REQ_OFI(persistent_req)->ofi_context)), trecv);
    gl_data.persistent_req = persistent_req;

    /* --------------------------------- */
    /* Post recv for connection requests */
    /* --------------------------------- */
    MPID_nem_ofi_create_req(&conn_req, 1);
    conn_req->dev.user_buf = MPL_malloc(MPIDI_OFI_KVSAPPSTRLEN * sizeof(char), MPL_MEM_BUFFER);
    conn_req->dev.OnDataAvail = NULL;
    conn_req->dev.next = NULL;
    REQ_OFI(conn_req)->vc = NULL;       /* We don't know the source yet */
    REQ_OFI(conn_req)->event_callback = MPID_nem_ofi_conn_req_callback;
    FI_RC_RETRY(fi_trecv(gl_data.endpoint,
                         conn_req->dev.user_buf,
                         MPIDI_OFI_KVSAPPSTRLEN,
                         gl_data.mr,
                         FI_ADDR_UNSPEC,
                         MPIDI_OFI_CONN_REQ,
                         GET_RCD_IGNORE_MASK(), (void *) &(REQ_OFI(conn_req)->ofi_context)), trecv);
    gl_data.conn_req = conn_req;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_CM_INIT);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_cm_finalize                                                 */
/* Clean up and cancel the requests initiated by the cm_init routine        */
/* ------------------------------------------------------------------------ */
int MPID_nem_ofi_cm_finalize(void)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_CM_FINALIZE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_CM_FINALIZE);
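    /* Cancel each persistent receive, mark its status cancelled with a
     * zero count, and complete it so the request object is released.
     */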
    FI_RC(fi_cancel((fid_t) gl_data.endpoint,
                    &(REQ_OFI(gl_data.persistent_req)->ofi_context)), cancel);
    MPIR_STATUS_SET_CANCEL_BIT(gl_data.persistent_req->status, TRUE);
    MPIR_STATUS_SET_COUNT(gl_data.persistent_req->status, 0);
    MPIDI_CH3I_NM_OFI_RC(MPID_Request_complete(gl_data.persistent_req));

    FI_RC(fi_cancel((fid_t) gl_data.endpoint, &(REQ_OFI(gl_data.conn_req)->ofi_context)), cancel);
    MPL_free(gl_data.conn_req->dev.user_buf);
    MPIR_STATUS_SET_CANCEL_BIT(gl_data.conn_req->status, TRUE);
    MPIR_STATUS_SET_COUNT(gl_data.conn_req->status, 0);
    MPIDI_CH3I_NM_OFI_RC(MPID_Request_complete(gl_data.conn_req));
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_CM_FINALIZE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_vc_connect                                                  */
/* Handle CH3/Nemesis VC connections                                        */
/*   * Query the VC address information.  In particular we are looking for  */
/*     the fabric address name.                                             */
/*   * Use fi_av_insert to register the address name with OFI               */
/* ------------------------------------------------------------------------ */
int MPID_nem_ofi_vc_connect(MPIDI_VC_t * vc)
{
    int len, ret, mpi_errno = MPI_SUCCESS;
    char bc[MPIDI_OFI_KVSAPPSTRLEN], *addr = NULL;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_VC_CONNECT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_VC_CONNECT);
    addr = MPL_malloc(gl_data.bound_addrlen, MPL_MEM_ADDRESS);
    MPIR_Assert(addr);
    MPIR_Assert(1 != VC_OFI(vc)->ready);

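    /* A VC without a process group (or connection-info hook) has no
     * business card to query; cm VC's get their addresses through the
     * dynamic connection paths instead.
     */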
    if (!vc->pg || !vc->pg->getConnInfo) {
        goto fn_exit;
    }

    MPIDI_CH3I_NM_OFI_RC(vc->pg->getConnInfo(vc->pg_rank, bc, MPIDI_OFI_KVSAPPSTRLEN, vc->pg));
    ret = MPL_str_get_binary_arg(bc, "OFI", addr, gl_data.bound_addrlen, &len);
    MPIR_ERR_CHKANDJUMP((ret != MPL_SUCCESS && ret != MPL_ERR_STR_NOMEM) ||
                        (size_t) len != gl_data.bound_addrlen,
                        mpi_errno, MPI_ERR_OTHER, "**business_card");
    FI_RC(fi_av_insert(gl_data.av, addr, 1, &(VC_OFI(vc)->direct_addr), 0ULL, NULL), avmap);
    VC_OFI(vc)->ready = 1;

  fn_exit:
    MPL_free(addr);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_VC_CONNECT);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}

int MPID_nem_ofi_vc_init(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_CH3I_VC *const vc_ch = &vc->ch;
    MPID_nem_ofi_vc_t *const vc_ofi = VC_OFI(vc);

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_VC_INIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_VC_INIT);
    vc->sendNoncontig_fn = MPID_nem_ofi_SendNoncontig;
    vc_ch->iStartContigMsg = MPID_nem_ofi_iStartContigMsg;
    vc_ch->iSendContig = MPID_nem_ofi_iSendContig;
    vc_ch->iSendIov = MPID_nem_ofi_iSendIov;
    vc_ch->next = NULL;
    vc_ch->prev = NULL;
    vc_ofi->is_cmvc = 0;
    vc->comm_ops = &_g_comm_ops;

    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

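    /* A VC created without a process group comes from dynamic process
     * management; track it as a connection management (cm) VC.
     */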
    if (NULL == vc->pg) {
        vc_ofi->is_cmvc = 1;
    }
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_VC_INIT);
    return mpi_errno;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_vc_destroy                                                  */
/* MPID_nem_ofi_vc_terminate                                                */
/* TODO:  Verify this code has no leaks                                     */
/* ------------------------------------------------------------------------ */
int MPID_nem_ofi_vc_destroy(MPIDI_VC_t * vc)
{
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_VC_DESTROY);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_VC_DESTROY);
    if (gl_data.cm_vcs && vc && (VC_OFI(vc)->is_cmvc == 1)) {
        if (vc->pg != NULL) {
            printf("ERROR: VC Destroy (%p) pg = %s\n", vc, (char *) vc->pg->id);
        }
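        /* Unlink the VC from the singly linked cm list: it is either the
         * head of the list or the successor of some earlier entry.
         */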
        MPIDI_VC_t *prev = gl_data.cm_vcs;
        while (prev && prev != vc && VC_OFI(prev)->next != vc) {
            prev = VC_OFI(prev)->next;
        }

        MPIR_Assert(prev != NULL);

        if (VC_OFI(prev)->next == vc) {
            VC_OFI(prev)->next = VC_OFI(vc)->next;
        } else if (vc == gl_data.cm_vcs) {
            gl_data.cm_vcs = VC_OFI(vc)->next;
        } else {
            MPIR_Assert(0);
        }
    }
    VC_OFI(vc)->ready = 0;
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_VC_DESTROY);
    return MPI_SUCCESS;
}

int MPID_nem_ofi_vc_terminate(MPIDI_VC_t * vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_VC_TERMINATE);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_VC_TERMINATE);
    MPIDI_CH3I_NM_OFI_RC(MPIDI_CH3U_Handle_connection(vc, MPIDI_VC_EVENT_TERMINATED));
    VC_OFI(vc)->ready = 0;
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_VC_TERMINATE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}

/* ------------------------------------------------------------------------ */
/* MPID_nem_ofi_connect_to_root                                             */
/*  * A new unconnected VC (cm/ephemeral VC) has been created.  This code   */
/*    connects the new VC to a rank in another process group.  The parent   */
/*    address is obtained by an out of band method and given to this        */
/*    routine as a business card.                                           */
/*  * Read the business card address and insert the address                 */
/*  * Send a connection request to the parent.  The parent has posted a     */
/*    persistent request to handle incoming connection requests.            */
/*    The connect message has the child's business card.                    */
/*  * Add the new VC to the list of ephemeral VC's (cm_vc's).  These VC's   */
/*    are not part of the process group, so they require special handling   */
/*    during the SendContig family of routines.                             */
/* ------------------------------------------------------------------------ */
int MPID_nem_ofi_connect_to_root(const char *business_card, MPIDI_VC_t * new_vc)
{
    int len, ret, mpi_errno = MPI_SUCCESS, str_errno = MPI_SUCCESS;
    int my_bc_len = MPIDI_OFI_KVSAPPSTRLEN;
    char *addr = NULL, *bc = NULL, *my_bc = NULL;
    MPIR_Request *sreq;
    uint64_t conn_req_send_bits;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_NM_CONNECT_TO_ROOT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_NM_CONNECT_TO_ROOT);
    addr = MPL_malloc(gl_data.bound_addrlen, MPL_MEM_ADDRESS);
    bc = MPL_malloc(MPIDI_OFI_KVSAPPSTRLEN, MPL_MEM_ADDRESS);
    MPIR_Assertp(addr);
    MPIR_Assertp(bc);
    my_bc = bc;
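    /* Sanity-check the port string: a well-formed one should begin with
     * its "tag" argument.
     */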
    if (!business_card || business_card[0] != 't') {
        mpi_errno = MPI_ERR_OTHER;
        goto fn_fail;
    }
    MPIDI_CH3I_NM_OFI_RC(MPIDI_GetTagFromPort(business_card, &new_vc->port_name_tag));
    ret = MPL_str_get_binary_arg(business_card, "OFI", addr, gl_data.bound_addrlen, &len);
    MPIR_ERR_CHKANDJUMP((ret != MPL_SUCCESS && ret != MPL_ERR_STR_NOMEM) ||
                        (size_t) len != gl_data.bound_addrlen,
                        mpi_errno, MPI_ERR_OTHER, "**business_card");
    FI_RC(fi_av_insert(gl_data.av, addr, 1, &(VC_OFI(new_vc)->direct_addr), 0ULL, NULL), avmap);

    VC_OFI(new_vc)->ready = 1;
    str_errno = MPL_str_add_int_arg(&bc, &my_bc_len, "tag", new_vc->port_name_tag);
    MPIR_ERR_CHKANDJUMP(str_errno, mpi_errno, MPI_ERR_OTHER, "**argstr_port_name_tag");
    MPIDI_CH3I_NM_OFI_RC(MPID_nem_ofi_get_business_card
                         (MPIR_Process.comm_world->rank, &bc, &my_bc_len));
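    /* The MPL_str_add_* helpers decrement my_bc_len by the space they
     * consume, so the bytes actually used are the original size minus
     * what remains.
     */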
    my_bc_len = MPIDI_OFI_KVSAPPSTRLEN - my_bc_len;

    MPID_nem_ofi_create_req(&sreq, 1);
    sreq->kind = MPIR_REQUEST_KIND__SEND;
    sreq->dev.OnDataAvail = NULL;
    sreq->dev.next = NULL;
    REQ_OFI(sreq)->event_callback = MPID_nem_ofi_connect_to_root_callback;
    REQ_OFI(sreq)->pack_buffer = my_bc;
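    /* API_SET_1 packs our world rank into the match bits; later API sets
     * send it as remote CQ data via fi_tsenddata.
     */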
    if (gl_data.api_set == API_SET_1) {
        conn_req_send_bits = init_sendtag(0, MPIR_Process.comm_world->rank, 0, MPIDI_OFI_CONN_REQ);
        FI_RC_RETRY(fi_tsend(gl_data.endpoint,
                             REQ_OFI(sreq)->pack_buffer,
                             my_bc_len,
                             gl_data.mr,
                             VC_OFI(new_vc)->direct_addr,
                             conn_req_send_bits, &(REQ_OFI(sreq)->ofi_context)), tsend);
    } else {
        conn_req_send_bits = init_sendtag_2(0, 0, MPIDI_OFI_CONN_REQ);
        FI_RC_RETRY(fi_tsenddata(gl_data.endpoint,
                                 REQ_OFI(sreq)->pack_buffer,
                                 my_bc_len,
                                 gl_data.mr,
                                 MPIR_Process.comm_world->rank,
                                 VC_OFI(new_vc)->direct_addr,
                                 conn_req_send_bits, &(REQ_OFI(sreq)->ofi_context)), tsend);
    }
    MPID_nem_ofi_poll(MPID_NONBLOCKING_POLL);
    VC_OFI(new_vc)->is_cmvc = 1;
    VC_OFI(new_vc)->next = gl_data.cm_vcs;
    gl_data.cm_vcs = new_vc;
  fn_exit:
    MPL_free(addr);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_NM_CONNECT_TO_ROOT);
    return mpi_errno;
  fn_fail:
    MPL_free(my_bc);
    goto fn_exit;
}

int MPID_nem_ofi_get_business_card(int my_rank ATTRIBUTE((unused)),
                                   char **bc_val_p, int *val_max_sz_p)
{
    int mpi_errno = MPI_SUCCESS, str_errno = MPL_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_OFI_GET_BUSINESS_CARD);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_OFI_GET_BUSINESS_CARD);
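    /* Publish the raw bound endpoint address under the "OFI" key; peers
     * pass these bytes straight to fi_av_insert when they connect.
     */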
    str_errno = MPL_str_add_binary_arg(bc_val_p,
                                       val_max_sz_p,
                                       "OFI", (char *) &gl_data.bound_addr, gl_data.bound_addrlen);
    if (str_errno) {
        MPIR_ERR_CHKANDJUMP(str_errno == MPL_ERR_STR_NOMEM, mpi_errno, MPI_ERR_OTHER,
                            "**buscard_len");
        MPIR_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**buscard");
    }
  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_OFI_GET_BUSINESS_CARD);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}