1 /*
2 * Copyright (c) 2018 Cray Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33 #include "config.h"
34 #include "fi_verbs.h"
35
vrb_next_xrc_conn_state(struct vrb_xrc_ep * ep)36 void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep)
37 {
38 switch (ep->conn_state) {
39 case VRB_XRC_UNCONNECTED:
40 ep->conn_state = VRB_XRC_ORIG_CONNECTING;
41 break;
42 case VRB_XRC_ORIG_CONNECTING:
43 ep->conn_state = VRB_XRC_ORIG_CONNECTED;
44 break;
45 case VRB_XRC_ORIG_CONNECTED:
46 ep->conn_state = VRB_XRC_RECIP_CONNECTING;
47 break;
48 case VRB_XRC_RECIP_CONNECTING:
49 ep->conn_state = VRB_XRC_CONNECTED;
50 break;
51 case VRB_XRC_CONNECTED:
52 case VRB_XRC_ERROR:
53 break;
54 default:
55 assert(0);
56 VERBS_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
57 ep->conn_state);
58 }
59 }
60
vrb_prev_xrc_conn_state(struct vrb_xrc_ep * ep)61 void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep)
62 {
63 switch (ep->conn_state) {
64 case VRB_XRC_UNCONNECTED:
65 break;
66 case VRB_XRC_ORIG_CONNECTING:
67 ep->conn_state = VRB_XRC_UNCONNECTED;
68 break;
69 case VRB_XRC_ORIG_CONNECTED:
70 ep->conn_state = VRB_XRC_ORIG_CONNECTING;
71 break;
72 case VRB_XRC_RECIP_CONNECTING:
73 ep->conn_state = VRB_XRC_ORIG_CONNECTED;
74 break;
75 case VRB_XRC_CONNECTED:
76 ep->conn_state = VRB_XRC_RECIP_CONNECTING;
77 break;
78 case VRB_XRC_ERROR:
79 break;
80 default:
81 assert(0);
82 VERBS_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
83 ep->conn_state);
84 }
85 }
86
vrb_save_priv_data(struct vrb_xrc_ep * ep,const void * data,size_t len)87 void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data,
88 size_t len)
89 {
90 ep->conn_setup->event_len = MIN(sizeof(ep->conn_setup->event_data),
91 len);
92 memcpy(ep->conn_setup->event_data, data, ep->conn_setup->event_len);
93 }
94
vrb_set_xrc_cm_data(struct vrb_xrc_cm_data * local,int reciprocal,uint32_t conn_tag,uint16_t port,uint32_t tgt_qpn,uint32_t srqn)95 void vrb_set_xrc_cm_data(struct vrb_xrc_cm_data *local, int reciprocal,
96 uint32_t conn_tag, uint16_t port, uint32_t tgt_qpn,
97 uint32_t srqn)
98 {
99 local->version = VRB_XRC_VERSION;
100 local->reciprocal = reciprocal ? 1 : 0;
101 local->port = htons(port);
102 local->conn_tag = htonl(conn_tag);
103 local->tgt_qpn = htonl(tgt_qpn);
104 local->srqn = htonl(srqn);
105 }
106
vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data * remote,int private_data_len)107 int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote,
108 int private_data_len)
109 {
110 if (sizeof(*remote) > private_data_len) {
111 VERBS_WARN(FI_LOG_EP_CTRL,
112 "XRC MSG EP CM data length mismatch\n");
113 return -FI_EINVAL;
114 }
115
116 if (remote->version != VRB_XRC_VERSION) {
117 VERBS_WARN(FI_LOG_EP_CTRL,
118 "XRC MSG EP connection protocol mismatch "
119 "(local %"PRIu8", remote %"PRIu8")\n",
120 VRB_XRC_VERSION, remote->version);
121 return -FI_EINVAL;
122 }
123 return FI_SUCCESS;
124 }
125
vrb_log_ep_conn(struct vrb_xrc_ep * ep,char * desc)126 void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
127 {
128 struct sockaddr *addr;
129 char buf[OFI_ADDRSTRLEN];
130 size_t len = sizeof(buf);
131
132 if (!fi_log_enabled(&vrb_prov, FI_LOG_INFO, FI_LOG_EP_CTRL))
133 return;
134
135 VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, %s\n", ep, desc);
136 VERBS_INFO(FI_LOG_EP_CTRL,
137 "EP %p, CM ID %p, TGT CM ID %p, SRQN %d Peer SRQN %d\n",
138 ep, ep->base_ep.id, ep->tgt_id, ep->srqn, ep->peer_srqn);
139
140
141 if (ep->base_ep.id) {
142 addr = rdma_get_local_addr(ep->base_ep.id);
143 if (addr) {
144 ofi_straddr(buf, &len, ep->base_ep.info->addr_format,
145 addr);
146 VERBS_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n",
147 ep, buf);
148 }
149 addr = rdma_get_peer_addr(ep->base_ep.id);
150 if (addr) {
151 len = sizeof(buf);
152 ofi_straddr(buf, &len, ep->base_ep.info->addr_format,
153 addr);
154 VERBS_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n",
155 ep, buf);
156 }
157 }
158
159 if (ep->base_ep.ibv_qp) {
160 VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, INI QP Num %d\n",
161 ep, ep->base_ep.ibv_qp->qp_num);
162 VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, Remote TGT QP Num %d\n", ep,
163 ep->ini_conn->tgt_qpn);
164 }
165 if (ep->tgt_ibv_qp)
166 VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, TGT QP Num %d\n",
167 ep, ep->tgt_ibv_qp->qp_num);
168 }
169
170 /* Caller must hold eq:lock */
vrb_free_xrc_conn_setup(struct vrb_xrc_ep * ep,int disconnect)171 void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect)
172 {
173 assert(ep->conn_setup);
174
175 /* If a disconnect is requested then the XRC bidirectional connection
176 * has completed and a disconnect sequence is started (the XRC INI QP
177 * side disconnect is initiated when the remote target disconnect is
178 * received). */
179 if (disconnect) {
180 assert(ep->tgt_id);
181 assert(!ep->tgt_id->qp);
182
183 if (ep->tgt_id->ps == RDMA_PS_UDP) {
184 rdma_destroy_id(ep->tgt_id);
185 ep->tgt_id = NULL;
186 } else {
187 rdma_disconnect(ep->tgt_id);
188 }
189
190 if (ep->base_ep.id->ps == RDMA_PS_UDP) {
191 rdma_destroy_id(ep->base_ep.id);
192 ep->base_ep.id = NULL;
193 }
194 }
195
196 vrb_eq_clear_xrc_conn_tag(ep);
197 if (!disconnect) {
198 free(ep->conn_setup);
199 ep->conn_setup = NULL;
200 }
201 }
202
203 /* Caller must hold the eq:lock */
vrb_connect_xrc(struct vrb_xrc_ep * ep,struct sockaddr * addr,int reciprocal,void * param,size_t paramlen)204 int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
205 int reciprocal, void *param, size_t paramlen)
206 {
207 int ret;
208
209 assert(!ep->base_ep.id && !ep->base_ep.ibv_qp && !ep->ini_conn);
210
211 ret = vrb_get_shared_ini_conn(ep, &ep->ini_conn);
212 if (ret) {
213 VERBS_WARN(FI_LOG_EP_CTRL,
214 "Get of shared XRC INI connection failed %d\n", ret);
215 if (!reciprocal) {
216 free(ep->conn_setup);
217 ep->conn_setup = NULL;
218 }
219 return ret;
220 }
221
222 vrb_eq_set_xrc_conn_tag(ep);
223 vrb_add_pending_ini_conn(ep, reciprocal, param, paramlen);
224 vrb_sched_ini_conn(ep->ini_conn);
225
226 return FI_SUCCESS;
227 }
228
229 /* Caller must hold the eq:lock */
vrb_ep_ini_conn_done(struct vrb_xrc_ep * ep,uint32_t tgt_qpn)230 void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
231 {
232 assert(ep->base_ep.id && ep->ini_conn);
233
234 assert(ep->ini_conn->state == VRB_INI_QP_CONNECTING ||
235 ep->ini_conn->state == VRB_INI_QP_CONNECTED);
236
237 /* If this was a physical INI/TGT QP connection, remove the QP
238 * from control of the RDMA CM. We don't want the shared INI QP
239 * to be destroyed if this endpoint closes. */
240 if (ep->base_ep.id == ep->ini_conn->phys_conn_id) {
241 ep->ini_conn->phys_conn_id = NULL;
242 ep->ini_conn->state = VRB_INI_QP_CONNECTED;
243 ep->ini_conn->tgt_qpn = tgt_qpn;
244 ep->base_ep.id->qp = NULL;
245 VERBS_DBG(FI_LOG_EP_CTRL,
246 "Set INI Conn QP %d remote TGT QP %d\n",
247 ep->ini_conn->ini_qp->qp_num,
248 ep->ini_conn->tgt_qpn);
249 }
250
251 vrb_log_ep_conn(ep, "INI Connection Done");
252 vrb_sched_ini_conn(ep->ini_conn);
253 }
254
255 /* Caller must hold the eq:lock */
vrb_ep_ini_conn_rejected(struct vrb_xrc_ep * ep)256 void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep)
257 {
258 assert(ep->base_ep.id && ep->ini_conn);
259
260 vrb_log_ep_conn(ep, "INI Connection Rejected");
261 vrb_put_shared_ini_conn(ep);
262 ep->conn_state = VRB_XRC_ERROR;
263 }
264
vrb_ep_tgt_conn_done(struct vrb_xrc_ep * ep)265 void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *ep)
266 {
267 vrb_log_ep_conn(ep, "TGT Connection Done\n");
268
269 if (ep->tgt_id->qp) {
270 assert(ep->tgt_ibv_qp == ep->tgt_id->qp);
271 ep->tgt_id->qp = NULL;
272 }
273 }
274
275 /* Caller must hold the eq:lock */
vrb_resend_shared_accept_xrc(struct vrb_xrc_ep * ep,struct vrb_connreq * connreq,struct rdma_cm_id * id)276 int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep,
277 struct vrb_connreq *connreq,
278 struct rdma_cm_id *id)
279 {
280 struct rdma_conn_param conn_param = { 0 };
281 struct vrb_xrc_cm_data *cm_data = ep->accept_param_data;
282
283 assert(cm_data && ep->tgt_ibv_qp);
284 assert(ep->tgt_ibv_qp->qp_num == connreq->xrc.tgt_qpn);
285 assert(ep->peer_srqn == connreq->xrc.peer_srqn);
286
287 vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal,
288 connreq->xrc.conn_tag, connreq->xrc.port,
289 0, ep->srqn);
290 conn_param.private_data = cm_data;
291 conn_param.private_data_len = ep->accept_param_len;
292
293 conn_param.responder_resources = RDMA_MAX_RESP_RES;
294 conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
295 conn_param.flow_control = 1;
296 conn_param.rnr_retry_count = 7;
297 if (ep->base_ep.srq_ep)
298 conn_param.srq = 1;
299 conn_param.qp_num = ep->tgt_ibv_qp->qp_num;
300
301 return rdma_accept(id, &conn_param);
302 }
303
304 /* Caller must hold the eq:lock */
vrb_accept_xrc(struct vrb_xrc_ep * ep,int reciprocal,void * param,size_t paramlen)305 int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
306 void *param, size_t paramlen)
307 {
308 struct sockaddr *addr;
309 struct vrb_connreq *connreq;
310 struct rdma_conn_param conn_param = { 0 };
311 struct vrb_xrc_cm_data *cm_data = param;
312 struct vrb_xrc_cm_data connect_cm_data;
313 int ret;
314
315 addr = rdma_get_local_addr(ep->tgt_id);
316 if (addr)
317 ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "src_addr", addr);
318
319 addr = rdma_get_peer_addr(ep->tgt_id);
320 if (addr)
321 ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "dest_addr", addr);
322
323 connreq = container_of(ep->base_ep.info->handle,
324 struct vrb_connreq, handle);
325 ret = vrb_ep_create_tgt_qp(ep, connreq->xrc.tgt_qpn);
326 if (ret)
327 return ret;
328
329 ep->peer_srqn = connreq->xrc.peer_srqn;
330 ep->remote_pep_port = connreq->xrc.port;
331 ep->recip_accept = connreq->xrc.is_reciprocal;
332 vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal,
333 connreq->xrc.conn_tag, connreq->xrc.port,
334 0, ep->srqn);
335 conn_param.private_data = cm_data;
336 conn_param.private_data_len = paramlen;
337 conn_param.responder_resources = RDMA_MAX_RESP_RES;
338 conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
339 conn_param.flow_control = 1;
340 conn_param.rnr_retry_count = 7;
341 if (ep->base_ep.srq_ep)
342 conn_param.srq = 1;
343
344 if (!ep->tgt_id->qp)
345 conn_param.qp_num = ep->tgt_ibv_qp->qp_num;
346
347 ep->conn_setup->remote_conn_tag = connreq->xrc.conn_tag;
348
349 assert(ep->conn_state == VRB_XRC_UNCONNECTED ||
350 ep->conn_state == VRB_XRC_ORIG_CONNECTED);
351 vrb_next_xrc_conn_state(ep);
352
353 ret = rdma_accept(ep->tgt_id, &conn_param);
354 if (OFI_UNLIKELY(ret)) {
355 ret = -errno;
356 VERBS_WARN(FI_LOG_EP_CTRL,
357 "XRC TGT, rdma_accept error %d\n", ret);
358 vrb_prev_xrc_conn_state(ep);
359 return ret;
360 }
361 free(connreq);
362
363 if (ep->tgt_id->ps == RDMA_PS_UDP &&
364 vrb_eq_add_sidr_conn(ep, cm_data, paramlen))
365 VERBS_WARN(FI_LOG_EP_CTRL,
366 "SIDR connection accept not added to map\n");
367
368 /* The passive side of the initial shared connection using
369 * SIDR is complete, initiate reciprocal connection */
370 if (ep->tgt_id->ps == RDMA_PS_UDP && !reciprocal) {
371 vrb_next_xrc_conn_state(ep);
372 vrb_ep_tgt_conn_done(ep);
373 ret = vrb_connect_xrc(ep, NULL, VRB_RECIP_CONN,
374 &connect_cm_data,
375 sizeof(connect_cm_data));
376 if (ret) {
377 VERBS_WARN(FI_LOG_EP_CTRL,
378 "XRC reciprocal connect error %d\n", ret);
379 vrb_prev_xrc_conn_state(ep);
380 ep->tgt_id->qp = NULL;
381 }
382 }
383
384 return ret;
385 }
386
vrb_process_xrc_connreq(struct vrb_ep * ep,struct vrb_connreq * connreq)387 int vrb_process_xrc_connreq(struct vrb_ep *ep,
388 struct vrb_connreq *connreq)
389 {
390 struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep,
391 base_ep);
392
393 assert(ep->info->src_addr);
394 assert(ep->info->dest_addr);
395
396 xrc_ep->conn_setup = calloc(1, sizeof(*xrc_ep->conn_setup));
397 if (!xrc_ep->conn_setup) {
398 VERBS_WARN(FI_LOG_EP_CTRL,
399 "Unable to allocate connection setup memory\n");
400 return -FI_ENOMEM;
401 }
402 xrc_ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID;
403
404 /* This endpoint was created on the passive side of a connection
405 * request. The reciprocal connection request will go back to the
406 * passive port indicated by the active side */
407 ofi_addr_set_port(ep->info->src_addr, 0);
408 ofi_addr_set_port(ep->info->dest_addr, connreq->xrc.port);
409 xrc_ep->tgt_id = connreq->id;
410 xrc_ep->tgt_id->context = &ep->util_ep.ep_fid.fid;
411
412 return FI_SUCCESS;
413 }
414