1 /*
2  * Copyright (c) 2018 Cray Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include "config.h"
34 #include "fi_verbs.h"
35 
vrb_next_xrc_conn_state(struct vrb_xrc_ep * ep)36 void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep)
37 {
38 	switch (ep->conn_state) {
39 	case VRB_XRC_UNCONNECTED:
40 		ep->conn_state = VRB_XRC_ORIG_CONNECTING;
41 		break;
42 	case VRB_XRC_ORIG_CONNECTING:
43 		ep->conn_state = VRB_XRC_ORIG_CONNECTED;
44 		break;
45 	case VRB_XRC_ORIG_CONNECTED:
46 		ep->conn_state = VRB_XRC_RECIP_CONNECTING;
47 		break;
48 	case VRB_XRC_RECIP_CONNECTING:
49 		ep->conn_state = VRB_XRC_CONNECTED;
50 		break;
51 	case VRB_XRC_CONNECTED:
52 	case VRB_XRC_ERROR:
53 		break;
54 	default:
55 		assert(0);
56 		VERBS_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
57 			   ep->conn_state);
58 	}
59 }
60 
vrb_prev_xrc_conn_state(struct vrb_xrc_ep * ep)61 void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep)
62 {
63 	switch (ep->conn_state) {
64 	case VRB_XRC_UNCONNECTED:
65 		break;
66 	case VRB_XRC_ORIG_CONNECTING:
67 		ep->conn_state = VRB_XRC_UNCONNECTED;
68 		break;
69 	case VRB_XRC_ORIG_CONNECTED:
70 		ep->conn_state = VRB_XRC_ORIG_CONNECTING;
71 		break;
72 	case VRB_XRC_RECIP_CONNECTING:
73 		ep->conn_state = VRB_XRC_ORIG_CONNECTED;
74 		break;
75 	case VRB_XRC_CONNECTED:
76 		ep->conn_state = VRB_XRC_RECIP_CONNECTING;
77 		break;
78 	case VRB_XRC_ERROR:
79 		break;
80 	default:
81 		assert(0);
82 		VERBS_WARN(FI_LOG_EP_CTRL, "Unkown XRC connection state %d\n",
83 			   ep->conn_state);
84 	}
85 }
86 
vrb_save_priv_data(struct vrb_xrc_ep * ep,const void * data,size_t len)87 void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data,
88 			   size_t len)
89 {
90 	ep->conn_setup->event_len = MIN(sizeof(ep->conn_setup->event_data),
91 					len);
92 	memcpy(ep->conn_setup->event_data, data, ep->conn_setup->event_len);
93 }
94 
vrb_set_xrc_cm_data(struct vrb_xrc_cm_data * local,int reciprocal,uint32_t conn_tag,uint16_t port,uint32_t tgt_qpn,uint32_t srqn)95 void vrb_set_xrc_cm_data(struct vrb_xrc_cm_data *local, int reciprocal,
96 			    uint32_t conn_tag, uint16_t port, uint32_t tgt_qpn,
97 			    uint32_t srqn)
98 {
99 	local->version = VRB_XRC_VERSION;
100 	local->reciprocal = reciprocal ? 1 : 0;
101 	local->port = htons(port);
102 	local->conn_tag = htonl(conn_tag);
103 	local->tgt_qpn = htonl(tgt_qpn);
104 	local->srqn = htonl(srqn);
105 }
106 
vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data * remote,int private_data_len)107 int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote,
108 			      int private_data_len)
109 {
110 	if (sizeof(*remote) > private_data_len) {
111 		VERBS_WARN(FI_LOG_EP_CTRL,
112 			   "XRC MSG EP CM data length mismatch\n");
113 		return -FI_EINVAL;
114 	}
115 
116 	if (remote->version != VRB_XRC_VERSION) {
117 		VERBS_WARN(FI_LOG_EP_CTRL,
118 			   "XRC MSG EP connection protocol mismatch "
119 			   "(local %"PRIu8", remote %"PRIu8")\n",
120 			   VRB_XRC_VERSION, remote->version);
121 		return -FI_EINVAL;
122 	}
123 	return FI_SUCCESS;
124 }
125 
vrb_log_ep_conn(struct vrb_xrc_ep * ep,char * desc)126 void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc)
127 {
128 	struct sockaddr *addr;
129 	char buf[OFI_ADDRSTRLEN];
130 	size_t len = sizeof(buf);
131 
132 	if (!fi_log_enabled(&vrb_prov, FI_LOG_INFO, FI_LOG_EP_CTRL))
133 		return;
134 
135 	VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, %s\n", ep, desc);
136 	VERBS_INFO(FI_LOG_EP_CTRL,
137 		  "EP %p, CM ID %p, TGT CM ID %p, SRQN %d Peer SRQN %d\n",
138 		  ep, ep->base_ep.id, ep->tgt_id, ep->srqn, ep->peer_srqn);
139 
140 
141 	if (ep->base_ep.id) {
142 		addr = rdma_get_local_addr(ep->base_ep.id);
143 		if (addr) {
144 			ofi_straddr(buf, &len, ep->base_ep.info->addr_format,
145 				    addr);
146 			VERBS_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n",
147 				   ep, buf);
148 		}
149 		addr = rdma_get_peer_addr(ep->base_ep.id);
150 		if (addr) {
151 			len = sizeof(buf);
152 			ofi_straddr(buf, &len, ep->base_ep.info->addr_format,
153 				    addr);
154 			VERBS_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n",
155 				   ep, buf);
156 		}
157 	}
158 
159 	if (ep->base_ep.ibv_qp) {
160 		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, INI QP Num %d\n",
161 			  ep, ep->base_ep.ibv_qp->qp_num);
162 		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, Remote TGT QP Num %d\n", ep,
163 			  ep->ini_conn->tgt_qpn);
164 	}
165 	if (ep->tgt_ibv_qp)
166 		VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, TGT QP Num %d\n",
167 			  ep, ep->tgt_ibv_qp->qp_num);
168 }
169 
170 /* Caller must hold eq:lock */
vrb_free_xrc_conn_setup(struct vrb_xrc_ep * ep,int disconnect)171 void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect)
172 {
173 	assert(ep->conn_setup);
174 
175 	/* If a disconnect is requested then the XRC bidirectional connection
176 	 * has completed and a disconnect sequence is started (the XRC INI QP
177 	 * side disconnect is initiated when the remote target disconnect is
178 	 * received). */
179 	if (disconnect) {
180 		assert(ep->tgt_id);
181 		assert(!ep->tgt_id->qp);
182 
183 		if (ep->tgt_id->ps == RDMA_PS_UDP) {
184 			rdma_destroy_id(ep->tgt_id);
185 			ep->tgt_id = NULL;
186 		} else {
187 			rdma_disconnect(ep->tgt_id);
188 		}
189 
190 		if (ep->base_ep.id->ps == RDMA_PS_UDP) {
191 			rdma_destroy_id(ep->base_ep.id);
192 			ep->base_ep.id = NULL;
193 		}
194 	}
195 
196 	vrb_eq_clear_xrc_conn_tag(ep);
197 	if (!disconnect) {
198 		free(ep->conn_setup);
199 		ep->conn_setup = NULL;
200 	}
201 }
202 
203 /* Caller must hold the eq:lock */
vrb_connect_xrc(struct vrb_xrc_ep * ep,struct sockaddr * addr,int reciprocal,void * param,size_t paramlen)204 int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr,
205 		       int reciprocal, void *param, size_t paramlen)
206 {
207 	int ret;
208 
209 	assert(!ep->base_ep.id && !ep->base_ep.ibv_qp && !ep->ini_conn);
210 
211 	ret = vrb_get_shared_ini_conn(ep, &ep->ini_conn);
212 	if (ret) {
213 		VERBS_WARN(FI_LOG_EP_CTRL,
214 			   "Get of shared XRC INI connection failed %d\n", ret);
215 		if (!reciprocal) {
216 			free(ep->conn_setup);
217 			ep->conn_setup = NULL;
218 		}
219 		return ret;
220 	}
221 
222 	vrb_eq_set_xrc_conn_tag(ep);
223 	vrb_add_pending_ini_conn(ep, reciprocal, param, paramlen);
224 	vrb_sched_ini_conn(ep->ini_conn);
225 
226 	return FI_SUCCESS;
227 }
228 
229 /* Caller must hold the eq:lock */
vrb_ep_ini_conn_done(struct vrb_xrc_ep * ep,uint32_t tgt_qpn)230 void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
231 {
232 	assert(ep->base_ep.id && ep->ini_conn);
233 
234 	assert(ep->ini_conn->state == VRB_INI_QP_CONNECTING ||
235 	       ep->ini_conn->state == VRB_INI_QP_CONNECTED);
236 
237 	/* If this was a physical INI/TGT QP connection, remove the QP
238 	 * from control of the RDMA CM. We don't want the shared INI QP
239 	 * to be destroyed if this endpoint closes. */
240 	if (ep->base_ep.id == ep->ini_conn->phys_conn_id) {
241 		ep->ini_conn->phys_conn_id = NULL;
242 		ep->ini_conn->state = VRB_INI_QP_CONNECTED;
243 		ep->ini_conn->tgt_qpn = tgt_qpn;
244 		ep->base_ep.id->qp = NULL;
245 		VERBS_DBG(FI_LOG_EP_CTRL,
246 			  "Set INI Conn QP %d remote TGT QP %d\n",
247 			  ep->ini_conn->ini_qp->qp_num,
248 			  ep->ini_conn->tgt_qpn);
249 	}
250 
251 	vrb_log_ep_conn(ep, "INI Connection Done");
252 	vrb_sched_ini_conn(ep->ini_conn);
253 }
254 
255 /* Caller must hold the eq:lock */
vrb_ep_ini_conn_rejected(struct vrb_xrc_ep * ep)256 void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep)
257 {
258 	assert(ep->base_ep.id && ep->ini_conn);
259 
260 	vrb_log_ep_conn(ep, "INI Connection Rejected");
261 	vrb_put_shared_ini_conn(ep);
262 	ep->conn_state = VRB_XRC_ERROR;
263 }
264 
vrb_ep_tgt_conn_done(struct vrb_xrc_ep * ep)265 void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *ep)
266 {
267 	vrb_log_ep_conn(ep, "TGT Connection Done\n");
268 
269 	if (ep->tgt_id->qp) {
270 		assert(ep->tgt_ibv_qp == ep->tgt_id->qp);
271 		ep->tgt_id->qp = NULL;
272 	}
273 }
274 
275 /* Caller must hold the eq:lock */
vrb_resend_shared_accept_xrc(struct vrb_xrc_ep * ep,struct vrb_connreq * connreq,struct rdma_cm_id * id)276 int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep,
277 				    struct vrb_connreq *connreq,
278 				    struct rdma_cm_id *id)
279 {
280 	struct rdma_conn_param conn_param = { 0 };
281 	struct vrb_xrc_cm_data *cm_data = ep->accept_param_data;
282 
283 	assert(cm_data && ep->tgt_ibv_qp);
284 	assert(ep->tgt_ibv_qp->qp_num == connreq->xrc.tgt_qpn);
285 	assert(ep->peer_srqn == connreq->xrc.peer_srqn);
286 
287 	vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal,
288 			       connreq->xrc.conn_tag, connreq->xrc.port,
289 			       0, ep->srqn);
290 	conn_param.private_data = cm_data;
291 	conn_param.private_data_len = ep->accept_param_len;
292 
293 	conn_param.responder_resources = RDMA_MAX_RESP_RES;
294 	conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
295 	conn_param.flow_control = 1;
296 	conn_param.rnr_retry_count = 7;
297 	if (ep->base_ep.srq_ep)
298 		conn_param.srq = 1;
299 	conn_param.qp_num = ep->tgt_ibv_qp->qp_num;
300 
301 	return rdma_accept(id, &conn_param);
302 }
303 
304 /* Caller must hold the eq:lock */
vrb_accept_xrc(struct vrb_xrc_ep * ep,int reciprocal,void * param,size_t paramlen)305 int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal,
306 		      void *param, size_t paramlen)
307 {
308 	struct sockaddr *addr;
309 	struct vrb_connreq *connreq;
310 	struct rdma_conn_param conn_param = { 0 };
311 	struct vrb_xrc_cm_data *cm_data = param;
312 	struct vrb_xrc_cm_data connect_cm_data;
313 	int ret;
314 
315 	addr = rdma_get_local_addr(ep->tgt_id);
316 	if (addr)
317 		ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "src_addr", addr);
318 
319 	addr = rdma_get_peer_addr(ep->tgt_id);
320 	if (addr)
321 		ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "dest_addr", addr);
322 
323 	connreq = container_of(ep->base_ep.info->handle,
324 			       struct vrb_connreq, handle);
325 	ret = vrb_ep_create_tgt_qp(ep, connreq->xrc.tgt_qpn);
326 	if (ret)
327 		return ret;
328 
329 	ep->peer_srqn = connreq->xrc.peer_srqn;
330 	ep->remote_pep_port = connreq->xrc.port;
331 	ep->recip_accept = connreq->xrc.is_reciprocal;
332 	vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal,
333 			       connreq->xrc.conn_tag, connreq->xrc.port,
334 			       0, ep->srqn);
335 	conn_param.private_data = cm_data;
336 	conn_param.private_data_len = paramlen;
337 	conn_param.responder_resources = RDMA_MAX_RESP_RES;
338 	conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
339 	conn_param.flow_control = 1;
340 	conn_param.rnr_retry_count = 7;
341 	if (ep->base_ep.srq_ep)
342 		conn_param.srq = 1;
343 
344 	if (!ep->tgt_id->qp)
345 		conn_param.qp_num = ep->tgt_ibv_qp->qp_num;
346 
347 	ep->conn_setup->remote_conn_tag = connreq->xrc.conn_tag;
348 
349 	assert(ep->conn_state == VRB_XRC_UNCONNECTED ||
350 	       ep->conn_state == VRB_XRC_ORIG_CONNECTED);
351 	vrb_next_xrc_conn_state(ep);
352 
353 	ret = rdma_accept(ep->tgt_id, &conn_param);
354 	if (OFI_UNLIKELY(ret)) {
355 		ret = -errno;
356 		VERBS_WARN(FI_LOG_EP_CTRL,
357 			   "XRC TGT, rdma_accept error %d\n", ret);
358 		vrb_prev_xrc_conn_state(ep);
359 		return ret;
360 	}
361 	free(connreq);
362 
363 	if (ep->tgt_id->ps == RDMA_PS_UDP &&
364 	    vrb_eq_add_sidr_conn(ep, cm_data, paramlen))
365 		VERBS_WARN(FI_LOG_EP_CTRL,
366 			   "SIDR connection accept not added to map\n");
367 
368 	/* The passive side of the initial shared connection using
369 	 * SIDR is complete, initiate reciprocal connection */
370 	if (ep->tgt_id->ps == RDMA_PS_UDP && !reciprocal) {
371 		vrb_next_xrc_conn_state(ep);
372 		vrb_ep_tgt_conn_done(ep);
373 		ret = vrb_connect_xrc(ep, NULL, VRB_RECIP_CONN,
374 					 &connect_cm_data,
375 					 sizeof(connect_cm_data));
376 		if (ret) {
377 			VERBS_WARN(FI_LOG_EP_CTRL,
378 				   "XRC reciprocal connect error %d\n", ret);
379 			vrb_prev_xrc_conn_state(ep);
380 			ep->tgt_id->qp = NULL;
381 		}
382 	}
383 
384 	return ret;
385 }
386 
vrb_process_xrc_connreq(struct vrb_ep * ep,struct vrb_connreq * connreq)387 int vrb_process_xrc_connreq(struct vrb_ep *ep,
388 			       struct vrb_connreq *connreq)
389 {
390 	struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep,
391 						    base_ep);
392 
393 	assert(ep->info->src_addr);
394 	assert(ep->info->dest_addr);
395 
396 	xrc_ep->conn_setup = calloc(1, sizeof(*xrc_ep->conn_setup));
397 	if (!xrc_ep->conn_setup) {
398 		VERBS_WARN(FI_LOG_EP_CTRL,
399 			  "Unable to allocate connection setup memory\n");
400 		return -FI_ENOMEM;
401 	}
402 	xrc_ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID;
403 
404 	/* This endpoint was created on the passive side of a connection
405 	 * request. The reciprocal connection request will go back to the
406 	 * passive port indicated by the active side */
407 	ofi_addr_set_port(ep->info->src_addr, 0);
408 	ofi_addr_set_port(ep->info->dest_addr, connreq->xrc.port);
409 	xrc_ep->tgt_id = connreq->id;
410 	xrc_ep->tgt_id->context = &ep->util_ep.ep_fid.fid;
411 
412 	return FI_SUCCESS;
413 }
414