1 /*
2  * Copyright (c) 2006 Mellanox Technologies Ltd.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  *
32  * $Id$
33  */
34 #include "sdp.h"
35 
#define SDP_MAJV_MINV 0x22	/* SDP protocol version: major 2, minor 2. */

/* When non-zero, refuse connections over non-InfiniBand link layers. */
SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 1, "Support only link layer of "
		"type Infiniband");

/*
 * On-the-wire sizes of the SDP Hello (HH) and Hello-Ack (HAH) messages.
 * Not referenced in this file; presumably consumed elsewhere — TODO confirm.
 */
enum {
	SDP_HH_SIZE = 76,
	SDP_HAH_SIZE = 180,
};
45 
/*
 * QP asynchronous event callback.  SDP takes no action on QP events;
 * the empty handler exists so a callback can be supplied in
 * ib_qp_init_attr below.
 */
static void
sdp_qp_event_handler(struct ib_event *event, void *data)
{
}
50 
51 static int
52 sdp_get_max_dev_sge(struct ib_device *dev)
53 {
54 	struct ib_device_attr attr;
55 	static int max_sges = -1;
56 
57 	if (max_sges > 0)
58 		goto out;
59 
60 	ib_query_device(dev, &attr);
61 
62 	max_sges = attr.max_sge;
63 
64 out:
65 	return max_sges;
66 }
67 
68 static int
69 sdp_init_qp(struct socket *sk, struct rdma_cm_id *id)
70 {
71 	struct ib_qp_init_attr qp_init_attr = {
72 		.event_handler = sdp_qp_event_handler,
73 		.cap.max_send_wr = SDP_TX_SIZE,
74 		.cap.max_recv_wr = SDP_RX_SIZE,
75         	.sq_sig_type = IB_SIGNAL_REQ_WR,
76         	.qp_type = IB_QPT_RC,
77 	};
78 	struct ib_device *device = id->device;
79 	struct sdp_sock *ssk;
80 	int rc;
81 
82 	sdp_dbg(sk, "%s\n", __func__);
83 
84 	ssk = sdp_sk(sk);
85 	ssk->max_sge = sdp_get_max_dev_sge(device);
86 	sdp_dbg(sk, "Max sges: %d\n", ssk->max_sge);
87 
88 	qp_init_attr.cap.max_send_sge = MIN(ssk->max_sge, SDP_MAX_SEND_SGES);
89 	sdp_dbg(sk, "Setting max send sge to: %d\n",
90 	    qp_init_attr.cap.max_send_sge);
91 
92 	qp_init_attr.cap.max_recv_sge = MIN(ssk->max_sge, SDP_MAX_RECV_SGES);
93 	sdp_dbg(sk, "Setting max recv sge to: %d\n",
94 	    qp_init_attr.cap.max_recv_sge);
95 
96 	ssk->sdp_dev = ib_get_client_data(device, &sdp_client);
97 	if (!ssk->sdp_dev) {
98 		sdp_warn(sk, "SDP not available on device %s\n", device->name);
99 		rc = -ENODEV;
100 		goto err_rx;
101 	}
102 
103 	rc = sdp_rx_ring_create(ssk, device);
104 	if (rc)
105 		goto err_rx;
106 
107 	rc = sdp_tx_ring_create(ssk, device);
108 	if (rc)
109 		goto err_tx;
110 
111 	qp_init_attr.recv_cq = ssk->rx_ring.cq;
112 	qp_init_attr.send_cq = ssk->tx_ring.cq;
113 
114 	rc = rdma_create_qp(id, ssk->sdp_dev->pd, &qp_init_attr);
115 	if (rc) {
116 		sdp_warn(sk, "Unable to create QP: %d.\n", rc);
117 		goto err_qp;
118 	}
119 	ssk->qp = id->qp;
120 	ssk->ib_device = device;
121 	ssk->qp_active = 1;
122 	ssk->context.device = device;
123 
124 	sdp_dbg(sk, "%s done\n", __func__);
125 	return 0;
126 
127 err_qp:
128 	sdp_tx_ring_destroy(ssk);
129 err_tx:
130 	sdp_rx_ring_destroy(ssk);
131 err_rx:
132 	return rc;
133 }
134 
/*
 * Passive-side handler for an incoming connection request (the peer's
 * SDP Hello arrives in the CM private data): spawn a child socket off
 * the listener, create its QP, and initialize its addresses, credits
 * and transmit size from the Hello header.  Leaves the child in
 * SYN_RECEIVED; the caller replies with a HelloAck via rdma_accept().
 * Returns 0 or a negative errno (caller rejects the CM request).
 */
static int
sdp_connect_handler(struct socket *sk, struct rdma_cm_id *id,
    struct rdma_cm_event *event)
{
	struct sockaddr_in *src_addr;
	struct sockaddr_in *dst_addr;
	struct socket *child;
	const struct sdp_hh *h;
	struct sdp_sock *ssk;
	int rc;

	sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id);

	h = event->param.conn.private_data;
	SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);

	/* A Hello advertising zero outstanding adverts is malformed. */
	if (!h->max_adverts)
		return -EINVAL;

	child = sonewconn(sk, SS_ISCONNECTED);
	if (!child)
		return -ENOMEM;

	ssk = sdp_sk(child);
	rc = sdp_init_qp(child, id);
	if (rc)
		/*
		 * NOTE(review): the child socket from sonewconn() is not
		 * explicitly torn down here — confirm the socket layer
		 * reclaims it, else this path leaks the child.
		 */
		return rc;
	SDP_WLOCK(ssk);
	/* Cross-link the cm_id and the new child socket. */
	id->context = ssk;
	ssk->id = id;
	ssk->socket = child;
	ssk->cred = crhold(child->so_cred);
	dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
	src_addr = (struct sockaddr_in *)&id->route.addr.src_addr;
	ssk->fport = dst_addr->sin_port;
	ssk->faddr = dst_addr->sin_addr.s_addr;
	ssk->lport = src_addr->sin_port;
	/* Initial send credits are the peer's posted receive buffers. */
	ssk->max_bufs = ntohs(h->bsdh.bufs);
	atomic_set(&ssk->tx_ring.credits, ssk->max_bufs);
	ssk->min_bufs = tx_credits(ssk) / 4;
	/* Largest payload the peer will accept in a single message. */
	ssk->xmit_size_goal = ntohl(h->localrcvsz) - sizeof(struct sdp_bsdh);
	sdp_init_buffers(ssk, rcvbuf_initial_size);
	ssk->state = TCPS_SYN_RECEIVED;
	SDP_WUNLOCK(ssk);

	return 0;
}
182 
/*
 * Active-side handler for RDMA_CM_EVENT_CONNECT_RESPONSE: parse the
 * peer's HelloAck (carried in the CM private data), record its buffer
 * credits, receive size and address, and mark the socket connected.
 * Always returns 0; the caller completes the handshake with
 * rdma_accept().
 */
static int
sdp_response_handler(struct socket *sk, struct rdma_cm_id *id,
    struct rdma_cm_event *event)
{
	const struct sdp_hah *h;
	struct sockaddr_in *dst_addr;
	struct sdp_sock *ssk;
	sdp_dbg(sk, "%s\n", __func__);

	ssk = sdp_sk(sk);
	SDP_WLOCK(ssk);
	/* State is advanced before the DROPPED check, deliberately. */
	ssk->state = TCPS_ESTABLISHED;
	sdp_set_default_moderation(ssk);
	/* Socket was dropped while the handshake was in flight. */
	if (ssk->flags & SDP_DROPPED) {
		SDP_WUNLOCK(ssk);
		return 0;
	}
	if (sk->so_options & SO_KEEPALIVE)
		sdp_start_keepalive_timer(sk);
	h = event->param.conn.private_data;
	SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh);
	/* Initial send credits are the peer's posted receive buffers. */
	ssk->max_bufs = ntohs(h->bsdh.bufs);
	atomic_set(&ssk->tx_ring.credits, ssk->max_bufs);
	ssk->min_bufs = tx_credits(ssk) / 4;
	/* Largest payload the peer will accept in a single message. */
	ssk->xmit_size_goal =
		ntohl(h->actrcvsz) - sizeof(struct sdp_bsdh);
	ssk->poll_cq = 1;

	dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr;
	ssk->fport = dst_addr->sin_port;
	ssk->faddr = dst_addr->sin_addr.s_addr;
	soisconnected(sk);
	SDP_WUNLOCK(ssk);

	return 0;
}
219 
220 static int
221 sdp_connected_handler(struct socket *sk, struct rdma_cm_event *event)
222 {
223 	struct sdp_sock *ssk;
224 
225 	sdp_dbg(sk, "%s\n", __func__);
226 
227 	ssk = sdp_sk(sk);
228 	SDP_WLOCK(ssk);
229 	ssk->state = TCPS_ESTABLISHED;
230 
231 	sdp_set_default_moderation(ssk);
232 
233 	if (sk->so_options & SO_KEEPALIVE)
234 		sdp_start_keepalive_timer(sk);
235 
236 	if ((ssk->flags & SDP_DROPPED) == 0)
237 		soisconnected(sk);
238 	SDP_WUNLOCK(ssk);
239 	return 0;
240 }
241 
242 static int
243 sdp_disconnected_handler(struct socket *sk)
244 {
245 	struct sdp_sock *ssk;
246 
247 	ssk = sdp_sk(sk);
248 	sdp_dbg(sk, "%s\n", __func__);
249 
250 	SDP_WLOCK_ASSERT(ssk);
251 	if (sdp_sk(sk)->state == TCPS_SYN_RECEIVED) {
252 		sdp_connected_handler(sk, NULL);
253 
254 		if (rcv_nxt(ssk))
255 			return 0;
256 	}
257 
258 	return -ECONNRESET;
259 }
260 
/*
 * Central RDMA CM event dispatcher for SDP sockets.  Drives the
 * active-side connection state machine (address/route resolution,
 * Hello/HelloAck exchange), accepts passive-side connect requests, and
 * handles disconnect/teardown events.  On any non-zero return the cm_id
 * is detached from its socket and the socket is notified of the error.
 */
int
sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rdma_conn_param conn_param;
	struct socket *sk;
	struct sdp_sock *ssk;
	struct sdp_hah hah;
	struct sdp_hh hh;

	int rc = 0;

	ssk = id->context;
	sk = NULL;
	if (ssk)
		sk = ssk->socket;
	/*
	 * The socket side has already abandoned this id.  Only a
	 * CONNECT_REQUEST needs an error return (so the request is
	 * rejected); everything else is ignored.
	 */
	if (!ssk || !sk || !ssk->id) {
		sdp_dbg(sk,
		    "cm_id is being torn down, event %d, ssk %p, sk %p, id %p\n",
		       	event->event, ssk, sk, id);
		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ?
			-EINVAL : 0;
	}

	sdp_dbg(sk, "%s event %d id %p\n", __func__, event->event, id);
	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_RESOLVED\n");

		/*
		 * Optionally refuse IB-transport devices whose port runs
		 * a non-InfiniBand link layer (e.g. RoCE) when the
		 * sdp_link_layer_ib_only tunable is set.
		 */
		if (sdp_link_layer_ib_only &&
			rdma_node_get_transport(id->device->node_type) ==
				RDMA_TRANSPORT_IB &&
			rdma_port_get_link_layer(id->device, id->port_num) !=
				IB_LINK_LAYER_INFINIBAND) {
			sdp_dbg(sk, "Link layer is: %d. Only IB link layer "
				"is allowed\n",
				rdma_port_get_link_layer(id->device, id->port_num));
			rc = -ENETUNREACH;
			break;
		}

		rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_ERROR\n");
		rc = -ENETUNREACH;
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		/*
		 * Active side: create the QP and initiate the connection,
		 * sending our SDP Hello in the CM private data.
		 */
		sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_RESOLVED : %p\n", id);
		rc = sdp_init_qp(sk, id);
		if (rc)
			break;
		atomic_set(&sdp_sk(sk)->remote_credits,
				rx_ring_posted(sdp_sk(sk)));
		memset(&hh, 0, sizeof hh);
		hh.bsdh.mid = SDP_MID_HELLO;
		hh.bsdh.len = htonl(sizeof(struct sdp_hh));
		hh.max_adverts = 1;
		/* 0x40: IPv4 addressing — presumably per the SDP spec's
		 * ipv_cap encoding; verify against the annex. */
		hh.ipv_cap = 0x40;
		hh.majv_minv = SDP_MAJV_MINV;
		sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size);
		/* Advertise our posted receive buffers as credits. */
		hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk)));
		hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_bytes);
		/* NOTE(review): redundant — max_adverts was set above. */
		hh.max_adverts = 0x1;
		sdp_sk(sk)->laddr =
			((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
		memset(&conn_param, 0, sizeof conn_param);
		conn_param.private_data_len = sizeof hh;
		conn_param.private_data = &hh;
		conn_param.responder_resources = 4 /* TODO */;
		conn_param.initiator_depth = 4 /* TODO */;
		conn_param.retry_count = SDP_RETRY_COUNT;
		SDP_DUMP_PACKET(NULL, "TX", NULL, &hh.bsdh);
		rc = rdma_connect(id, &conn_param);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_ERROR : %p\n", id);
		rc = -ETIMEDOUT;
		break;
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		/*
		 * Passive side: build a child socket for the request and
		 * accept with our HelloAck in the CM private data.
		 */
		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_REQUEST\n");
		rc = sdp_connect_handler(sk, id, event);
		if (rc) {
			sdp_dbg(sk, "Destroying qp\n");
			rdma_reject(id, NULL, 0);
			break;
		}
		/*
		 * sdp_connect_handler() attached the new child's sdp_sock
		 * to id->context; reload ssk so the accept below and the
		 * error path at the end operate on the child.
		 */
		ssk = id->context;
		atomic_set(&ssk->remote_credits, rx_ring_posted(ssk));
		memset(&hah, 0, sizeof hah);
		hah.bsdh.mid = SDP_MID_HELLO_ACK;
		hah.bsdh.bufs = htons(rx_ring_posted(ssk));
		hah.bsdh.len = htonl(sizeof(struct sdp_hah));
		hah.majv_minv = SDP_MAJV_MINV;
		hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec,
					    but just in case */
		hah.actrcvsz = htonl(ssk->recv_bytes);
		memset(&conn_param, 0, sizeof conn_param);
		conn_param.private_data_len = sizeof hah;
		conn_param.private_data = &hah;
		conn_param.responder_resources = 4 /* TODO */;
		conn_param.initiator_depth = 4 /* TODO */;
		conn_param.retry_count = SDP_RETRY_COUNT;
		SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh);
		rc = rdma_accept(id, &conn_param);
		if (rc) {
			/* Accept failed: unhook the child from the id. */
			ssk->id = NULL;
			id->qp = NULL;
			id->context = NULL;
		}
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_RESPONSE\n");
		rc = sdp_response_handler(sk, id, event);
		if (rc) {
			sdp_dbg(sk, "Destroying qp\n");
			rdma_reject(id, NULL, 0);
		} else
			rc = rdma_accept(id, NULL);
		break;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_ERROR\n");
		rc = -ETIMEDOUT;
		break;
	case RDMA_CM_EVENT_UNREACHABLE:
		sdp_dbg(sk, "RDMA_CM_EVENT_UNREACHABLE\n");
		rc = -ENETUNREACH;
		break;
	case RDMA_CM_EVENT_REJECTED:
		sdp_dbg(sk, "RDMA_CM_EVENT_REJECTED\n");
		rc = -ECONNREFUSED;
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		sdp_dbg(sk, "RDMA_CM_EVENT_ESTABLISHED\n");
		sdp_sk(sk)->laddr =
			((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr;
		rc = sdp_connected_handler(sk, event);
		break;
	case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */
		sdp_dbg(sk, "RDMA_CM_EVENT_DISCONNECTED\n");

		SDP_WLOCK(ssk);
		if (ssk->state == TCPS_LAST_ACK) {
			sdp_cancel_dreq_wait_timeout(ssk);

			sdp_dbg(sk, "%s: waiting for Infiniband tear down\n",
				__func__);
		}
		ssk->qp_active = 0;
		/*
		 * Drop the lock around rdma_disconnect(): it may sleep or
		 * call back into the CM, so it cannot be issued with the
		 * socket write lock held.
		 */
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
		/* Re-check state: it may have changed while unlocked. */
		if (ssk->state != TCPS_TIME_WAIT) {
			if (ssk->state == TCPS_CLOSE_WAIT) {
				sdp_dbg(sk, "IB teardown while in "
					"TCPS_CLOSE_WAIT taking reference to "
					"let close() finish the work\n");
			}
			rc = sdp_disconnected_handler(sk);
			if (rc)
				rc = -EPIPE;
		}
		SDP_WUNLOCK(ssk);
		break;
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		sdp_dbg(sk, "RDMA_CM_EVENT_TIMEWAIT_EXIT\n");
		SDP_WLOCK(ssk);
		rc = sdp_disconnected_handler(sk);
		SDP_WUNLOCK(ssk);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		sdp_dbg(sk, "RDMA_CM_EVENT_DEVICE_REMOVAL\n");
		rc = -ENETRESET;
		break;
	default:
		printk(KERN_ERR "SDP: Unexpected CMA event: %d\n",
		       event->event);
		rc = -ECONNABORTED;
		break;
	}

	sdp_dbg(sk, "event %d done. status %d\n", event->event, rc);

	/*
	 * On error, detach the id from the socket (if still attached)
	 * and notify the socket of the failure.  NOTE(review): the
	 * conditional unlock suggests sdp_notify() may itself release
	 * the lock on some paths — confirm its locking contract.
	 */
	if (rc) {
		SDP_WLOCK(ssk);
		if (ssk->id == id) {
			ssk->id = NULL;
			id->qp = NULL;
			id->context = NULL;
			if (sdp_notify(ssk, -rc))
				SDP_WUNLOCK(ssk);
		} else
			SDP_WUNLOCK(ssk);
	}

	return rc;
}
457