1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/rds.h>
58 
59 #include <sys/ib/clients/of/ofed_kernel.h>
60 #include <sys/ib/clients/of/rdma/ib_addr.h>
61 #include <sys/ib/clients/of/rdma/rdma_cm.h>
62 
63 #include <sys/ib/clients/rdsv3/rdsv3.h>
64 #include <sys/ib/clients/rdsv3/ib.h>
65 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
66 
67 extern int rdsv3_enable_snd_cq;
68 
69 /*
70  * Set the selected protocol version
71  */
72 static void
73 rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
74 {
75 	RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
76 	    conn, version);
77 	conn->c_version = version;
78 }
79 
80 /*
81  * Set up flow control
82  */
83 static void
84 rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
85 {
86 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
87 
88 	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
89 	    "Enter: conn: %p credits: %d", conn, credits);
90 
91 	if (rdsv3_ib_sysctl_flow_control && credits != 0) {
92 		/* We're doing flow control */
93 		ic->i_flowctl = 1;
94 		rdsv3_ib_send_add_credits(conn, credits);
95 	} else {
96 		ic->i_flowctl = 0;
97 	}
98 
99 	RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
100 	    "Return: conn: %p credits: %d",
101 	    conn, credits);
102 }
103 
104 /*
105  * Tune RNR behavior. Without flow control, we use a rather
106  * low timeout, but not the absolute minimum - this should
107  * be tunable.
108  *
109  * We already set the RNR retry count to 7 (which is the
110  * smallest infinite number :-) above.
111  * If flow control is off, we want to change this back to 0
112  * so that we learn quickly when our credit accounting is
113  * buggy.
114  *
 * Caller passes in a qp_attr pointer - don't waste stack space
 * by allocating this twice.
117  */
118 static void
119 rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
120 {
121 	int ret;
122 
123 	RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
124 	    ic, attr);
125 
126 	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
127 	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
128 	if (ret)
129 		RDSV3_DPRINTF2("rdsv3_ib_tune_rnr",
130 		    "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
131 }
132 
133 /*
134  * Connection established.
135  * We get here for both outgoing and incoming connection.
136  */
void
rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
    struct rdma_cm_event *event)
{
	const struct rdsv3_ib_connect_private *dp = NULL;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_device *rds_ibdev =
	    ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
	struct ib_qp_attr qp_attr;
	int err;

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Enter conn: %p event: %p", conn, event);

	/*
	 * If the peer sent a full private-data blob, pick up the
	 * negotiated protocol version and its advertised credits.
	 */
	if (event->param.conn.private_data_len >= sizeof (*dp)) {
		dp = event->param.conn.private_data;

		/* make sure it isn't empty data */
		if (dp->dp_protocol_major) {
			rdsv3_ib_set_protocol(conn,
			    RDS_PROTOCOL(dp->dp_protocol_major,
			    dp->dp_protocol_minor));
			rdsv3_ib_set_flow_control(conn,
			    ntohl(dp->dp_credit));
		}
	}

	/* Pre-3.1 peers are not supported; tear the connection down. */
	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "RDS/IB: Connection to %u.%u.%u.%u version %u.%u failed",
		    NIPQUAD(conn->c_faddr),
		    RDS_PROTOCOL_MAJOR(conn->c_version),
		    RDS_PROTOCOL_MINOR(conn->c_version));
		rdsv3_conn_destroy(conn);
		return;
	} else {
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
		    NIPQUAD(conn->c_faddr),
		    RDS_PROTOCOL_MAJOR(conn->c_version),
		    RDS_PROTOCOL_MINOR(conn->c_version),
		    ic->i_flowctl ? ", flow control" : "");
	}

	/*
	 * Create the soft-interrupt/worker threads that will drain the
	 * CQs and refill the receive ring; the send CQ gets its own
	 * thread only when rdsv3_enable_snd_cq is set.
	 */
	ASSERT(ic->i_soft_cq == NULL);
	ic->i_soft_cq = rdsv3_af_intr_thr_create(rdsv3_ib_tasklet_fn,
	    (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
	    ic->i_cq->ibt_cq);
	if (rdsv3_enable_snd_cq) {
		ic->i_snd_soft_cq = rdsv3_af_intr_thr_create(
		    rdsv3_ib_snd_tasklet_fn,
		    (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
		    ic->i_snd_cq->ibt_cq);
	}
	ic->i_refill_rq = rdsv3_af_thr_create(rdsv3_ib_refill_fn, (void *)conn,
	    SCQ_WRK_BIND_CPU, rds_ibdev->aft_hcagp);
	rdsv3_af_grp_draw(rds_ibdev->aft_hcagp);

	/* Arm the CQs so completions start generating events. */
	(void) ib_req_notify_cq(ic->i_cq, IB_CQ_SOLICITED);
	if (rdsv3_enable_snd_cq) {
		(void) ib_req_notify_cq(ic->i_snd_cq, IB_CQ_NEXT_COMP);
	}

	/*
	 * Init rings and fill recv.  This needs to wait until protocol
	 * negotiation is complete, since the ring layout differs
	 * between 3.0 and 3.1.
	 */
	rdsv3_ib_send_init_ring(ic);
	rdsv3_ib_recv_init_ring(ic);
	/*
	 * Post receive buffers - as a side effect, this will update
	 * the posted credit count.
	 */
	(void) rdsv3_ib_recv_refill(conn, 1);

	/* Tune RNR behavior */
	rdsv3_ib_tune_rnr(ic, &qp_attr);

	/* Transition the QP to ready-to-send. */
	qp_attr.qp_state = IB_QPS_RTS;
	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);

	/* update ib_device with this local ipaddr & conn */
	err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
	if (err)
		RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
		    "rdsv3_ib_update_ipaddr failed (%d)", err);
	rdsv3_ib_add_conn(rds_ibdev, conn);

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp && dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	rdsv3_connect_complete(conn);

	RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
	    "Return conn: %p event: %p",
	    conn, event);
}
242 
/*
 * Build the rdma_conn_param used for rdma_connect()/rdma_accept(),
 * optionally filling in the RDS private-data blob (addresses, protocol
 * version, piggybacked ACK and advertised flow-control credits).
 */
static void
rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
    struct rdma_conn_param *conn_param,
    struct rdsv3_ib_connect_private *dp,
    uint32_t protocol_version,
    uint32_t max_responder_resources,
    uint32_t max_initiator_depth)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_device *rds_ibdev;

	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Enter conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);

	(void) memset(conn_param, 0, sizeof (struct rdma_conn_param));

	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);

	/* Clamp the caller-supplied limits to what the HCA supports. */
	conn_param->responder_resources =
	    MIN(rds_ibdev->max_responder_resources, max_responder_resources);
	conn_param->initiator_depth =
	    MIN(rds_ibdev->max_initiator_depth, max_initiator_depth);
	conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
	/* 7 == "infinite" RNR retries (see comment above rdsv3_ib_tune_rnr) */
	conn_param->rnr_retry_count = 7;

	if (dp) {
		(void) memset(dp, 0, sizeof (*dp));
		dp->dp_saddr = conn->c_laddr;
		dp->dp_daddr = conn->c_faddr;
		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
		dp->dp_protocol_minor_mask =
		    htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
		/* Piggyback our latest ACK so the peer can drop acked msgs */
		dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);

		/* Advertise flow control */
		if (ic->i_flowctl) {
			unsigned int credits;

			/*
			 * Transfer the currently posted credits out of our
			 * local counter and into the advertisement; once
			 * sent, the peer owns them.
			 */
			credits = IB_GET_POST_CREDITS(
			    atomic_get(&ic->i_credits));
			dp->dp_credit = htonl(credits);
			atomic_add_32(&ic->i_credits,
			    -IB_SET_POST_CREDITS(credits));
		}

		conn_param->private_data = dp;
		conn_param->private_data_len = sizeof (*dp);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
	    "Return conn: %p conn_param: %p private: %p version: %d",
	    conn, conn_param, dp, protocol_version);
}
298 
/*
 * CQ asynchronous event handler.  No recovery action is taken for CQ
 * events; they are only logged.
 */
static void
rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
{
	RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
	    event->event, data);
}
305 
306 static void
307 rdsv3_ib_snd_cq_comp_handler(struct ib_cq *cq, void *context)
308 {
309 	struct rdsv3_connection *conn = context;
310 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
311 
312 	RDSV3_DPRINTF4("rdsv3_ib_snd_cq_comp_handler",
313 	    "Enter(conn: %p ic: %p cq: %p)", conn, ic, cq);
314 
315 	rdsv3_af_thr_fire(ic->i_snd_soft_cq);
316 }
317 
318 void
319 rdsv3_ib_snd_tasklet_fn(void *data)
320 {
321 	struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
322 	struct rdsv3_connection *conn = ic->conn;
323 	struct rdsv3_ib_ack_state ack_state = { 0, };
324 	ibt_wc_t wc;
325 	uint_t polled;
326 
327 	RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
328 	    "Enter(conn: %p ic: %p)", conn, ic);
329 
330 	/*
331 	 * Poll in a loop before and after enabling the next event
332 	 */
333 	while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
334 	    IBT_SUCCESS) {
335 		RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
336 		    "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
337 		    (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
338 		    wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
339 
340 		ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
341 		rdsv3_ib_send_cqe_handler(ic, &wc);
342 	}
343 	(void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_snd_cq),
344 	    IBT_NEXT_COMPLETION);
345 	if (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
346 	    IBT_SUCCESS) {
347 		ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
348 		rdsv3_ib_send_cqe_handler(ic, &wc);
349 	}
350 }
351 
352 static void
353 rdsv3_ib_cq_comp_handler(struct ib_cq *cq, void *context)
354 {
355 	struct rdsv3_connection *conn = context;
356 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
357 
358 	RDSV3_DPRINTF4("rdsv3_ib_cq_comp_handler",
359 	    "Enter(conn: %p cq: %p)", conn, cq);
360 
361 	rdsv3_ib_stats_inc(s_ib_evt_handler_call);
362 
363 	rdsv3_af_thr_fire(ic->i_soft_cq);
364 }
365 
/*
 * Worker-thread entry point: replenish the receive ring for the given
 * connection (non-prefill mode).
 */
void
rdsv3_ib_refill_fn(void *data)
{
	(void) rdsv3_ib_recv_refill((struct rdsv3_connection *)data, 0);
}
373 
/*
 * Soft-interrupt handler for the main CQ: drain completions (send and
 * receive), re-arm the CQ, then apply the accumulated ACK state and
 * kick the send path if the connection is up.
 */
void
rdsv3_ib_tasklet_fn(void *data)
{
	struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_ack_state ack_state = { 0, };
	ibt_wc_t wc;
	uint_t polled;

	RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
	    "Enter(conn: %p ic: %p)", conn, ic);

	rdsv3_ib_stats_inc(s_ib_tasklet_call);

	/*
	 * Poll in a loop before and after enabling the next event
	 */
	while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc, 1, &polled) ==
	    IBT_SUCCESS) {
		RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
		    "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
		    (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
		    wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));

		/* dispatch on the RDSV3_IB_SEND_OP bit in the work-req id */
		if (wc.wc_id & RDSV3_IB_SEND_OP) {
			rdsv3_ib_send_cqe_handler(ic, &wc);
		} else {
			/* recv handler accumulates ACK state for below */
			rdsv3_ib_recv_cqe_handler(ic, &wc, &ack_state);
		}
	}
	/*
	 * NOTE(review): unlike rdsv3_ib_snd_tasklet_fn there is no
	 * second poll after re-arming here - presumably the solicited
	 * notification semantics make that safe; confirm.
	 */
	(void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_cq),
	    IBT_NEXT_SOLICITED);

	/* Apply the ACK state gathered while draining the CQ. */
	if (ack_state.ack_next_valid) {
		rdsv3_ib_set_ack(ic, ack_state.ack_next,
		    ack_state.ack_required);
	}
	if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) {
		rdsv3_send_drop_acked(conn, ack_state.ack_recv, NULL);
		ic->i_ack_recv = ack_state.ack_recv;
	}
	/* If still connected, push pending sends and send any owed ACK. */
	if (rdsv3_conn_up(conn)) {
		if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
			(void) rdsv3_send_xmit(ic->conn);
		rdsv3_ib_attempt_ack(ic);
	}
}
421 
422 static void
423 rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
424 {
425 	struct rdsv3_connection *conn = data;
426 	struct rdsv3_ib_connection *ic = conn->c_transport_data;
427 
428 	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
429 	    conn, ic, event->event);
430 
431 	switch (event->event) {
432 	case IB_EVENT_COMM_EST:
433 		(void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
434 		break;
435 	default:
436 		if (conn) {
437 			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
438 			    "RDS/IB: Fatal QP Event %u - "
439 			    "connection %u.%u.%u.%u ->%u.%u.%u.%u "
440 			    "...reconnecting",
441 			    event->event, NIPQUAD(conn->c_laddr),
442 			    NIPQUAD(conn->c_faddr));
443 			rdsv3_conn_drop(conn);
444 		} else {
445 			RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
446 			    "RDS/IB: Fatal QP Event %u - connection"
447 			    "...reconnecting", event->event);
448 		}
449 		break;
450 	}
451 
452 	RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
453 	    conn, event);
454 }
455 
456 extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
457     struct rdsv3_ib_connection *ic);
458 extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
459     struct rdsv3_ib_connection *ic);
460 
461 /*
462  * This needs to be very careful to not leave IS_ERR pointers around for
463  * cleanup to trip over.
464  */
static int
rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct ib_qp_init_attr attr;
	struct rdsv3_ib_device *rds_ibdev;
	ibt_send_wr_t *wrp;
	ibt_wr_ds_t *sgl;
	int ret, i;

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);

	/*
	 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
	 * and allocates a protection domain, memory range and FMR pool
	 * for each.  If that fails for any reason, it will not register
	 * the rds_ibdev at all.
	 */
	rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
	if (!rds_ibdev) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "RDS/IB: No client_data for device %s", dev->name);
		return (-EOPNOTSUPP);
	}
	ic->rds_ibdev = rds_ibdev;

	/* Shrink the rings if they exceed the HCA's work-request limit. */
	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
	if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
		rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);

	/* Protection domain and memory range */
	ic->i_pd = rds_ibdev->pd;

	/*
	 * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
	 * not implemented in Hermon yet, but we can pass it to ib_create_cq()
	 * anyway.
	 */
	ic->i_cq = ib_create_cq(dev, rdsv3_ib_cq_comp_handler,
	    rdsv3_ib_cq_event_handler, conn,
	    ic->i_recv_ring.w_nr + ic->i_send_ring.w_nr + 1,
	    (intptr_t)rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
	if (IS_ERR(ic->i_cq)) {
		ret = PTR_ERR(ic->i_cq);
		ic->i_cq = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "ib_create_cq failed: %d", ret);
		goto out;
	}
	/* Optional dedicated send CQ (sized for the send ring only). */
	if (rdsv3_enable_snd_cq) {
		ic->i_snd_cq = ib_create_cq(dev, rdsv3_ib_snd_cq_comp_handler,
		    rdsv3_ib_cq_event_handler, conn, ic->i_send_ring.w_nr + 1,
		    (intptr_t)rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
		if (IS_ERR(ic->i_snd_cq)) {
			ret = PTR_ERR(ic->i_snd_cq);
			(void) ib_destroy_cq(ic->i_cq);
			ic->i_cq = NULL;
			ic->i_snd_cq = NULL;
			RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
			    "ib_create_cq send cq failed: %d", ret);
			goto out;
		}
	}

	/* XXX negotiate max send/recv with remote? */
	(void) memset(&attr, 0, sizeof (attr));
	attr.event_handler = rdsv3_ib_qp_event_handler;
	attr.qp_context = conn;
	/* + 1 to allow for the single ack message */
	attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
	attr.cap.max_send_sge = rds_ibdev->max_sge;
	attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	attr.qp_type = IB_QPT_RC;
	if (rdsv3_enable_snd_cq) {
		attr.send_cq = ic->i_snd_cq;
	} else {
		attr.send_cq = ic->i_cq;
	}
	attr.recv_cq = ic->i_cq;

	/*
	 * XXX this can fail if max_*_wr is too large?  Are we supposed
	 * to back off until we get a value that the hardware can support?
	 */
	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdma_create_qp failed: %d", ret);
		goto out;
	}

	/* Header buffers for send/recv/ack messages. */
	ret = rdsv3_ib_alloc_hdrs(dev, ic);
	if (ret != 0) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "rdsv3_ib_alloc_hdrs failed: %d", ret);
		goto out;
	}

	/*
	 * NOTE(review): the failure paths below return without freeing
	 * what was already set up - presumably rdsv3_ib_conn_shutdown()
	 * reclaims the partially built state (it checks each pointer
	 * individually); verify against the callers.
	 */
	ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
	if (ic->i_sends == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "send allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
	    sizeof (struct rdsv3_ib_send_work));

	/*
	 * One contiguous allocation holds the send work requests followed
	 * by their scatter/gather lists (RDSV3_IB_MAX_SGE entries each).
	 */
	ic->i_send_wrs =
	    kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) +
	    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
	if (ic->i_send_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "Send WR allocation failed: %d", ret);
		goto out;
	}
	sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
	    (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t)));
	/* point each WR at its slice of the SGL area */
	for (i = 0; i < RDSV3_IB_SEND_WRS; i++) {
		wrp = &ic->i_send_wrs[i];
		wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
	}

	ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
	if (ic->i_recvs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "recv allocation failed: %d", ret);
		goto out;
	}
	(void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
	    sizeof (struct rdsv3_ib_recv_work));

	ic->i_recv_wrs =
	    kmem_alloc(ic->i_recv_ring.w_nr * sizeof (ibt_recv_wr_t),
	    KM_NOSLEEP);
	if (ic->i_recv_wrs == NULL) {
		ret = -ENOMEM;
		RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
		    "Recv WR allocation failed: %d", ret);
		goto out;
	}

	rdsv3_ib_recv_init_ack(ic);

	RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p",
	    conn, ic->i_pd, ic->i_mr, ic->i_cq);

out:
	return (ret);
}
624 
625 static uint32_t
626 rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
627 {
628 	const struct rdsv3_ib_connect_private *dp =
629 	    event->param.conn.private_data;
630 	uint16_t common;
631 	uint32_t version = 0;
632 
633 	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
634 	    event);
635 
636 	/*
637 	 * rdma_cm private data is odd - when there is any private data in the
638 	 * request, we will be given a pretty large buffer without telling us
639 	 * the
640 	 * original size. The only way to tell the difference is by looking at
641 	 * the contents, which are initialized to zero.
642 	 * If the protocol version fields aren't set,
643 	 * this is a connection attempt
644 	 * from an older version. This could could be 3.0 or 2.0 -
645 	 * we can't tell.
646 	 * We really should have changed this for OFED 1.3 :-(
647 	 */
648 
649 	/* Be paranoid. RDS always has privdata */
650 	if (!event->param.conn.private_data_len) {
651 		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
652 		    "RDS incoming connection has no private data, rejecting");
653 		return (0);
654 	}
655 
656 	/* Even if len is crap *now* I still want to check it. -ASG */
657 	if (event->param.conn.private_data_len < sizeof (*dp) ||
658 	    dp->dp_protocol_major == 0)
659 		return (RDS_PROTOCOL_3_0);
660 
661 	common = ntohs(dp->dp_protocol_minor_mask) &
662 	    RDSV3_IB_SUPPORTED_PROTOCOLS;
663 	if (dp->dp_protocol_major == 3 && common) {
664 		version = RDS_PROTOCOL_3_0;
665 		while ((common >>= 1) != 0)
666 			version++;
667 	} else {
668 		RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
669 		    "RDS: Connection from %u.%u.%u.%u using "
670 		    "incompatible protocol version %u.%u\n",
671 		    NIPQUAD(dp->dp_saddr),
672 		    dp->dp_protocol_major,
673 		    dp->dp_protocol_minor);
674 	}
675 
676 	RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
677 	    event);
678 
679 	return (version);
680 }
681 
/*
 * Handle an incoming (passive-side) connection request.
 *
 * Returns 0 when the request was accepted or absorbed; non-zero tells
 * the rdma_cm to destroy the cm_id (only while we do not yet own it).
 */
int
rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
    struct rdma_cm_event *event)
{
	uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
	uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
	const struct rdsv3_ib_connect_private *dp =
	    event->param.conn.private_data;
	struct rdsv3_ib_connect_private dp_rep;
	struct rdsv3_connection *conn = NULL;
	struct rdsv3_ib_connection *ic = NULL;
	struct rdma_conn_param conn_param;
	uint32_t version;
	int err, destroy = 1;
	boolean_t conn_created = B_FALSE;

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Enter cm_id: %p event: %p", cm_id, event);

	/* Check whether the remote protocol version matches ours. */
	version = rdsv3_ib_protocol_compatible(event);
	if (!version) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "version mismatch");
		goto out;
	}

	/*
	 * NOTE(review): for a 3.0 peer the private data may be shorter
	 * than sizeof (*dp); the dp-> reads below rely on rdma_cm
	 * handing us a large zero-filled buffer (see the comment in
	 * rdsv3_ib_protocol_compatible) - confirm.
	 */
	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
	    "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
	    RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
	    (unsigned long long)ntohll(lguid),
	    (unsigned long long)ntohll(fguid));

	/* Look up or create the connection for this address pair. */
	conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
	    &rdsv3_ib_transport, KM_NOSLEEP);
	if (IS_ERR(conn)) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
		conn = NULL;
		goto out;
	}

	/*
	 * The connection request may occur while the
	 * previous connection exist, e.g. in case of failover.
	 * But as connections may be initiated simultaneously
	 * by both hosts, we have a random backoff mechanism -
	 * see the comment above rdsv3_queue_reconnect()
	 */
	mutex_enter(&conn->c_cm_lock);
	if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
	    RDSV3_CONN_CONNECTING)) {
		if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
			/* Stale incoming connect; drop and reconnect. */
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "incoming connect when connected: %p",
			    conn);
			rdsv3_conn_drop(conn);
			rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
			mutex_exit(&conn->c_cm_lock);
			goto out;
		} else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
			/* Wait and see - our connect may still be succeeding */
			RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
			    "peer-to-peer connection request: %p, "
			    "lguid: 0x%llx fguid: 0x%llx",
			    conn, lguid, fguid);
			rdsv3_ib_stats_inc(s_ib_connect_raced);
		}
		mutex_exit(&conn->c_cm_lock);
		goto out;
	}

	ic = conn->c_transport_data;

	rdsv3_ib_set_protocol(conn, version);
	rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));

	/*
	 * If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK.
	 */
	if (dp->dp_ack_seq)
		rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);

	ASSERT(!cm_id->context);
	ASSERT(!ic->i_cm_id);

	if (ic->i_cm_id != NULL)
		RDSV3_PANIC();

	/* Take ownership of the cm_id and link it to the connection. */
	ic->i_cm_id = cm_id;
	cm_id->context = conn;

	/*
	 * We got halfway through setting up the ib_connection, if we
	 * fail now, we have to take the long route out of this mess.
	 */
	destroy = 0;

	err = rdsv3_ib_setup_qp(conn);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdsv3_ib_setup_qp failed (%d)", err);
		mutex_exit(&conn->c_cm_lock);
		rdsv3_conn_drop(conn);
		goto out;
	}

	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
	    event->param.conn.responder_resources,
	    event->param.conn.initiator_depth);

	/* rdma_accept() calls rdma_reject() internally if it fails */
	err = rdma_accept(cm_id, &conn_param);
	mutex_exit(&conn->c_cm_lock);
	if (err) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
		    "rdma_accept failed (%d)", err);
		rdsv3_conn_drop(conn);
		goto out;
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
	    "Return cm_id: %p event: %p", cm_id, event);

	return (0);

out:
	/* Reject the request; destroy is 1 only if we never owned cm_id. */
	(void) rdma_reject(cm_id, NULL, 0);
	return (destroy);
}
814 
815 
/*
 * Active-side connect: set up the QP and send the connection request.
 *
 * Returns non-zero only when the rdma_cm may destroy the cm_id; as
 * long as we still own the cm_id the error is absorbed (see below).
 */
int
rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
	struct rdsv3_connection *conn = cm_id->context;
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdma_conn_param conn_param;
	struct rdsv3_ib_connect_private dp;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
	    cm_id);

	/*
	 * If the peer doesn't do protocol negotiation, we must
	 * default to RDSv3.0
	 */
	rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
	ic->i_flowctl =
	    rdsv3_ib_sysctl_flow_control;	/* advertise flow control */

	ret = rdsv3_ib_setup_qp(conn);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdsv3_ib_setup_qp failed (%d)", ret);
		rdsv3_conn_drop(conn);
		goto out;
	}

	/* Offer our highest protocol version; the peer picks the minor. */
	rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
	    RDS_PROTOCOL_VERSION, UINT_MAX, UINT_MAX);

	ret = rdma_connect(cm_id, &conn_param);
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
		    "rdma_connect failed (%d)", ret);
		rdsv3_conn_drop(conn);
	}

	RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
	    "Return: cm_id: %p", cm_id);

out:
	/*
	 * Beware - returning non-zero tells the rdma_cm to destroy
	 * the cm_id. We should certainly not do it as long as we still
	 * "own" the cm_id.
	 */
	if (ret) {
		if (ic->i_cm_id == cm_id)
			ret = 0;
	}
	return (ret);
}
869 
/*
 * Kick off an outgoing connection: create the cm_id and start
 * asynchronous address resolution; the rest of connection setup is
 * driven by CM events (rdsv3_rdma_cm_event_handler).
 *
 * Returns 0 on success or a negative error.
 */
int
rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct sockaddr_in src, dest;
	ipaddr_t	laddr, faddr;
	int ret;

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);

	/*
	 * XXX I wonder what affect the port space has
	 */
	/* delegate cm event handler to rdma_transport */
	ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
	    RDMA_PS_TCP);
	if (IS_ERR(ic->i_cm_id)) {
		ret = PTR_ERR(ic->i_cm_id);
		ic->i_cm_id = NULL;
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "rdma_create_id() failed: %d", ret);
		goto out;
	}

	RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
	    "created cm id %p for conn %p", ic->i_cm_id, conn);

	/* The ipaddr should be in the network order */
	laddr = conn->c_laddr;
	faddr = conn->c_faddr;
	/* path lookup may rewrite laddr/faddr; failure is non-fatal */
	ret = rdsv3_sc_path_lookup(&laddr, &faddr);
	if (ret == 0) {
		RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
		    ntohl(laddr), ntohl(faddr));
	}

	/* any local port; peer must listen on RDSV3_PORT */
	src.sin_family = AF_INET;
	src.sin_addr.s_addr = (uint32_t)laddr;
	src.sin_port = (uint16_t)htons(0);

	dest.sin_family = AF_INET;
	dest.sin_addr.s_addr = (uint32_t)faddr;
	dest.sin_port = (uint16_t)htons(RDSV3_PORT);

	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
	    (struct sockaddr *)&dest,
	    RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
	if (ret) {
		/* undo cm_id creation; caller sees the error */
		RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
		    "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
		rdma_destroy_id(ic->i_cm_id);
		ic->i_cm_id = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);

out:
	return (ret);
}
929 
930 /*
931  * This is so careful about only cleaning up resources that were built up
932  * so that it can be called at any point during startup.  In fact it
933  * can be called multiple times for a given connection.
934  */
void
rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int err = 0;

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
	    "cm %p pd %p cq %p qp %p", ic->i_cm_id,
	    ic->i_pd, ic->i_cq, ic->i_cm_id ? ic->i_cm_id->qp : NULL);

	/*
	 * All the IB resources below hang off the CM id; if it was never
	 * created, there is nothing on the IB side to tear down.
	 */
	if (ic->i_cm_id) {
		struct ib_device *dev = ic->i_cm_id->device;

		RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
		    "disconnecting cm %p", ic->i_cm_id);

		/* Notify the peer first; local teardown follows below. */
		err = rdma_disconnect(ic->i_cm_id);
		if (err) {
			/*
			 * Actually this may happen quite frequently, when
			 * an outgoing connect raced with an incoming connect.
			 */
			RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
			    "failed to disconnect, cm: %p err %d",
			    ic->i_cm_id, err);
		}

		if (ic->i_cm_id->qp) {
			/*
			 * Flush the channel so outstanding work requests
			 * complete before the QP is destroyed.
			 */
			(void) ibt_flush_qp(
			    ib_get_ibt_channel_hdl(ic->i_cm_id));
			/*
			 * Don't wait for the send ring to be empty -- there
			 * may be completed non-signaled entries sitting on
			 * there. We unmap these below.
			 */
			rdsv3_wait_event(&ic->i_recv_ring.w_empty_wait,
			    rdsv3_ib_ring_empty(&ic->i_recv_ring));
			/*
			 * Note that Linux original code calls
			 * rdma_destroy_qp() after rdsv3_ib_recv_clear_ring(ic).
			 */
			rdma_destroy_qp(ic->i_cm_id);
		}

		/* Tear down the soft-CQ/refill worker threads and the CQs. */
		if (rdsv3_enable_snd_cq) {
			if (ic->i_snd_soft_cq) {
				rdsv3_af_thr_destroy(ic->i_snd_soft_cq);
				ic->i_snd_soft_cq = NULL;
			}
			if (ic->i_snd_cq)
				(void) ib_destroy_cq(ic->i_snd_cq);
		}
		if (ic->i_soft_cq) {
			rdsv3_af_thr_destroy(ic->i_soft_cq);
			ic->i_soft_cq = NULL;
		}
		if (ic->i_refill_rq) {
			rdsv3_af_thr_destroy(ic->i_refill_rq);
			ic->i_refill_rq = NULL;
		}
		if (ic->i_cq)
			(void) ib_destroy_cq(ic->i_cq);

		/* Release the header buffers, if they were set up (keyed off i_mr). */
		if (ic->i_mr)
			rdsv3_ib_free_hdrs(dev, ic);

		/* Drop whatever was still sitting on the work rings. */
		if (ic->i_sends)
			rdsv3_ib_send_clear_ring(ic);
		if (ic->i_recvs)
			rdsv3_ib_recv_clear_ring(ic);

		rdma_destroy_id(ic->i_cm_id);

		/*
		 * Move connection back to the nodev list.
		 */
		if (ic->i_on_dev_list)
			rdsv3_ib_remove_conn(ic->rds_ibdev, conn);

		/*
		 * Clear every cached handle so a repeated call (this
		 * function may run more than once per connection, see the
		 * block comment above) finds nothing left to tear down.
		 */
		ic->i_cm_id = NULL;
		ic->i_pd = NULL;
		ic->i_mr = NULL;
		ic->i_cq = NULL;
		ic->i_snd_cq = NULL;
		ic->i_send_hdrs = NULL;
		ic->i_recv_hdrs = NULL;
		ic->i_ack = NULL;
	}
	ASSERT(!ic->i_on_dev_list);

	/* Clear pending transmit */
	if (ic->i_rm) {
		rdsv3_message_put(ic->i_rm);
		ic->i_rm = NULL;
	}

	/* Clear the ACK state */
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	ic->i_ack_next = 0;
	ic->i_ack_recv = 0;

	/* Clear flow control state */
	ic->i_flowctl = 0;
	ic->i_credits = 0;

	/* Re-initialize the rings for a possible later reconnect. */
	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);

	/* Drop a partially reassembled incoming message, if any. */
	if (ic->i_ibinc) {
		rdsv3_inc_put(&ic->i_ibinc->ii_inc);
		ic->i_ibinc = NULL;
	}

	/*
	 * NOTE(review): w_nr was just re-set by rdsv3_ib_ring_init() above.
	 * If rdsv3_ib_sysctl_max_send_wr/_recv_wr can change at runtime,
	 * the sizes below may not match the sizes used at allocation time
	 * -- verify whether these tunables are dynamic.
	 */
	if (ic->i_sends) {
		kmem_free(ic->i_sends,
		    ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
		ic->i_sends = NULL;
	}
	if (ic->i_send_wrs) {
		kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS *
		    (sizeof (ibt_send_wr_t) +
		    RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
		ic->i_send_wrs = NULL;
	}
	if (ic->i_recvs) {
		kmem_free(ic->i_recvs,
		    ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
		ic->i_recvs = NULL;
	}
	if (ic->i_recv_wrs) {
		kmem_free(ic->i_recv_wrs, ic->i_recv_ring.w_nr *
		    (sizeof (ibt_recv_wr_t)));
		ic->i_recv_wrs = NULL;
	}

	RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
}
1071 
1072 /*
1073  * the connection can be allocated from either rdsv3_conn_create_outgoing()
1074  * or rdsv3_conn_create(), so ddi_taskq_create() can be called with the
1075  * same string. This can print the kstat warning on the console. To prevent
1076  * it, this counter value is used.
1077  * Note that requests from rdsv3_conn_create_outgoing() refers to the cached
1078  * value with the mutex lock before it allocates the connection, so that
1079  * the warning cannot be produced in the case. (only between
1080  * rdsv3_conn_create() and rdsv3_conn_create_outgoing().
1081  */
1082 static int conn_cnt;
1083 
1084 /* ARGSUSED */
1085 int
1086 rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
1087 {
1088 	struct rdsv3_ib_connection *ic;
1089 
1090 	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);
1091 
1092 	/* XXX too lazy? */
1093 	ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
1094 	if (!ic)
1095 		return (-ENOMEM);
1096 
1097 	list_link_init(&ic->ib_node);
1098 
1099 	mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
1100 	mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);
1101 
1102 	/*
1103 	 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
1104 	 * must be initialized before it can be called.
1105 	 */
1106 	rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
1107 	rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
1108 
1109 	ic->conn = conn;
1110 	conn->c_transport_data = ic;
1111 
1112 	mutex_enter(&ib_nodev_conns_lock);
1113 	list_insert_tail(&ib_nodev_conns, ic);
1114 	mutex_exit(&ib_nodev_conns_lock);
1115 
1116 	RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
1117 	    conn, conn->c_transport_data);
1118 	return (0);
1119 }
1120 
1121 /*
1122  * Free a connection. Connection must be shut down and not set for reconnect.
1123  */
1124 void
1125 rdsv3_ib_conn_free(void *arg)
1126 {
1127 	struct rdsv3_ib_connection *ic = arg;
1128 	kmutex_t	*lock_ptr;
1129 
1130 	RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);
1131 
1132 #ifndef __lock_lint
1133 	/*
1134 	 * Conn is either on a dev's list or on the nodev list.
1135 	 * A race with shutdown() or connect() would cause problems
1136 	 * (since rds_ibdev would change) but that should never happen.
1137 	 */
1138 	lock_ptr = ic->i_on_dev_list ?
1139 	    &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
1140 
1141 	mutex_enter(lock_ptr);
1142 	list_remove_node(&ic->ib_node);
1143 	mutex_exit(lock_ptr);
1144 #endif
1145 	kmem_free(ic, sizeof (*ic));
1146 }
1147 
1148 /*
1149  * An error occurred on the connection
1150  */
void
__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
{
	/* Funnel all IB-level failures into the common connection-drop path. */
	rdsv3_conn_drop(conn);
}
1156